Example #1
def run_model(model, dataset, run, runf, desc='valid'):
    BATCH_SIZE = 16
    rerank_run = {}
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()),
                               ncols=80,
                               desc=desc,
                               leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run,
                                               BATCH_SIZE):
            scores = model(records['query_tok'], records['query_mask'],
                           records['doc_tok'], records['doc_mask'])
            for qid, did, score in zip(records['query_id'], records['doc_id'],
                                       scores.detach().cpu().numpy()):
                rerank_run.setdefault(qid, {})[did] = score.item()
            pbar.update(len(records['query_id']))

    with open(runf, 'wt') as runfile:
        for qid in rerank_run:
            scores = list(
                sorted(rerank_run[qid].items(),
                       key=lambda x: (x[1], x[0]),
                       reverse=True))
            for i, (did, score) in enumerate(scores):
                runfile.write(f'{qid} 0 {did} {i+1} {score} run\n')
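
A minimal sketch of how this variant might be invoked; the ranker class, file names, and run contents below are illustrative assumptions, not part of the original code:

queries = {'101': 'what is information retrieval'}          # qid -> query text
docs = {'D1': 'information retrieval is ...', 'D2': 'unrelated text ...'}  # did -> doc text
dataset = (queries, docs)                                    # same (queries, docs) tuple shape as in Example #7
run = {'101': ['D1', 'D2']}                                  # qid -> candidate doc ids to rerank
model = MyRanker().cuda()                                    # hypothetical model exposing the expected forward()
run_model(model, dataset, run, 'rerank.run', desc='test')    # writes a TREC-format run file to rerank.run
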
Example #2
def run_model(model, dataset, run, runf, contentid2entity, embed):
    BATCH_SIZE = 16
    #BATCH_SIZE = 8
    rerank_run = {}
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()),
                               ncols=80,
                               desc='valid',
                               leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE,
                                               contentid2entity):
            # look up entity embeddings on CPU (ids shifted by +1), then move them to the GPU
            query_entity = embed(records['query_entity'].cpu() + 1).cuda()
            doc_entity = embed(records['doc_entity'].cpu() + 1).cuda()
            scores = model(records['query_tok'], records['query_mask'],
                           records['doc_tok'], records['doc_mask'],
                           query_entity, doc_entity)
            for qid, did, score in zip(records['query_id'], records['doc_id'],
                                       scores):
                rerank_run.setdefault(qid, {})[did] = score.item()
            pbar.update(len(records['query_id']))
    with open(runf, 'wt') as runfile:
        for qid in rerank_run:
            scores = list(
                sorted(rerank_run[qid].items(),
                       key=lambda x: (x[1], x[0]),
                       reverse=True))
            for i, (did, score) in enumerate(scores):
                runfile.write(f'{qid} 0 {did} {i+1} {score} run\n')
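
The `embed` argument is used only as a CPU-side lookup over entity ids shifted by one; a sketch of how such a table might be built from a pretrained entity-embedding matrix, with row 0 reserved for the shifted padding/unknown id (the construction itself is an assumption, not shown in this example):

import torch
import torch.nn as nn

# entity_vectors: (num_entities, dim) pretrained matrix, loaded elsewhere (assumption)
padded = torch.cat([torch.zeros(1, entity_vectors.size(1)), entity_vectors], dim=0)
embed = nn.Embedding.from_pretrained(padded, freeze=True)  # id -1 shifted by +1 maps to the zero row
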
Example #3
def run_model(model, dataset, run, runf, desc='valid'):
    BATCH_SIZE = 16
    rerank_run = {}
    model_name = type(model).__name__
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()),
                               ncols=80,
                               desc=desc,
                               leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run,
                                               BATCH_SIZE):
            if model_name.startswith("Duet"):
                scores, v_scores, c_scores = model(records['query_tok'],
                                                   records['query_mask'],
                                                   records['doc_tok'],
                                                   records['doc_mask'])
                #scores = v_scores + c_scores
            else:
                scores = model(records['query_tok'], records['query_mask'],
                               records['doc_tok'], records['doc_mask'])
            for qid, did, score in zip(records['query_id'], records['doc_id'],
                                       scores):
                rerank_run.setdefault(qid, {})[did] = score.item()
            pbar.update(len(records['query_id']))
    with open(runf, 'wt') as runfile:
        for qid in rerank_run:
            scores = list(
                sorted(rerank_run[qid].items(),
                       key=lambda x: (x[1], x[0]),
                       reverse=True))
            for i, (did, score) in enumerate(scores):
                runfile.write(f'{qid} 0 {did} {i+1} {score} run\n')
Example #4
def run_model(model, dataset, run, desc='valid'):
    rerank_run = defaultdict(dict)
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()), ncols=80, desc=desc, leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE):
            scores = model(records['query_tok'],
                           records['query_mask'],
                           records['doc_tok'],
                           records['doc_mask'])
            for qid, did, score in zip(records['query_id'], records['doc_id'], scores):
                rerank_run[qid][did] = score.item()
            pbar.update(len(records['query_id']))
    return rerank_run
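
Since this variant returns the reranked run instead of writing a file, a caller typically evaluates it directly; a sketch using pytrec_eval, assuming `qrels` is a standard {qid: {did: relevance}} dict (the variable names and chosen measures are illustrative):

import pytrec_eval

# qrels: qid -> did -> graded relevance (ints), loaded elsewhere (assumption)
rerank_run = run_model(model, dataset, run)
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
per_query = evaluator.evaluate(rerank_run)
mean_ndcg = sum(m['ndcg'] for m in per_query.values()) / len(per_query)
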
Example #5
def interpret_model(model, dataset, run, runf, outdir, layer_n, desc='valid'):
    target_qid = "303"
    target_did = ["FT944-128", "FT934-5418"]  ## true, false
    BATCH_SIZE = 1  # must be 1: the code below reads records[...][0] as a single (qid, did) pair
    cnt = 0
    runfile = open(runf, "w")
    with tqdm(total=sum(len(r) for r in run.values()),
              ncols=80,
              desc=desc,
              leave=False) as pbar:
        model.eval()
        out = {}
        for records in data.iter_valid_records(model, dataset, run,
                                               BATCH_SIZE):
            scores, grads = model.grad_forward(records['query_tok'],
                                               records['query_mask'],
                                               records['doc_tok'],
                                               records['doc_mask'],
                                               layer_n=layer_n)
            qid = records['query_id'][0]
            did = records['doc_id'][0]
            score = scores[0, 0].item()

            grad = torch.mean(grads, dim=0)
            # sum of |gradient| per token position: position 0 is [CLS], positions
            # 1-20 are assumed to hold query tokens, and 22 onward document tokens
            grad_sumabs = torch.sum(torch.abs(grad), dim=1)
            cls_sum = grad_sumabs[0]
            query_sum = torch.sum(grad_sumabs[1:21])
            document_sum = torch.sum(grad_sumabs[22:-1])
            query_avg = torch.mean(grad_sumabs[1:21])
            document_avg = torch.mean(grad_sumabs[22:-1])
            ratio = query_sum.item() / document_sum.item()
            print(qid,
                  did,
                  score,
                  ratio,
                  cls_sum.item(),
                  query_sum.item(),
                  document_sum.item(),
                  file=runfile)

            if qid == target_qid and did in target_did:
                print("found!!")
                qtk = model.tokenizer.tokenize(dataset[0].get(qid))
                dtk = model.tokenizer.tokenize(dataset[1].get(did))
                out['qtok'] = qtk
                out['dtok'] = dtk
                out['attention'] = scores
                out['grad'] = grad
                torch.save(out, "./models/" + outdir + "/" + qid + did + ".pt")
            pbar.update(len(records['query_id']))
            cnt += 1
    runfile.close()
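
The saved tensors can be reloaded later for inspection or plotting; a small sketch using the target qid/did hard-coded above (the concrete `outdir` is whatever was passed to interpret_model):

saved = torch.load("./models/" + outdir + "/" + "303" + "FT944-128" + ".pt")
print(len(saved['qtok']), len(saved['dtok']))  # query / document wordpiece tokens
print(saved['grad'].shape)                     # mean gradient tensor for layer_n
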
Example #6
def score_model(model, dataset, run, passageAgg, desc='valid'):
    BATCH_SIZE = 16
    # qid -> did -> score: a nested defaultdict whose inner default score is 0.0
    rerank_run = defaultdict(lambda: defaultdict(float))
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()),
                               ncols=80,
                               desc=desc,
                               leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run,
                                               BATCH_SIZE):

            scores = model(records['query_tok'], records['query_mask'],
                           records['doc_tok'], records['doc_mask'])
            if passageAgg == 'first':
                for qid, pid, score in zip(records['query_id'],
                                           records['doc_id'], scores):
                    did = pid.split("%p")[0]
                    if did not in rerank_run[qid]:
                        rerank_run[qid][did] = score.item()
            elif passageAgg == 'sum':
                for qid, pid, score in zip(records['query_id'],
                                           records['doc_id'], scores):
                    did = pid.split("%p")[0]
                    rerank_run[qid][did] += score.item()
            elif passageAgg == 'max':
                for qid, pid, score in zip(records['query_id'],
                                           records['doc_id'], scores):
                    did = pid.split("%p")[0]
                    #print("%s %s %f" % (qid, did, score.item()))
                    # defaults to 0.0 when the document has not been seen before
                    if score.item() > rerank_run[qid][did]:
                        rerank_run[qid][did] = score.item()
            pbar.update(len(records['query_id']))
    #print(rerank_run[64527]["D414820"])
    return rerank_run
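
The aggregation above relies on passage ids of the form `<did>%p<n>`; a sketch of how a document store might be expanded into passages under that naming convention (the fixed-length word splitting is an illustrative assumption):

def split_into_passages(docs, passage_len=150):
    # docs: did -> text; returns pid -> text with pids like "D1%p0", "D1%p1", ...
    passages = {}
    for did, text in docs.items():
        words = text.split()
        for n, start in enumerate(range(0, len(words), passage_len)):
            passages[f"{did}%p{n}"] = " ".join(words[start:start + passage_len])
    return passages
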
Example #7
    def computeScoresFromRawOverride(self, query, docs):
        if self.debugPrint:
            print('getScores', query.id, query.text)

        queryData = {query.id: query.text}
        # A run maps query IDs to arrays of document IDs; see iter_valid_records (train.py)
        run = {query.id: [e.id for e in docs]}

        docData = {}
        for e in docs:
            docData[e.id] = e.text

        sampleRet = {}

        if docData:

            # based on the code from run_model function (train.py)
            dataSet = queryData, docData
            for records in data.iter_valid_records(self.model, dataSet, run,
                                                   self.batchSize):
                scores = self.model(records['query_tok'],
                                    records['query_mask'], records['doc_tok'],
                                    records['doc_mask'])
                for qid, did, score in zip(records['query_id'],
                                           records['doc_id'], scores):
                    score = score.item()  # From tensor to value
                    if self.debugPrint:
                        print(score, did, docData[did])
                    # Note that each element must be an array, b/c
                    # we can generate more than one feature per document!
                    sampleRet[did] = [score]

        if self.debugPrint:
            print('All scores:', sampleRet)

        return sampleRet
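
The method only assumes that `query` and each entry of `docs` expose `.id` and `.text`; a minimal call sketch with a hypothetical plain container, where `ranker` is an already-constructed instance of the surrounding class (not shown here):

from collections import namedtuple

Entry = namedtuple('Entry', ['id', 'text'])  # hypothetical container with the expected fields
query = Entry('101', 'what is information retrieval')
docs = [Entry('D1', 'information retrieval is ...'), Entry('D2', 'unrelated text ...')]
features = ranker.computeScoresFromRawOverride(query, docs)  # did -> [score]
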
Example #8
def run_model(model, dataset, run, desc='valid'):
    rerank_run = defaultdict(dict)
    vocab = model.tokenizer.get_vocab()
    true_id = vocab[model.tokenizer.tokenize("true")[0]]
    false_id = vocab[model.tokenizer.tokenize("false")[0]]
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()),
                               ncols=80,
                               desc=desc) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run,
                                               BATCH_SIZE):
            logits = model.generate(records['query_tok'],
                                    records['query_mask'], records['doc_tok'],
                                    records['doc_mask'])
            # scores = logits.softmax(dim=-1)[:, true_id]
            true_logits = logits[:, true_id].unsqueeze(dim=-1)
            false_logits = logits[:, false_id].unsqueeze(dim=-1)
            tf_logits = torch.cat((true_logits, false_logits), dim=-1)
            scores = tf_logits.softmax(dim=-1)[:, 0]
            for qid, did, score in zip(records['query_id'], records['doc_id'],
                                       scores):
                rerank_run[qid][did] = score.item()
            pbar.update(len(records['query_id']))
    return rerank_run
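
The scoring step reduces to a two-way softmax over the "true" and "false" vocabulary logits, as in monoT5-style rerankers; a toy numeric check of just that step, with made-up logit values:

import torch

logits = torch.tensor([[2.0, -1.0], [0.5, 0.5]])  # columns: [true_logit, false_logit], made-up values
scores = logits.softmax(dim=-1)[:, 0]             # probability mass assigned to "true"
print(scores)                                     # ~ tensor([0.9526, 0.5000])
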