# Example 1
def validate(passages, answers, closest_docs, query_embedding2id, passage_embedding2id):
    """Compute cumulative top-k answer-hit accuracy for retrieved passages.

    For every query, each of its top-ranked passages is checked for a gold
    answer with ``has_answer``; the first rank at which a hit occurs is then
    folded into cumulative top-k hit counts, which are finally normalized by
    the number of queries.

    Returns a list of length n_docs where entry k-1 is the top-k accuracy.
    """
    tokenizer = SimpleTokenizer(**{})

    logger.info('Matching answers in top docs...')
    per_question_hits = []
    for q_idx in range(closest_docs.shape[0]):
        q_id = query_embedding2id[q_idx]
        retrieved = [passage_embedding2id[p_idx] for p_idx in closest_docs[q_idx]]
        per_question_hits.append([
            has_answer(answers[q_id], passages[p_id][0], tokenizer)
            for p_id in retrieved
        ])

    logger.info('Per question validation results len=%d', len(per_question_hits))

    n_docs = len(closest_docs[0])
    top_k_hits = [0] * n_docs
    for hits in per_question_hits:
        first_hit = next((rank for rank, hit in enumerate(hits) if hit), None)
        if first_hit is not None:
            # A hit at rank r counts as a hit for every k >= r+1.
            for rank in range(first_hit, n_docs):
                top_k_hits[rank] += 1

    logger.info('Validation results: top k documents hits %s', top_k_hits)
    top_k_hits = [count / len(closest_docs) for count in top_k_hits]
    logger.info('Validation results: top k documents hits accuracy %s', top_k_hits)
    return top_k_hits
# Example 2
def GenerateNegativeQueryID(args, passages, answers, query_embedding2id, passage_embedding2id, closest_ans, training_query_positive_id_inversed):
    """For each passage, pick hard-negative query ids from its nearest queries.

    A candidate query is skipped when it is a known positive for the passage
    or already selected. Each surviving candidate consumes one unit of the
    ``args.negative_sample`` budget whether or not it is kept; it is kept only
    when the passage text does not contain the candidate's answer.

    Returns a dict mapping passage id -> list of negative query ids.
    """
    tokenizer = SimpleTokenizer(**{})
    passage_negative_query = {}

    for p_idx in range(closest_ans.shape[0]):
        p_id = passage_embedding2id[p_idx]

        positives = training_query_positive_id_inversed[p_id]
        candidates = [query_embedding2id[q_idx] for q_idx in closest_ans[p_idx]]

        selected = []
        passage_negative_query[p_id] = selected
        passage_text = passages[p_id][0]

        examined = 0
        for q_id in candidates:
            if q_id in positives or q_id in selected:
                continue
            if examined >= args.negative_sample:
                break
            if not has_answer(answers[q_id], passage_text, tokenizer):
                selected.append(q_id)
            examined += 1

    return passage_negative_query
# Example 3
def GenerateNegativePassaageID(args, passages, answers, query_embedding2id, passage_embedding2id, closest_docs, training_query_positive_id):
    """For each query, collect up to ``args.negative_sample`` hard-negative passages.

    A retrieved passage is a hard negative when it is not the query's known
    positive passage and its text does not contain any of the query's answers.

    Fix: the budget counter (flagged ``# BUG?`` in the original) advanced even
    for candidates rejected by the answer check, so fewer than
    ``args.negative_sample`` negatives could be collected even when enough
    candidates remained in the ranked list. The budget now counts passages
    actually kept (``len(negatives)``).

    Returns a dict mapping query id -> list of negative passage ids.
    """
    query_negative_passage = {}

    tok_opts = {}
    tokenizer = SimpleTokenizer(**tok_opts)

    for query_idx in range(closest_docs.shape[0]):
        query_id = query_embedding2id[query_idx]

        pos_pid = training_query_positive_id[query_id]
        doc_ids = [passage_embedding2id[pidx] for pidx in closest_docs[query_idx]]

        negatives = []
        query_negative_passage[query_id] = negatives

        for doc_id in doc_ids:
            # Skip the known positive and duplicates among the candidates.
            if doc_id == pos_pid or doc_id in negatives:
                continue
            # Stop once the requested number of negatives has been kept.
            if len(negatives) >= args.negative_sample:
                break

            text = passages[doc_id][0]
            if not has_answer(answers[query_id], text, tokenizer):
                negatives.append(doc_id)

    return query_negative_passage
# Example 4
def validate(pred_file_path, n_docs, dpr_result=False):
    """Compute top-k answer-hit accuracies from a saved prediction JSON file.

    Each prediction entry holds ``answers`` and either retriever contexts
    (``ctxs``) or, when ``dpr_result`` is True, DPR-style
    ``positive_ctxs``/``negative_ctxs`` ranked by ``score``.

    Fixes over the original:
    - ``print('... %d', x)`` never interpolated (printed a literal ``%d``);
      messages are now properly formatted.
    - the prediction file is opened with a context manager (no leaked handle).
    - the bare ``except:`` is narrowed to the key errors the DPR branch can
      actually raise when the DPR-style keys are absent.
    - the top-20/100 summary is only printed when n_docs >= 100 (previously
      an IndexError for smaller n_docs).
    - unused local ``question`` removed.

    Returns a list of length n_docs where entry k-1 is the top-k accuracy.
    """
    print('Validating: ', pred_file_path, flush=True)
    with open(pred_file_path) as f:
        prediction = json.load(f)

    tok_opts = {}
    tokenizer = SimpleTokenizer(**tok_opts)

    print('Matching answers in top docs...')
    scores = []
    for query_ans_doc in prediction:
        answers = query_ans_doc['answers']
        if not dpr_result:
            cxts = query_ans_doc['ctxs'][:n_docs]
        else:
            try:
                # Non-exact positives plus all negatives, best score first.
                cxts = [
                    d for d in query_ans_doc['positive_ctxs']
                    if d['title_score'] != 1.0
                ] + query_ans_doc['negative_ctxs']
                cxts.sort(reverse=True, key=lambda d: d['score'])
                cxts = cxts[:n_docs]
            except (KeyError, TypeError):
                # DPR-style keys missing; fall back to plain 'ctxs'.
                cxts = query_ans_doc['ctxs'][:n_docs]

        hits = []
        for doc in cxts:
            if dpr_result:
                doc = doc['text']
            hits.append(has_answer(answers, doc, tokenizer))
        scores.append(hits)

    print('Per question validation results len=%d' % len(scores))

    top_k_hits = [0] * n_docs
    for question_hits in scores:
        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
        if best_hit is not None:
            # A hit at rank r counts as a hit for every k >= r+1.
            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]

    print('Validation results: top k documents hits %s' % top_k_hits)
    top_k_hits = [v / len(prediction) for v in top_k_hits]
    print('Validation results: top k documents hits accuracy %s' % top_k_hits)
    if n_docs >= 100:
        print('Validation results: top 20/100 documents hits accuracy ',
              top_k_hits[19], '/', top_k_hits[99])

    return top_k_hits
# Example 5
def validate(passages, answers, closest_docs, query_embedding2id, passage_embedding2id, questions, prediction_ouput_file=None):
    """Compute top-k answer-hit accuracies and optionally dump predictions.

    Like the embedding-based ``validate`` above, but additionally records a
    per-query prediction entry (question, answers, retrieved contexts) and,
    when ``prediction_ouput_file`` is given, writes the list to that path as
    JSON. (Parameter name ``prediction_ouput_file`` — sic — is kept for
    caller compatibility.)

    Fix: removed a debug leftover (bare ``except:`` around the doc-id lookup
    that dropped into ``pdb.set_trace()``) — it swallowed real errors and
    hangs non-interactive runs; a bad index now raises normally.

    Returns a list of length n_docs where entry k-1 is the top-k accuracy.
    """
    tok_opts = {}
    tokenizer = SimpleTokenizer(**tok_opts)

    prediction = []

    logger.info('Matching answers in top docs...')
    scores = []
    for query_idx in range(closest_docs.shape[0]):
        query_id = query_embedding2id[query_idx]
        doc_ids = [passage_embedding2id[pidx] for pidx in closest_docs[query_idx]]

        hits = []
        cxts = []
        for doc_id in doc_ids:
            text = passages[doc_id][0]
            hits.append(has_answer(answers[query_id], text, tokenizer))
            # passages[doc_id] is (text, title) — presumably; verify against loader.
            cxts.append({'id': str(doc_id), 'text': text, 'title': passages[doc_id][1]})
        prediction.append({'question': questions[query_id], 'answers': answers[query_id], 'ctxs': cxts})
        scores.append(hits)

    logger.info('Per question validation results len=%d', len(scores))
    logger.info('prediction size=%d', len(prediction))

    n_docs = len(closest_docs[0])
    top_k_hits = [0] * n_docs
    for question_hits in scores:
        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
        if best_hit is not None:
            # A hit at rank r counts as a hit for every k >= r+1.
            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]

    logger.info('Validation results: top k documents hits %s', top_k_hits)
    top_k_hits = [v / len(closest_docs) for v in top_k_hits]
    logger.info('Validation results: top k documents hits accuracy %s', top_k_hits)

    if prediction_ouput_file:
        with open(prediction_ouput_file, 'w') as f:
            json.dump(prediction, f)

    return top_k_hits