Example 1
def main(args):
    with open(args.input_path, encoding='utf-8') as f:
        data = json.load(f)

    output_data = []

    print('processing input')
    for sample_id in tqdm(data):
        sample = data[sample_id]

        question = sample['question']
        answers = sample['answer']
        predictions = sample['prediction'][0:args.top_k]
        titles = sample['title'][0:args.top_k]
        evidences = sample['evidence'][0:args.top_k]
        scores = sample['score'][0:args.top_k]

        is_impossible = []
        start_pos = []
        end_pos = []

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score

        #answer_text = ""
        #ds_context = ""
        #ds_title = ""
        # is_from_context = False

        # check whether each prediction matches one of the gold answers in the answer list
        for pred_idx, pred in enumerate(predictions):
            if pred != "" and drqa_metric_max_over_ground_truths(match_fn, pred, answers):
                is_impossible.append(False)
                answer_start = find_substring_and_return_first(pred, evidences[pred_idx])
                start_pos.append(answer_start)
                end_pos.append(answer_start + len(pred) - 1)
            else:
                is_impossible.append(True)
                start_pos.append(-1)
                end_pos.append(-1)

        # the retrieval score could also be used here if the reader were extended to consume it
        output_data.append({
            'question': question,
            'answer': answers,
            'prediction': predictions,
            'title': [title[0] for title in titles],
            'evidence': evidences,
            'is_impossible': is_impossible,
            'start_pos': start_pos,
            'end_pos': end_pos,
            'score': scores,
        })

    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump({'data': output_data}, f)
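
A minimal command-line entry point for this script could look like the sketch below. The flag names mirror the attributes read above (input_path, output_path, top_k, regex); the defaults and help strings are assumptions, not taken from the original repository.

# Hypothetical CLI wrapper for main(args); defaults are illustrative only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', required=True, help='JSON file with retrieval results')
    parser.add_argument('--output_path', required=True, help='where to write the reader-style dataset')
    parser.add_argument('--top_k', type=int, default=10, help='number of predictions to keep per question')
    parser.add_argument('--regex', action='store_true', help='use regex matching instead of exact match')
    main(parser.parse_args())
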
Example 2
def get_phrase_vecs(mips, questions, answers, outs, top_k=100, regex=False):
    assert mips is not None

    # Gather phrase metadata (doc/start/end indices, answer text) and start/end vectors
    phrase_idxs = [[(out_['doc_idx'], out_['start_idx'], out_['end_idx'],
                     out_['answer'], out_['start_vec'], out_['end_vec'])
                    for out_ in out] for out in outs]
    for b_idx, phrase_idx in enumerate(phrase_idxs):
        # pad to top_k * 2 (two separate top-k coming from start/end)
        while len(phrase_idxs[b_idx]) < top_k * 2:
            phrase_idxs[b_idx].append(
                (-1, 0, 0, '', np.zeros(768), np.zeros(768)))
        phrase_idxs[b_idx] = phrase_idxs[b_idx][:top_k * 2]
    flat_phrase_idxs = [
        phrase for phrase_idx in phrase_idxs for phrase in phrase_idx
    ]
    doc_idxs = [int(phrase_idx_[0]) for phrase_idx_ in flat_phrase_idxs]
    start_idxs = [int(phrase_idx_[1]) for phrase_idx_ in flat_phrase_idxs]
    end_idxs = [int(phrase_idx_[2]) for phrase_idx_ in flat_phrase_idxs]
    phrases = [phrase_idx_[3] for phrase_idx_ in flat_phrase_idxs]
    start_vecs = [phrase_idx_[4] for phrase_idx_ in flat_phrase_idxs]
    end_vecs = [phrase_idx_[5] for phrase_idx_ in flat_phrase_idxs]

    start_vecs = np.stack(
        # [mips.dequant(mips.offset, mips.scale, start_vec) # Use this for IVFSQ4
        [start_vec for start_vec, start_idx in zip(start_vecs, start_idxs)])

    end_vecs = np.stack(
        # [mips.dequant(mips.offset, mips.scale, end_vec) # Use this for IVFSQ4
        [end_vec for end_vec, end_idx in zip(end_vecs, end_idxs)])

    zero_mask = np.array([[1] if doc_idx >= 0 else [0]
                          for doc_idx in doc_idxs])
    start_vecs = start_vecs * zero_mask
    end_vecs = end_vecs * zero_mask

    # Find targets based on exact string match
    match_fn = drqa_regex_match_score if regex else drqa_exact_match_score  # Punctuation included
    targets = [[
        drqa_metric_max_over_ground_truths(match_fn, phrase[3], answer)
        for phrase in phrase_idx
    ] for phrase_idx, answer in zip(phrase_idxs, answers)]
    targets = [[ii if val else None for ii, val in enumerate(target)]
               for target in targets]

    # Reshape
    batch_size = len(answers)
    start_vecs = np.reshape(start_vecs, (batch_size, top_k * 2, -1))
    end_vecs = np.reshape(end_vecs, (batch_size, top_k * 2, -1))
    return start_vecs, end_vecs, targets
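
The pad-and-mask bookkeeping above is easier to see in isolation. Below is a self-contained numpy sketch with toy dimensions (vector size 4, top_k = 2, batch of 1) showing how entries padded with doc_idx == -1 are zeroed out before the final reshape; it illustrates the pattern and is not code from the project.

# Toy illustration of the padding / zero-masking / reshape pattern.
import numpy as np

top_k = 2
doc_idxs = [3, 7, -1, -1]                         # two real phrases, two padding entries
vecs = [np.ones(4), np.ones(4), np.zeros(4), np.zeros(4)]

stacked = np.stack(vecs)                          # (top_k * 2, dim)
zero_mask = np.array([[1] if d >= 0 else [0] for d in doc_idxs])
masked = stacked * zero_mask                      # padded rows become all-zero
batched = np.reshape(masked, (1, top_k * 2, -1))  # (batch, top_k * 2, dim)
print(batched.shape)                              # (1, 4, 4)
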
def evaluate_results(predictions,
                     qids,
                     questions,
                     answers,
                     args,
                     evidences,
                     scores,
                     titles,
                     se_positions=None):
    # Filter predictions if a candidate list is provided
    if args.candidate_path is not None:
        candidates = set()
        with open(args.candidate_path) as f:
            for line in f:
                line = line.strip().lower()
                candidates.add(line)
        logger.info(
            f'{len(candidates)} candidates are loaded from {args.candidate_path}'
        )
        topk_preds = [
            list(
                filter(
                    lambda x: (x in candidates) or (x.lower() in candidates),
                    a)) for a in predictions
        ]
        topk_preds = [
            a[:args.top_k] if len(a) > 0 else [''] for a in topk_preds
        ]
        predictions = topk_preds[:]
        top1_preds = [a[0] for a in topk_preds]
    else:
        predictions = [
            a[:args.top_k] if len(a) > 0 else [''] for a in predictions
        ]
        top1_preds = [a[0] for a in predictions]
    no_ans = sum([a == '' for a in top1_preds])
    logger.info(f'no_ans/all: {no_ans}, {len(top1_preds)}')
    logger.info(f'Evaluating {len(top1_preds)} answers')

    # Get em/f1
    f1s, ems = [], []
    for prediction, groundtruth in zip(top1_preds, answers):
        if len(groundtruth) == 0:
            f1s.append(0)
            ems.append(0)
            continue
        f1s.append(max([f1_score(prediction, gt)[0] for gt in groundtruth]))
        ems.append(
            max([exact_match_score(prediction, gt) for gt in groundtruth]))
    final_f1, final_em = np.mean(f1s), np.mean(ems)
    if not args.regex:
        logger.info('EM: %.2f, F1: %.2f' % (final_em * 100, final_f1 * 100))

    def long_answer_em(prediction, groundtruths, se_pos):
        em = []
        for gt in groundtruths:
            start = prediction.find(gt)
            if start != -1 and se_pos[0] >= start and se_pos[1] <= start + len(
                    gt):
                em.append(1)
            else:
                em.append(0)
        return max(em)

    # Top 1/k em (or regex em)
    exact_match_topk = 0
    exact_match_top1 = 0
    f1_score_topk = 0
    f1_score_top1 = 0
    redundant_topk = 0
    pred_out = {}
    if "question_type" in args and "4" in args.question_type:
        exact_match_long_topk = 0
        exact_match_long_top1 = 0
    for i in range(len(predictions)):
        # For debugging
        if i < 10:
            logger.info(f'{i+1}) {questions[i]}')
            logger.info(
                f'=> groundtruths: {answers[i]}, top 5 prediction: {predictions[i][:5]}'
            )

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score
        em_topk = max([
            drqa_metric_max_over_ground_truths(match_fn, prediction,
                                               answers[i])
            for prediction in predictions[i][:args.top_k]
        ])
        em_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i],
                                                     answers[i])
        exact_match_topk += em_topk
        exact_match_top1 += em_top1

        if "question_type" in args and "4" in args.question_type:
            em_long_topk = [
                long_answer_em(prediction, answers[i], se)
                for prediction, se in zip(predictions[i][:args.top_k],
                                          se_positions[i][:args.top_k])
            ]
            exact_match_long_topk += max(em_long_topk)
            exact_match_long_top1 += em_long_topk[0]
            if i < 10:
                logger.info(
                    f"Long-answer EM so far: top1 {exact_match_long_top1}, "
                    f"top{args.top_k} {exact_match_long_topk}"
                )

        # Compute top-k redundancy (could be ill-defined for regex)
        rd_topk = sum([
            drqa_metric_max_over_ground_truths(match_fn, prediction,
                                               [predictions[i][0]])
            for prediction in predictions[i][:args.top_k]
        ])
        redundant_topk += rd_topk

        f1_topk = 0
        f1_top1 = 0
        if not args.regex:
            match_fn = lambda x, y: f1_score(x, y)[0]
            f1_topk = max([
                drqa_metric_max_over_ground_truths(match_fn, prediction,
                                                   answers[i])
                for prediction in predictions[i][:args.top_k]
            ])
            f1_top1 = drqa_metric_max_over_ground_truths(
                match_fn, top1_preds[i], answers[i])
            f1_score_topk += f1_topk
            f1_score_top1 += f1_top1

        # Score statistics
        assert len(predictions[i]) <= args.top_k
        pred_out[qids[i]] = {
            'question': questions[i],
            'answer': answers[i],
            'prediction': predictions[i],
            'score': scores[i],
            'title': titles[i],
            'evidence': evidences[i] if evidences is not None else '',
            'em_top1': bool(em_top1),
            f'em_top{args.top_k}': bool(em_topk),
            'f1_top1': f1_top1,
            f'f1_top{args.top_k}': f1_topk,
            'se_pos': se_positions[i] if se_positions is not None else (-1, -1),
            'rd_topk': rd_topk,
        }

    if "question_type" in args and "4" in args.question_type:
        exact_match_top1 = exact_match_long_top1
        exact_match_topk = exact_match_long_topk

    total = len(predictions)
    exact_match_top1 = 100.0 * exact_match_top1 / total
    f1_score_top1 = 100.0 * f1_score_top1 / total
    logger.info({
        'exact_match_top1': exact_match_top1,
        'f1_score_top1': f1_score_top1
    })
    exact_match_topk = 100.0 * exact_match_topk / total
    f1_score_topk = 100.0 * f1_score_topk / total
    logger.info({
        f'exact_match_top{args.top_k}': exact_match_topk,
        f'f1_score_top{args.top_k}': f1_score_topk
    })
    redundant_topk = redundant_topk / total
    logger.info({f'redundancy of top{args.top_k}': redundant_topk})

    # Dump predictions
    if len(args.load_dir) == 0:
        pred_dir = os.path.join(os.environ['SAVE_DIR'], 'pred')
    else:
        pred_dir = os.path.join(args.load_dir, 'pred')
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)

    # Build pred_path unconditionally: it is also needed below when --eval_psg is set
    pred_path = os.path.join(
        pred_dir,
        os.path.splitext(os.path.basename(args.test_path))[0] +
        f'_{total}_top{args.top_k}.pred')
    if args.save_pred:
        logger.info(f'Saving prediction file to {pred_path}')
        with open(pred_path, 'w') as f:
            json.dump(pred_out, f)

    logger.info(
        f"Results for {args.test_path}: exact match top 1: {exact_match_top1} | f1 score top 1: {f1_score_top1} | exact match top {args.top_k}: {exact_match_topk} | f1 score top {args.top_k}: {f1_score_topk}"
    )
    if args.wandb:
        wandb.log({
            "Test Path": args.test_path,
            "Model": args.load_dir,
            "Top 1 EM": exact_match_top1,
            "Top 1 F1": f1_score_top1,
            f"Top {args.top_k} EM": exact_match_topk,
            f"Top {args.top_k} F1": f1_score_topk
        })
        wandb_table = wandb.Table(columns=[
            "Model", "Question Type", "Top 1 EM", "Top 1 F1",
            f"Top {args.top_k} EM", f"Top {args.top_k} F1"
        ])
        wandb_table.add_data(args.load_dir, args.question_type,
                             exact_match_top1, f1_score_top1, exact_match_topk,
                             f1_score_topk)
        wandb.log({args.test_path: wandb_table})

    # Evaluate passage retrieval (reads the prediction file dumped above, so --save_pred should be set)
    if args.eval_psg:
        evaluate_results_psg(pred_path, args)

    return exact_match_top1, f1_score_top1, exact_match_topk, f1_score_topk
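
The .pred file dumped above is a plain JSON object keyed by question id, with the per-question fields built in pred_out. A hedged snippet for loading one back and inspecting a single entry (the file name here is illustrative):

# Hypothetical follow-up: reload a dumped prediction file and inspect one entry.
import json

with open('nq-dev_3610_top10.pred') as f:  # illustrative file name
    pred_out = json.load(f)

qid, entry = next(iter(pred_out.items()))
print(qid, entry['question'])
print('top-1 EM:', entry['em_top1'], '| top-1 F1:', entry['f1_top1'])
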
Example 4
def annotate_phrase_vecs(mips, q_ids, questions, answers, titles,
                         phrase_groups, args):
    assert mips is not None
    batch_size = len(answers)

    # Phrase groups are in size of [batch, top_k, values]
    # phrase_groups = [[(
    #     out_['doc_idx'], out_['start_idx'], out_['end_idx'], out_['answer'],
    #     out_['start_vec'], out_['end_vec'], out_['context'], out_['title'])
    #     for out_ in out] for out in outs
    # ]
    dummy_group = {
        'doc_idx': -1,
        'start_idx': 0,
        'end_idx': 0,
        'answer': '',
        'start_vec': np.zeros(768),
        'end_vec': np.zeros(768),
        'context': '',
        'title': ['']
    }

    # Pad phrase groups (two separate top-k coming from start/end, so pad with top_k*2)
    for b_idx, phrase_idx in enumerate(phrase_groups):
        while len(phrase_groups[b_idx]) < args.top_k * 2:
            phrase_groups[b_idx].append(dummy_group)
        assert len(phrase_groups[b_idx]) == args.top_k * 2

    # Flatten phrase groups
    flat_phrase_groups = [
        phrase for phrase_group in phrase_groups for phrase in phrase_group
    ]
    doc_idxs = [
        int(phrase_group['doc_idx']) for phrase_group in flat_phrase_groups
    ]
    start_vecs = [
        phrase_group['start_vec'] for phrase_group in flat_phrase_groups
    ]
    end_vecs = [phrase_group['end_vec'] for phrase_group in flat_phrase_groups]

    # Stack vectors and zero out the padded (dummy) entries
    start_vecs = np.stack(start_vecs)
    end_vecs = np.stack(end_vecs)
    zero_mask = np.array([[1] if doc_idx >= 0 else [0]
                          for doc_idx in doc_idxs])
    start_vecs = start_vecs * zero_mask
    end_vecs = end_vecs * zero_mask

    # Reshape
    start_vecs = np.reshape(start_vecs, (batch_size, args.top_k * 2, -1))
    end_vecs = np.reshape(end_vecs, (batch_size, args.top_k * 2, -1))

    # Dummy targets
    targets = [[None for phrase in phrase_group]
               for phrase_group in phrase_groups]
    p_targets = [[None for phrase in phrase_group]
                 for phrase_group in phrase_groups]

    # TODO: implement dynamic label_strategy based on the task name (label_strat = dynamic)

    # Annotate for L_phrase
    if 'phrase' in args.label_strat.split(','):
        match_fns = [
            drqa_regex_match_score if args.regex or
            ('trec' in q_id.lower()) else drqa_exact_match_score
            for q_id in q_ids
        ]
        targets = [[
            drqa_metric_max_over_ground_truths(match_fn, phrase['answer'],
                                               answer_set)
            for phrase in phrase_group
        ] for phrase_group, answer_set, match_fn in zip(
            phrase_groups, answers, match_fns)]
        targets = [[ii if val else None for ii, val in enumerate(target)]
                   for target in targets]

    # Annotate for L_doc
    if 'doc' in args.label_strat.split(','):
        p_targets = [[
            any(phrase['title'][0].lower() == tit.lower() for tit in title)
            for phrase in phrase_group
        ] for phrase_group, title in zip(phrase_groups, titles)]
        p_targets = [[ii if val else None for ii, val in enumerate(target)]
                     for target in p_targets]

    return start_vecs, end_vecs, targets, p_targets
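
Both annotation paths end with the same two-step encoding: a list of booleans per candidate, then the index of each match (or None). A compact standalone version of that pattern, for illustration:

# Minimal sketch of the "index if matched, else None" target encoding
# used for both phrase-level and document-level labels above.
def encode_targets(matches):
    """matches: list of booleans, one per candidate phrase."""
    return [ii if val else None for ii, val in enumerate(matches)]

print(encode_targets([False, True, False, True]))  # [None, 1, None, 3]
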
Example 5
def evaluate_results(predictions,
                     qids,
                     questions,
                     answers,
                     args,
                     evidences,
                     scores,
                     titles,
                     q_tokens=None):
    wandb.init(project="DensePhrases (open)",
               mode="online" if args.wandb else "disabled")
    wandb.config.update(args)

    # Filter predictions if a candidate list is provided
    if args.candidate_path is not None:
        candidates = set()
        with open(args.candidate_path) as f:
            for line in f:
                line = line.strip().lower()
                candidates.add(line)
        logger.info(
            f'{len(candidates)} candidates are loaded from {args.candidate_path}'
        )
        topk_preds = [
            list(
                filter(
                    lambda x: (x in candidates) or (x.lower() in candidates),
                    a)) for a in predictions
        ]
        topk_preds = [
            a[:args.top_k] if len(a) > 0 else [''] for a in topk_preds
        ]
        predictions = topk_preds[:]
        top1_preds = [a[0] for a in topk_preds]
    else:
        predictions = [
            a[:args.top_k] if len(a) > 0 else [''] for a in predictions
        ]
        top1_preds = [a[0] for a in predictions]
    no_ans = sum([a == '' for a in top1_preds])
    logger.info(f'no_ans/all: {no_ans}, {len(top1_preds)}')
    logger.info(f'Evaluating {len(top1_preds)} answers.')

    # Get em/f1
    f1s, ems = [], []
    for prediction, groundtruth in zip(top1_preds, answers):
        if len(groundtruth) == 0:
            f1s.append(0)
            ems.append(0)
            continue
        f1s.append(max([f1_score(prediction, gt)[0] for gt in groundtruth]))
        ems.append(
            max([exact_match_score(prediction, gt) for gt in groundtruth]))
    final_f1, final_em = np.mean(f1s), np.mean(ems)
    logger.info('EM: %.2f, F1: %.2f' % (final_em * 100, final_f1 * 100))

    # Top 1/k em (or regex em)
    exact_match_topk = 0
    exact_match_top1 = 0
    f1_score_topk = 0
    f1_score_top1 = 0
    pred_out = {}
    for i in range(len(predictions)):
        # For debugging
        if i < 3:
            logger.info(f'{i+1}) {questions[i]}')
            logger.info(
                f'=> groundtruths: {answers[i]}, top 5 prediction: {predictions[i][:5]}'
            )

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score
        em_topk = max([
            drqa_metric_max_over_ground_truths(match_fn, prediction,
                                               answers[i])
            for prediction in predictions[i][:args.top_k]
        ])
        em_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i],
                                                     answers[i])
        exact_match_topk += em_topk
        exact_match_top1 += em_top1

        f1_topk = 0
        f1_top1 = 0
        if not args.regex:
            match_fn = lambda x, y: f1_score(x, y)[0]
            f1_topk = max([
                drqa_metric_max_over_ground_truths(match_fn, prediction,
                                                   answers[i])
                for prediction in predictions[i][:args.top_k]
            ])
            f1_top1 = drqa_metric_max_over_ground_truths(
                match_fn, top1_preds[i], answers[i])
            f1_score_topk += f1_topk
            f1_score_top1 += f1_top1

        pred_out[qids[i]] = {
            'question': questions[i],
            'answer': answers[i],
            'prediction': predictions[i],
            'score': scores[i],
            'title': titles[i],
            'evidence': evidences[i] if evidences is not None else '',
            'em_top1': bool(em_top1),
            f'em_top{args.top_k}': bool(em_topk),
            'f1_top1': f1_top1,
            f'f1_top{args.top_k}': f1_topk,
            'q_tokens': q_tokens[i] if q_tokens is not None else ['']
        }

    total = len(predictions)
    exact_match_top1 = 100.0 * exact_match_top1 / total
    f1_score_top1 = 100.0 * f1_score_top1 / total
    logger.info({
        'exact_match_top1': exact_match_top1,
        'f1_score_top1': f1_score_top1
    })
    exact_match_topk = 100.0 * exact_match_topk / total
    f1_score_topk = 100.0 * f1_score_topk / total
    logger.info({
        f'exact_match_top{args.top_k}': exact_match_topk,
        f'f1_score_top{args.top_k}': f1_score_topk
    })
    wandb.log({
        "Top1 EM": exact_match_top1,
        "Top1 F1": f1_score_top1,
        "Topk EM": exact_match_topk,
        "Topk F1": f1_score_topk
    })

    # Dump predictions
    if len(args.query_encoder_path) == 0:
        pred_dir = os.path.join(os.environ['DPH_SAVE_DIR'], 'pred')
    else:
        pred_dir = os.path.join(args.query_encoder_path, 'pred')
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)
    pred_path = os.path.join(
        pred_dir,
        os.path.splitext(os.path.basename(args.test_path))[0] +
        f'_{total}.pred')
    logger.info(f'Saving prediction file to {pred_path}')
    with open(pred_path, 'w') as f:
        json.dump(pred_out, f)

    return exact_match_top1
Example 6
def main(args):
    with open(args.input_path, encoding='utf-8') as f:
        data = json.load(f)

    output_data = []
    nlp = English()
    nlp.add_pipe('sentencizer')  # rule-based sentence boundaries for the sentence lookup below

    for i, sample_id in enumerate(tqdm(data, desc="Processing Inputs")):
        sample = data[sample_id]

        question = sample['question']
        answers = sample['answer']
        predictions = sample['prediction'][0:args.top_k]
        titles = sample['title'][0:args.top_k]
        evidences = sample['evidence'][0:args.top_k]
        scores = sample['score'][0:args.top_k]
        se_pos = sample['se_pos'][0:args.top_k]

        f1s = [
            max([f1_score(prediction, gt)[0] for gt in answers])
            for prediction in predictions
        ]
        ems = [
            max([exact_match_score(prediction, gt) for gt in answers])
            for prediction in predictions
        ]

        # skip samples where none of the top-k predictions is an exact match
        if max(ems) < 1:
            continue

        is_impossible = []
        start_pos = []
        end_pos = []

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score

        # check whether each prediction matches one of the gold answers in the answer list
        for pred_idx, pred in enumerate(predictions):
            if pred != "" and drqa_metric_max_over_ground_truths(
                    match_fn, pred, answers):
                is_impossible.append(False)
                answer_start = find_substring_and_return_first(
                    pred, evidences[pred_idx])
                start_pos.append(answer_start)
                end_pos.append(answer_start + len(pred) - 1)
            else:
                is_impossible.append(True)
                start_pos.append(-1)
                end_pos.append(-1)

        docs = [nlp(evi) for evi in evidences]
        spans = [
            doc.char_span(
                se_pos[j][0],
                se_pos[j][1] if se_pos[j][1] >= se_pos[j][0] else se_pos[j][0],
                alignment_mode='expand') for j, doc in enumerate(docs)
        ]
        sents = [
            doc[span[0].sent.start:span[-1].sent.end].text
            if len(span) > 0 else "" for doc, span in zip(docs, spans)
        ]

        # the retrieval score could also be used here if the reader were extended to consume it
        output_data.append({
            'question': question,
            'answer': answers,
            'prediction': predictions,
            'title': [title[0] for title in titles],
            'evidence': evidences,
            'is_impossible': is_impossible,
            'start_pos': start_pos,
            'end_pos': end_pos,
            'score': scores,
            'f1s': f1s,
            'exact_matches': ems,
            'se_pos': se_pos,
            'sentence': sents,
        })

    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump({'data': output_data}, f)
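
The key spaCy step above is recovering the full sentence around a character span via char_span(..., alignment_mode='expand') and the token-level .sent attribute. A small standalone sketch with toy text (assumes spaCy is installed; the offsets below are end-exclusive, as char_span expects):

# Standalone sketch of sentence recovery around a character span.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe('sentencizer')

doc = nlp("DensePhrases retrieves phrases. The answer span lives in this sentence.")
start = doc.text.find("answer")
end = start + len("answer span")
span = doc.char_span(start, end, alignment_mode='expand')
sentence = doc[span[0].sent.start:span[-1].sent.end].text
print(sentence)  # "The answer span lives in this sentence."
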
Example 7
def top1(fn):
    return drqa_metric_max_over_ground_truths(fn, top1_preds[i], answers[i])
Example 8
def topk(fn):
    return max([
        drqa_metric_max_over_ground_truths(fn, p, answers[i])
        for p in predictions[i][:args.top_k]
    ])
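
These two helpers close over top1_preds, predictions, answers, args, and i from the enclosing evaluation loop. A closure-free variant of the same idea, with the data passed explicitly (names are illustrative; it assumes drqa_metric_max_over_ground_truths is importable from the same module):

# Illustrative rewrite of top1/topk without closures; match_fn selects the metric.
def score_top1(match_fn, top1_pred, gold_answers):
    return drqa_metric_max_over_ground_truths(match_fn, top1_pred, gold_answers)

def score_topk(match_fn, topk_preds, gold_answers):
    return max(drqa_metric_max_over_ground_truths(match_fn, p, gold_answers)
               for p in topk_preds)
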