def evaluate_results(predictions,
                     qids,
                     questions,
                     answers,
                     args,
                     evidences,
                     scores,
                     titles,
                     se_positions=None):
    # Filter predictions by the candidate set if a candidate file is given
    if args.candidate_path is not None:
        candidates = set()
        with open(args.candidate_path) as f:
            for line in f:
                line = line.strip().lower()
                candidates.add(line)
        logger.info(
            f'{len(candidates)} candidates are loaded from {args.candidate_path}'
        )
        topk_preds = [
            list(
                filter(
                    lambda x: (x in candidates) or (x.lower() in candidates),
                    a)) for a in predictions
        ]
        topk_preds = [
            a[:args.top_k] if len(a) > 0 else [''] for a in topk_preds
        ]
        predictions = topk_preds[:]
        top1_preds = [a[0] for a in topk_preds]
    else:
        predictions = [
            a[:args.top_k] if len(a) > 0 else [''] for a in predictions
        ]
        top1_preds = [a[0] for a in predictions]
    no_ans = sum([a == '' for a in top1_preds])
    logger.info(f'no_ans/all: {no_ans}/{len(top1_preds)}')
    logger.info(f'Evaluating {len(top1_preds)} answers')

    # Get em/f1
    f1s, ems = [], []
    for prediction, groundtruth in zip(top1_preds, answers):
        if len(groundtruth) == 0:
            f1s.append(0)
            ems.append(0)
            continue
        f1s.append(max([f1_score(prediction, gt)[0] for gt in groundtruth]))
        ems.append(
            max([exact_match_score(prediction, gt) for gt in groundtruth]))
    final_f1, final_em = np.mean(f1s), np.mean(ems)
    if not args.regex:
        logger.info('EM: %.2f, F1: %.2f' % (final_em * 100, final_f1 * 100))

    def long_answer_em(prediction, groundtruths, se_pos):
        # A prediction counts as a long-answer match if some gold answer
        # occurs in it and the (start, end) positions in se_pos fall inside
        # that occurrence.
        em = []
        for gt in groundtruths:
            start = prediction.find(gt)
            if start != -1 and se_pos[0] >= start and se_pos[1] <= start + len(gt):
                em.append(1)
            else:
                em.append(0)
        return max(em) if em else 0

    # Top 1/k em (or regex em)
    exact_match_topk = 0
    exact_match_top1 = 0
    f1_score_topk = 0
    f1_score_top1 = 0
    redundant_topk = 0
    pred_out = {}
    if "question_type" in args and "4" in args.question_type:
        exact_match_long_topk = 0
        exact_match_long_top1 = 0
    for i in range(len(predictions)):
        # For debugging
        if i < 10:
            logger.info(f'{i+1}) {questions[i]}')
            logger.info(
                f'=> groundtruths: {answers[i]}, top 5 prediction: {predictions[i][:5]}'
            )

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score
        em_topk = max([
            drqa_metric_max_over_ground_truths(match_fn, prediction,
                                               answers[i])
            for prediction in predictions[i][:args.top_k]
        ])
        em_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i],
                                                     answers[i])
        exact_match_topk += em_topk
        exact_match_top1 += em_top1

        if "question_type" in args and "4" in args.question_type:
            em_long_topk = [
                long_answer_em(prediction, answers[i], se)
                for prediction, se in zip(predictions[i][:args.top_k],
                                          se_positions[i][:args.top_k])
            ]
            exact_match_long_topk += max(em_long_topk)
            exact_match_long_top1 += em_long_topk[0]
            if i < 10:
                logger.info(
                    f"Long-answer EM running counts: top 1: {exact_match_long_top1}, "
                    f"top {args.top_k}: {exact_match_long_topk}")

        # Compute top-k redundancy (could be ill-defined for regex)
        rd_topk = sum([
            drqa_metric_max_over_ground_truths(match_fn, prediction,
                                               [predictions[i][0]])
            for prediction in predictions[i][:args.top_k]
        ])
        redundant_topk += rd_topk

        f1_topk = 0
        f1_top1 = 0
        if not args.regex:
            match_fn = lambda x, y: f1_score(x, y)[0]
            f1_topk = max([
                drqa_metric_max_over_ground_truths(match_fn, prediction,
                                                   answers[i])
                for prediction in predictions[i][:args.top_k]
            ])
            f1_top1 = drqa_metric_max_over_ground_truths(
                match_fn, top1_preds[i], answers[i])
            f1_score_topk += f1_topk
            f1_score_top1 += f1_top1

        # Score statistics
        assert len(predictions[i]) <= args.top_k
        pred_out[qids[i]] = {
            'question': questions[i],
            'answer': answers[i],
            'prediction': predictions[i],
            'score': scores[i],
            'title': titles[i],
            'evidence': evidences[i] if evidences is not None else '',
            'em_top1': bool(em_top1),
            f'em_top{args.top_k}': bool(em_topk),
            'f1_top1': f1_top1,
            f'f1_top{args.top_k}': f1_topk,
            'se_pos': se_positions[i] if se_positions is not None else
            (-1, -1),
            'rd_topk': rd_topk,
        }

    if "question_type" in args and "4" in args.question_type:
        exact_match_top1 = exact_match_long_top1
        exact_match_topk = exact_match_long_topk

    total = len(predictions)
    exact_match_top1 = 100.0 * exact_match_top1 / total
    f1_score_top1 = 100.0 * f1_score_top1 / total
    logger.info({
        'exact_match_top1': exact_match_top1,
        'f1_score_top1': f1_score_top1
    })
    exact_match_topk = 100.0 * exact_match_topk / total
    f1_score_topk = 100.0 * f1_score_topk / total
    logger.info({
        f'exact_match_top{args.top_k}': exact_match_topk,
        f'f1_score_top{args.top_k}': f1_score_topk
    })
    redundant_topk = redundant_topk / total
    logger.info({f'redundancy of top{args.top_k}': redundant_topk})

    # Dump predictions
    if len(args.load_dir) == 0:
        pred_dir = os.path.join(os.environ['SAVE_DIR'], 'pred')
    else:
        pred_dir = os.path.join(args.load_dir, 'pred')
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)

    # Build the prediction path unconditionally: it is also needed below
    # when args.eval_psg is set, even if the file is not saved here.
    pred_path = os.path.join(
        pred_dir,
        os.path.splitext(os.path.basename(args.test_path))[0] +
        f'_{total}_top{args.top_k}.pred')
    if args.save_pred:
        logger.info(f'Saving prediction file to {pred_path}')
        with open(pred_path, 'w') as f:
            json.dump(pred_out, f)

    logger.info(
        f"Results for {args.test_path}: exact match top 1: {exact_match_top1} | f1 score top 1: {f1_score_top1} | exact match top {args.top_k}: {exact_match_topk} | f1 score top {args.top_k}: {f1_score_topk}"
    )
    if args.wandb:
        wandb.log({
            "Test Path": args.test_path,
            "Model": args.load_dir,
            "Top 1 EM": exact_match_top1,
            "Top 1 F1": f1_score_top1,
            f"Top {args.top_k} EM": exact_match_topk,
            f"Top {args.top_k} F1": f1_score_topk
        })
        wandb_table = wandb.Table(columns=[
            "Model", "Question Type", "Top 1 EM", "Top 1 F1",
            f"Top {args.top_k} EM", f"Top {args.top_k} F1"
        ])
        wandb_table.add_data(args.load_dir, args.question_type,
                             exact_match_top1, f1_score_top1, exact_match_topk,
                             f1_score_topk)
        wandb.log({args.test_path: wandb_table})

    # Evaluate passage retrieval
    if args.eval_psg:
        evaluate_results_psg(pred_path, args)

    return exact_match_top1, f1_score_top1, exact_match_topk, f1_score_topk
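
A minimal, self-contained sketch of the top-1/top-k exact-match aggregation performed above. The normalize, exact_match, and metric_max_over_ground_truths helpers below are simplified stand-ins for the DrQA-style utilities used in the snippet (drqa_exact_match_score, drqa_metric_max_over_ground_truths), and the data is made up for illustration.

import re
import string


def normalize(text):
    # SQuAD-style normalization: lowercase, drop punctuation and articles.
    text = text.lower()
    text = ''.join(ch for ch in text if ch not in set(string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())


def exact_match(prediction, groundtruth):
    return normalize(prediction) == normalize(groundtruth)


def metric_max_over_ground_truths(metric_fn, prediction, groundtruths):
    return max(metric_fn(prediction, gt) for gt in groundtruths)


# One question with its top-k predictions and gold answers.
predictions = [['Barack Obama', 'Obama', 'George W. Bush']]
answers = [['Barack Obama']]
top_k = 3

em_top1 = sum(
    metric_max_over_ground_truths(exact_match, preds[0], gts)
    for preds, gts in zip(predictions, answers))
em_topk = sum(
    max(metric_max_over_ground_truths(exact_match, p, gts) for p in preds[:top_k])
    for preds, gts in zip(predictions, answers))
print(100.0 * em_top1 / len(predictions), 100.0 * em_topk / len(predictions))  # 100.0 100.0
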
Example #2
def evaluate_results(predictions,
                     qids,
                     questions,
                     answers,
                     args,
                     evidences,
                     scores,
                     titles,
                     q_tokens=None):
    wandb.init(project="DensePhrases (open)",
               mode="online" if args.wandb else "disabled")
    wandb.config.update(args)

    # Filter predictions by the candidate set if a candidate file is given
    if args.candidate_path is not None:
        candidates = set()
        with open(args.candidate_path) as f:
            for line in f:
                line = line.strip().lower()
                candidates.add(line)
        logger.info(
            f'{len(candidates)} candidates are loaded from {args.candidate_path}'
        )
        topk_preds = [
            list(
                filter(
                    lambda x: (x in candidates) or (x.lower() in candidates),
                    a)) for a in predictions
        ]
        topk_preds = [
            a[:args.top_k] if len(a) > 0 else [''] for a in topk_preds
        ]
        predictions = topk_preds[:]
        top1_preds = [a[0] for a in topk_preds]
    else:
        predictions = [
            a[:args.top_k] if len(a) > 0 else [''] for a in predictions
        ]
        top1_preds = [a[0] for a in predictions]
    no_ans = sum([a == '' for a in top1_preds])
    logger.info(f'no_ans/all: {no_ans}/{len(top1_preds)}')
    logger.info(f'Evaluating {len(top1_preds)} answers.')

    # Get em/f1
    f1s, ems = [], []
    for prediction, groundtruth in zip(top1_preds, answers):
        if len(groundtruth) == 0:
            f1s.append(0)
            ems.append(0)
            continue
        f1s.append(max([f1_score(prediction, gt)[0] for gt in groundtruth]))
        ems.append(
            max([exact_match_score(prediction, gt) for gt in groundtruth]))
    final_f1, final_em = np.mean(f1s), np.mean(ems)
    logger.info('EM: %.2f, F1: %.2f' % (final_em * 100, final_f1 * 100))

    # Top 1/k em (or regex em)
    exact_match_topk = 0
    exact_match_top1 = 0
    f1_score_topk = 0
    f1_score_top1 = 0
    pred_out = {}
    for i in range(len(predictions)):
        # For debugging
        if i < 3:
            logger.info(f'{i+1}) {questions[i]}')
            logger.info(
                f'=> groundtruths: {answers[i]}, top 5 prediction: {predictions[i][:5]}'
            )

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score
        em_topk = max([
            drqa_metric_max_over_ground_truths(match_fn, prediction,
                                               answers[i])
            for prediction in predictions[i][:args.top_k]
        ])
        em_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i],
                                                     answers[i])
        exact_match_topk += em_topk
        exact_match_top1 += em_top1

        f1_topk = 0
        f1_top1 = 0
        if not args.regex:
            match_fn = lambda x, y: f1_score(x, y)[0]
            f1_topk = max([
                drqa_metric_max_over_ground_truths(match_fn, prediction,
                                                   answers[i])
                for prediction in predictions[i][:args.top_k]
            ])
            f1_top1 = drqa_metric_max_over_ground_truths(
                match_fn, top1_preds[i], answers[i])
            f1_score_topk += f1_topk
            f1_score_top1 += f1_top1

        pred_out[qids[i]] = {
            'question': questions[i],
            'answer': answers[i],
            'prediction': predictions[i],
            'score': scores[i],
            'title': titles[i],
            'evidence': evidences[i] if evidences is not None else '',
            'em_top1': bool(em_top1),
            f'em_top{args.top_k}': bool(em_topk),
            'f1_top1': f1_top1,
            f'f1_top{args.top_k}': f1_topk,
            'q_tokens': q_tokens[i] if q_tokens is not None else ['']
        }

    total = len(predictions)
    exact_match_top1 = 100.0 * exact_match_top1 / total
    f1_score_top1 = 100.0 * f1_score_top1 / total
    logger.info({
        'exact_match_top1': exact_match_top1,
        'f1_score_top1': f1_score_top1
    })
    exact_match_topk = 100.0 * exact_match_topk / total
    f1_score_topk = 100.0 * f1_score_topk / total
    logger.info({
        f'exact_match_top{args.top_k}': exact_match_topk,
        f'f1_score_top{args.top_k}': f1_score_topk
    })
    wandb.log({
        "Top1 EM": exact_match_top1,
        "Top1 F1": f1_score_top1,
        "Topk EM": exact_match_topk,
        "Topk F1": f1_score_topk
    })

    # Dump predictions
    if len(args.query_encoder_path) == 0:
        pred_dir = os.path.join(os.environ['DPH_SAVE_DIR'], 'pred')
    else:
        pred_dir = os.path.join(args.query_encoder_path, 'pred')
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)
    pred_path = os.path.join(
        pred_dir,
        os.path.splitext(os.path.basename(args.test_path))[0] +
        f'_{total}.pred')
    logger.info(f'Saving prediction file to {pred_path}')
    with open(pred_path, 'w') as f:
        json.dump(pred_out, f)

    return exact_match_top1
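
A toy illustration of the candidate filtering shared by both evaluate_results variants above: a prediction is kept only if it (or its lowercased form) appears in the candidate vocabulary, the list is truncated to top-k, and an empty string serves as the fallback. The candidate set is hard-coded here instead of being read from args.candidate_path.

candidates = {'paris', 'london', 'berlin'}  # normally loaded from args.candidate_path
predictions = [['Paris', 'Texas', 'London'], ['Tokyo']]
top_k = 2

topk_preds = [
    [p for p in preds if p in candidates or p.lower() in candidates]
    for preds in predictions
]
topk_preds = [p[:top_k] if len(p) > 0 else [''] for p in topk_preds]
top1_preds = [p[0] for p in topk_preds]
print(topk_preds)  # [['Paris', 'London'], ['']]
print(top1_preds)  # ['Paris', '']
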
Example #3
def main(args):
    with open(args.input_path, encoding='utf-8') as f:
        data = json.load(f)

    output_data = []
    nlp = English()
    sentencizer = nlp.add_pipe('sentencizer')

    for i, sample_id in enumerate(tqdm(data, desc="Processing Inputs")):
        sample = data[sample_id]

        question = sample['question']
        answers = sample['answer']
        predictions = sample['prediction'][0:args.top_k]
        titles = sample['title'][0:args.top_k]
        evidences = sample['evidence'][0:args.top_k]
        scores = sample['score'][0:args.top_k]
        se_pos = sample['se_pos'][0:args.top_k]

        f1s = [
            max([f1_score(prediction, gt)[0] for gt in answers])
            for prediction in predictions
        ]
        ems = [
            max([exact_match_score(prediction, gt) for gt in answers])
            for prediction in predictions
        ]

        # skip samples where none of the predictions is an exact match
        if max(ems) < 1:
            continue

        is_impossible = []
        start_pos = []
        end_pos = []

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score

        # a prediction is answerable only if it is non-empty and matches one of the gold answers
        for pred_idx, pred in enumerate(predictions):
            if pred != "" and drqa_metric_max_over_ground_truths(
                    match_fn, pred, answers):
                is_impossible.append(False)
                answer_start = find_substring_and_return_first(
                    pred, evidences[pred_idx])
                start_pos.append(answer_start)
                end_pos.append(answer_start + len(pred) - 1)
            else:
                is_impossible.append(True)
                start_pos.append(-1)
                end_pos.append(-1)

        docs = [nlp(evi) for evi in evidences]
        spans = [
            doc.char_span(
                se_pos[j][0],
                se_pos[j][1] if se_pos[j][1] >= se_pos[j][0] else se_pos[j][0],
                alignment_mode='expand') for j, doc in enumerate(docs)
        ]
        sents = [
            doc[span[0].sent.start:span[-1].sent.end].text
            if len(span) > 0 else "" for doc, span in zip(docs, spans)
        ]

        # the retrieval score could also be added here if the reader were made to use it
        output_data.append({
            'question': question,
            'answer': answers,
            'prediction': predictions,
            'title': [title[0] for title in titles],
            'evidence': evidences,
            'is_impossible': is_impossible,
            'start_pos': start_pos,
            'end_pos': end_pos,
            'score': scores,
            'f1s': f1s,
            'exact_matches': ems,
            'se_pos': se_pos,
            'sentence': sents,
        })

    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump({'data': output_data}, f)
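
The sentence extraction in main relies on spaCy's rule-based sentencizer plus Doc.char_span with alignment_mode='expand' to map a character-level answer span back to its containing sentence(s). Below is a self-contained sketch of that mapping with a hard-coded evidence string and span in place of the .pred file contents.

from spacy.lang.en import English

nlp = English()
nlp.add_pipe('sentencizer')

evidence = "Paris is the capital of France. It lies on the Seine."
start = evidence.find("capital")
end = start + len("capital")

doc = nlp(evidence)
# Expand the character offsets to token boundaries, then take the sentence(s)
# spanned by the first and last tokens, mirroring the snippet above.
span = doc.char_span(start, end, alignment_mode='expand')
sentence = doc[span[0].sent.start:span[-1].sent.end].text if len(span) > 0 else ""
print(sentence)  # Paris is the capital of France.
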
Example #4
def recall(x, y):
    return f1_score(x, y)[2]
Example #5
def precision(x, y):
    return f1_score(x, y)[1]
Example #6
def f1(x, y):
    return f1_score(x, y)[0]
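
The three wrappers above index into the tuple returned by f1_score, which suggests an (f1, precision, recall) return convention. The token-level version below follows that convention purely for illustration; it is an assumed interface, not the actual DensePhrases implementation.

from collections import Counter


def f1_score(prediction, groundtruth):
    # Token-overlap F1 returning (f1, precision, recall) (assumed interface).
    pred_tokens = prediction.lower().split()
    gt_tokens = groundtruth.lower().split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0, 0.0, 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1, precision, recall


print(f1_score('the cat sat', 'a cat sat down'))  # (~0.57, ~0.67, 0.5)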