Example #1
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info('Reading from cache: %s', cache_path)
        tydi_gold_dict = pickle.load(open(cache_path, 'rb'))
    else:
        tydi_gold_dict = eval_utils.read_annotation(FLAGS.gold_path)
        if FLAGS.cache_gold_data:
            logging.info('Caching gold data for future use to: %s', cache_path)
            pickle.dump(tydi_gold_dict, open(cache_path, 'wb'))
    total_ans_count = 0
    count = 0

    for ans in tydi_gold_dict.values():
        count += 1
        gold_has_answer = eval_utils.gold_has_minimal_answer(
            ans, FLAGS.minimal_non_null_threshold)
        total_ans_count += gold_has_answer

    logging.info('%d examples have minimal answers', total_ans_count)
    logging.info('*' * 40)
    tydi_pred_dict = eval_utils.read_prediction_jsonl(FLAGS.predictions_path)

    per_lang_gold = {}
    per_lang_pred = {}

    for ex_id, ex in tydi_gold_dict.items():
        if ex[0].language in per_lang_gold:
            per_lang_gold[ex[0].language][ex_id] = ex
        else:
            per_lang_gold[ex[0].language] = {ex_id: ex}
    for ex_id, ex in tydi_pred_dict.items():
        if ex.language in per_lang_pred:
            per_lang_pred[ex.language][ex_id] = ex
        else:
            per_lang_pred[ex.language] = {ex_id: ex}

    macro_avg_passage_scores = ([], [], [])
    macro_avg_minimal_scores = ([], [], [])

    language_list = [
        'english', 'arabic', 'bengali', 'finnish', 'indonesian', 'japanese',
        'swahili', 'korean', 'russian', 'telugu', 'thai'
    ]
    for lang in language_list:
        if lang in per_lang_pred:
            passage_answer_stats, minimal_answer_stats = score_answers(
                per_lang_gold.get(lang, {}), per_lang_pred[lang])

            # Passage selection task
            opt_result, _ = compute_pr_curves(passage_answer_stats,
                                              targets=[0.5])
            f1, precision, recall, _ = opt_result
            if lang != 'english':
                macro_avg_passage_scores[0].append(f1)
                macro_avg_passage_scores[1].append(precision)
                macro_avg_passage_scores[2].append(recall)
            print('Passage & ' + lang + ' & ' +
                  get_latex_str(f1, precision, recall))

            # Minimal answer span task
            opt_result, _ = compute_pr_curves(minimal_answer_stats,
                                              targets=[0.5])
            f1, precision, recall, _ = opt_result
            if lang != 'english':
                macro_avg_minimal_scores[0].append(f1)
                macro_avg_minimal_scores[1].append(precision)
                macro_avg_minimal_scores[2].append(recall)
            print('Minimal Answer & ' + lang + ' & ' +
                  get_latex_str(f1, precision, recall))

            if FLAGS.pretty_print:
                print('*' * 20)
                print(lang)
                print('Language: %s (%d)' %
                      (lang, len(per_lang_gold.get(lang, {}))))
                print('*' * 20)
                print('PASSAGE ANSWER R@P TABLE:')
                print_r_at_p_table(passage_answer_stats)
                print('*' * 20)
                print('MINIMAL ANSWER R@P TABLE:')
                print_r_at_p_table(minimal_answer_stats)
            else:
                metrics = get_metrics_with_answer_stats(
                    passage_answer_stats, minimal_answer_stats)
                print(json.dumps(metrics))

    print(
        'Total # examples in gold: %d, # ex. in pred: %d (including English)' %
        (len(tydi_gold_dict), len(tydi_pred_dict)))

    f1_list, precision_list, recall_list = macro_avg_passage_scores
    print('*** Macro Over %d Languages, excluding English ***' % len(f1_list))
    avg_passage_f1 = eval_utils.safe_average(f1_list)
    avg_passage_recall = eval_utils.safe_average(recall_list)
    avg_passage_precision = eval_utils.safe_average(precision_list)
    print('Passage F1:%.3f P:%.3f R:%.3f' %
          (avg_passage_f1, avg_passage_precision, avg_passage_recall))
    print(
        get_latex_str(avg_passage_f1, avg_passage_precision,
                      avg_passage_recall))

    f1_list, precision_list, recall_list = macro_avg_minimal_scores

    avg_minimal_f1 = eval_utils.safe_average(f1_list)
    avg_minimal_recall = eval_utils.safe_average(recall_list)
    avg_minimal_precision = eval_utils.safe_average(precision_list)
    print('Minimal F1:%.3f P:%.3f R:%.3f' %
          (avg_minimal_f1, avg_minimal_precision, avg_minimal_recall))
    print(
        get_latex_str(avg_minimal_f1, avg_minimal_precision,
                      avg_minimal_recall))
    print('*** / Aggregate Scores ***')

    aggregate_metrics = {
        'avg_passage_f1': avg_passage_f1,
        'avg_passage_recall': avg_passage_recall,
        'avg_passage_precision': avg_passage_precision,
        'avg_minimal_f1': avg_minimal_f1,
        'avg_minimal_recall': avg_minimal_recall,
        'avg_minimal_precision': avg_minimal_precision
    }
    print(json.dumps(aggregate_metrics))
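
main() above groups the gold and prediction dictionaries by language with an explicit if/else per entry and then macro-averages the per-language scores. Below is a minimal sketch of the same bookkeeping using collections.defaultdict, assuming only that each gold entry is a list of labels carrying a .language attribute and each prediction carries a .language attribute; group_by_language and macro_average are hypothetical helpers for illustration, not part of the evaluation script.

import collections

def group_by_language(example_dict, get_language):
    """Groups {example_id: example} into {language: {example_id: example}}."""
    per_lang = collections.defaultdict(dict)
    for ex_id, ex in example_dict.items():
        per_lang[get_language(ex)][ex_id] = ex
    return dict(per_lang)

def macro_average(values):
    """Plain mean that returns 0.0 for an empty list (a 'safe' average)."""
    return sum(values) / len(values) if values else 0.0

# Hypothetical usage mirroring main():
#   per_lang_gold = group_by_language(tydi_gold_dict, lambda ex: ex[0].language)
#   per_lang_pred = group_by_language(tydi_pred_dict, lambda ex: ex.language)
#   avg_passage_f1 = macro_average(f1_list)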
Example #2
def score_minimal_answer(gold_label_list, pred_label,
                         minimal_non_null_threshold):
    """Scores a minimal answer.

  Outputs score against gold label that gives max F1.

  First decide if there is a gold minimal answer with
  FLAGS.minimal_non_null_threshold.
  If any of the gold label has "yes", or "no", and pred label predicted it
  correctly, than precision, recall, f1 is all 1.0.

  Args:
    gold_label_list: A list of TyDiLabel.
    pred_label: A single TyDiLabel.
    minimal_non_null_threshold: See FLAGS.minimal_non_null_threshold.

  Returns:
    gold_has_answer, pred_has_answer, (precision, recall, f1), score
  """

    # There is a gold minimal answer if gold_label_list is non-empty and the
    # number of non-null answers (summed over annotators) is over the threshold.
    gold_has_answer = eval_utils.gold_has_minimal_answer(
        gold_label_list, minimal_non_null_threshold)

    if pred_label is None:
        return gold_has_answer, not gold_has_answer, (0, 0, 0), 0

    # There is a predicted minimal answer if the predicted minimal label span
    # is non-null or we have a specific predicted label (such as yes/no).
    pred_has_answer = ((not pred_label.minimal_answer_span.is_null_span())
                       or pred_label.yes_no_answer != 'none')

    # score is optional.
    score = pred_label.minimal_score
    # We find the closest (highest scoring) match between the system's predicted
    # minimal answer and one of the three gold annotations.
    max_f1 = 0.0
    max_precision = 0.0
    max_recall = 0.0

    # Both sides have minimal answers; these may include yes/no answers.
    if gold_has_answer and pred_has_answer:
        if pred_label.yes_no_answer != 'none':  # System predicted a yes/no answer.
            for gold_label in gold_label_list:
                if pred_label.yes_no_answer == gold_label.yes_no_answer:
                    max_f1 = 1.0
                    max_precision = 1.0
                    max_recall = 1.0
                    break
        else:
            for gold_label in gold_label_list:
                if gold_label.minimal_answer_span.is_null_span():
                    continue
                # Compute the *micro-F1* (a partial match score for this example).
                # We also compute a language-wise *macro-F1* later.
                precision, recall, f1 = eval_utils.compute_partial_match_scores(
                    gold_label.minimal_answer_span,
                    pred_label.minimal_answer_span)
                if f1 > max_f1:
                    max_f1 = f1
                    max_precision = precision
                    max_recall = recall

    return (gold_has_answer, pred_has_answer, (max_precision, max_recall,
                                               max_f1), score)
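
The partial-match scoring above delegates to eval_utils.compute_partial_match_scores. Below is a minimal sketch of what such an overlap-based score typically looks like, assuming minimal answers are half-open [start, end) byte offsets; span_overlap_scores is an illustrative stand-in, not the library's implementation.

def span_overlap_scores(gold_start, gold_end, pred_start, pred_end):
    """Precision/recall/F1 from the overlap of two half-open [start, end) spans.

    Illustrative only: precision is the fraction of the predicted span covered
    by the gold span, recall is the fraction of the gold span covered by the
    prediction, and F1 is their harmonic mean.
    """
    overlap = max(0, min(gold_end, pred_end) - max(gold_start, pred_start))
    pred_len = pred_end - pred_start
    gold_len = gold_end - gold_start
    precision = overlap / pred_len if pred_len > 0 else 0.0
    recall = overlap / gold_len if gold_len > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall > 0 else 0.0)
    return precision, recall, f1

# Example: gold span [10, 30), predicted span [15, 40)
# overlap = 15 bytes -> precision = 0.6, recall = 0.75, f1 = 2/3
print(span_overlap_scores(10, 30, 15, 40))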