def main(_):
    nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                        n_threads=FLAGS.num_threads)

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict,
        nq_pred_dict,
        score_thres_long=FLAGS.score_thres_long,
        score_thres_short=FLAGS.score_thres_short)

    # reporting results
    print('*' * 20)

    scores = compute_final_f1(long_answer_stats, short_answer_stats)
    print('*' * 20)
    print('SCORES (n={}):'.format(scores['long-answer-n']))
    print('              F1     /  P      /  R')
    print('Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['long-answer-f1'], scores['long-answer-precision'],
        scores['long-answer-recall']))
    print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['short-answer-f1'], scores['short-answer-precision'],
        scores['short-answer-recall']))
    print('All answers  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['all-answer-f1'], scores['all-answer-precision'],
        scores['all-answer-recall']))
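Example #1 reads its inputs from absl-style flags. Below is a minimal sketch of the flag definitions it assumes elsewhere in the module; the flag names follow the FLAGS references above, while the defaults and help strings are only illustrative.

from absl import flags

# Hypothetical flag definitions assumed by Example #1; defaults are illustrative.
flags.DEFINE_string('gold_path', None,
                    'Path or glob to the gzipped gold JSON annotations.')
flags.DEFINE_string('predictions_path', None,
                    'Path to the JSON predictions file.')
flags.DEFINE_integer('num_threads', 10,
                     'Number of threads used to parse the gold files.')
flags.DEFINE_float('score_thres_long', 0.0,
                   'Score threshold applied to long-answer predictions.')
flags.DEFINE_float('score_thres_short', 0.0,
                   'Score threshold applied to short-answer predictions.')

FLAGS = flags.FLAGS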
Example #2
def get_metrics_as_dict(gold_path, prediction_path, num_threads=10):
    """Library version of the end-to-end evaluation.

    Arguments:
      gold_path: Path to the gzip JSON data. For multiple files, should be a
        glob pattern (e.g. "/path/to/files-*").
      prediction_path: Path to the JSON prediction data.
      num_threads (10): Number of threads to use when parsing multiple files.

    Returns:
      metrics: A dictionary mapping string names to metric scores.
    """

    nq_gold_dict = util.read_annotation(gold_path, n_threads=num_threads)
    nq_pred_dict = util.read_prediction_json(prediction_path)
    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict, nq_pred_dict)

    return get_metrics_with_answer_stats(long_answer_stats, short_answer_stats)
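A usage sketch for the library entry point above; the paths are placeholders rather than real files, and the thread count is arbitrary.

import json

# Illustrative call; both paths are placeholders.
metrics = get_metrics_as_dict(
    gold_path='/path/to/nq-dev-*.jsonl.gz',
    prediction_path='/path/to/predictions.json',
    num_threads=4)
print(json.dumps(metrics, indent=2))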
Example #3
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), "cache")
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info("Reading from cache: %s", format(cache_path))
        nq_gold_dict = pickle.load(open(cache_path, "r"))
    else:
        nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                            n_threads=FLAGS.num_threads)
        if FLAGS.cache_gold_data:
            logging.info("Caching gold data for next time to: %s",
                         format(cache_path))
            pickle.dump(nq_gold_dict, open(cache_path, "w"))

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict, nq_pred_dict)

    if FLAGS.pretty_print:
        print("*" * 20)
        print("LONG ANSWER R@P TABLE:")
        print_r_at_p_table(long_answer_stats)
        print("*" * 20)
        print("SHORT ANSWER R@P TABLE:")
        print_r_at_p_table(short_answer_stats)

        scores = compute_final_f1(long_answer_stats, short_answer_stats)
        print("*" * 20)
        print("METRICS IGNORING SCORES (n={}):".format(
            scores["long-answer-n"]))
        print("              F1     /  P      /  R")
        print("Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
            scores["long-answer-f1"],
            scores["long-answer-precision"],
            scores["long-answer-recall"],
        ))
        print("Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
            scores["short-answer-f1"],
            scores["short-answer-precision"],
            scores["short-answer-recall"],
        ))
    else:
        metrics = get_metrics_with_answer_stats(long_answer_stats,
                                                short_answer_stats)
        print(json.dumps(metrics))
Example #4
def get_metrics_as_dict(gold_path, prediction_path):
    """Library version of the end-to-end evaluation.

    Arguments:
      gold_path: Path to the simplified JSONL data.
      prediction_path: Path to the JSON prediction data.

    Returns:
      metrics: A dictionary mapping string names to metric scores.
    """

    nq_gold_dict = util.read_simplified_annotation(gold_path)
    nq_pred_dict = util.read_prediction_json(prediction_path)
    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict, nq_pred_dict)

    return get_metrics_with_answer_stats(long_answer_stats, short_answer_stats)
Example #5
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info('Reading from cache: %s', cache_path)
        nq_gold_dict = pickle.load(open(cache_path, 'rb'))
    else:
        nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                            n_threads=FLAGS.num_threads)
        if FLAGS.cache_gold_data:
            logging.info('Caching gold data for next time to: %s', cache_path)
            pickle.dump(nq_gold_dict, open(cache_path, 'wb'))

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    # print("nq_gold_dict", nq_gold_dict)
    # print("nq_pred_dict", nq_pred_dict)
    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict, nq_pred_dict)

    if FLAGS.pretty_print:
        print('*' * 20)
        print('LONG ANSWER R@P TABLE:')
        print_r_at_p_table(long_answer_stats)
        print('*' * 20)
        print('SHORT ANSWER R@P TABLE:')
        print_r_at_p_table(short_answer_stats)

        scores = compute_final_f1(long_answer_stats, short_answer_stats)
        print('*' * 20)
        print('METRICS IGNORING SCORES (n={}):'.format(
            scores['long-answer-n']))
        print('              F1     /  P      /  R')
        print('Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
            scores['long-answer-f1'], scores['long-answer-precision'],
            scores['long-answer-recall']))
        print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
            scores['short-answer-f1'], scores['short-answer-precision'],
            scores['short-answer-recall']))
    else:
        metrics = get_metrics_with_answer_stats(long_answer_stats,
                                                short_answer_stats)
        print(json.dumps(metrics))
Example #6
def get_metrics_as_dict(gold_path, prediction_path):
  """Library version of the end-to-end evaluation.

  Arguments:
    gold_path: Path to a single JSONL data. Could be gzipped or not.
    prediction_path: Path to the JSON prediction data.

  Returns:
    metrics: A dictionary mapping string names to metric scores.
  """

  tydi_gold_dict = eval_utils.read_annotation(gold_path)
  tydi_pred_dict = eval_utils.read_prediction_json(prediction_path)

  passage_answer_stats, minimal_answer_stats = score_answers(
      tydi_gold_dict, tydi_pred_dict)

  return get_metrics_with_answer_stats(
      passage_answer_stats, minimal_answer_stats)
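A similar usage sketch for the TyDi QA entry point above; again, the paths are placeholders.

import json

# Illustrative call; both paths are placeholders.
metrics = get_metrics_as_dict(
    gold_path='/path/to/tydiqa-dev.jsonl.gz',
    prediction_path='/path/to/tydiqa_predictions.json')
print(json.dumps(metrics, indent=2))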
Example #7
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info('Reading from cache: %s', cache_path)
        nq_gold_dict = pickle.load(open(cache_path, 'rb'))
    else:
        nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                            n_threads=FLAGS.num_threads)
        if FLAGS.cache_gold_data:
            logging.info('Caching gold data for next time to: %s', cache_path)
            pickle.dump(nq_gold_dict, open(cache_path, 'wb'))

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    ## input: nq_gold_dict, nq_pred_dict
    ## output: long, short score (with optional optimal threshold)

    print('final f1, final_p, final_r', get_f1(nq_gold_dict, nq_pred_dict))
Example #8
def main(_):
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info('Reading from cache: %s', cache_path)
    tydi_gold_dict = pickle.load(open(cache_path, 'rb'))
  else:
    tydi_gold_dict = eval_utils.read_annotation(FLAGS.gold_path)
    if FLAGS.cache_gold_data:
      logging.info('Caching gold data for future to: %s', cache_path)
      pickle.dump(tydi_gold_dict, open(cache_path, 'wb'))
  total_ans_count = 0
  count = 0

  for ans in tydi_gold_dict.values():
    count += 1
    gold_has_answer = eval_utils.gold_has_minimal_answer(
        ans, FLAGS.minimal_non_null_threshold)
    total_ans_count += gold_has_answer

  logging.info('%d of %d gold examples have a minimal answer',
               total_ans_count, count)
  logging.info('*' * 40)
  tydi_pred_dict = eval_utils.read_prediction_json(FLAGS.predictions_path)

  per_lang_gold = {}
  per_lang_pred = {}

  for ex_id, ex in tydi_gold_dict.items():
    if ex[0].language in per_lang_gold:
      per_lang_gold[ex[0].language][ex_id] = ex
    else:
      per_lang_gold[ex[0].language] = {ex_id: ex}
  for ex_id, ex in tydi_pred_dict.items():
    if ex.language in per_lang_pred:
      per_lang_pred[ex.language][ex_id] = ex
    else:
      per_lang_pred[ex.language] = {ex_id: ex}

  macro_avg_passage_scores = ([], [], [])
  macro_avg_minimal_scores = ([], [], [])

  language_list = [
      'english', 'arabic', 'bengali', 'finnish', 'indonesian', 'japanese',
      'swahili', 'korean', 'russian', 'telugu', 'thai'
  ]
  for lang in language_list:
    if lang in per_lang_pred:
      passage_answer_stats, minimal_answer_stats = score_answers(
          per_lang_gold[lang], per_lang_pred[lang])

      # Passage selection task
      opt_result, _ = compute_pr_curves(passage_answer_stats, targets=[0.5])
      f1, precision, recall, _ = opt_result
      if lang != 'english':
        macro_avg_passage_scores[0].append(f1)
        macro_avg_passage_scores[1].append(precision)
        macro_avg_passage_scores[2].append(recall)
      print('Passage & ' + lang + ' & ' + get_latex_str(f1, precision, recall))

      # Minimal answer span task
      opt_result, _ = compute_pr_curves(minimal_answer_stats, targets=[0.5])
      f1, precision, recall, _ = opt_result
      if lang != 'english':
        macro_avg_minimal_scores[0].append(f1)
        macro_avg_minimal_scores[1].append(precision)
        macro_avg_minimal_scores[2].append(recall)
      print('Minimal Answer & ' + lang + ' & ' +
            get_latex_str(f1, precision, recall))

      if FLAGS.pretty_print:
        print('*' * 20)
        print(lang)
        print('Language: %s (%d)' % (lang, len(per_lang_gold[lang])))
        print('*' * 20)
        print('PASSAGE ANSWER R@P TABLE:')
        print_r_at_p_table(passage_answer_stats)
        print('*' * 20)
        print('MINIMAL ANSWER R@P TABLE:')
        print_r_at_p_table(minimal_answer_stats)
      else:
        metrics = get_metrics_with_answer_stats(passage_answer_stats,
                                                minimal_answer_stats)
        print(json.dumps(metrics))

  print('Total # examples in gold: %d, # ex. in pred: %d (including english)' %
        (len(tydi_gold_dict), len(tydi_pred_dict)))

  f1_list, precision_list, recall_list = macro_avg_passage_scores
  print('*** Macro Average Over %d Languages (excluding English) ***' %
        len(f1_list))
  avg_passage_f1 = eval_utils.safe_average(f1_list)
  avg_passage_recall = eval_utils.safe_average(recall_list)
  avg_passage_precision = eval_utils.safe_average(precision_list)
  print('Passage F1:%.3f P:%.3f R:%.3f' %
        (avg_passage_f1, avg_passage_precision, avg_passage_recall))
  print(get_latex_str(
      avg_passage_f1, avg_passage_precision, avg_passage_recall))

  f1_list, precision_list, recall_list = macro_avg_minimal_scores

  avg_minimal_f1 = eval_utils.safe_average(f1_list)
  avg_minimal_recall = eval_utils.safe_average(recall_list)
  avg_minimal_precision = eval_utils.safe_average(precision_list)
  print('Minimal F1:%.3f P:%.3f R:%.3f' %
        (avg_minimal_f1, avg_minimal_precision, avg_minimal_recall))
  print(get_latex_str(
      avg_minimal_f1, avg_minimal_precision, avg_minimal_recall))
  print('*** / Aggregate Scores ***')

  aggregate_metrics = {'avg_passage_f1': avg_passage_f1,
                       'avg_passage_recall': avg_passage_recall,
                       'avg_passage_precision': avg_passage_precision,
                       'avg_minimal_f1': avg_minimal_f1,
                       'avg_minimal_recall': avg_minimal_recall,
                       'avg_minimal_precision': avg_minimal_precision}
  print(json.dumps(aggregate_metrics))
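The main(_) functions above are written against absl-style FLAGS and logging. A hedged sketch of the entry-point boilerplate they assume, with the corresponding flags.DEFINE_* calls expected to live earlier in the module:

if __name__ == '__main__':
  from absl import app
  from absl import flags

  # Assumes 'gold_path' and 'predictions_path' flags are defined above.
  flags.mark_flag_as_required('gold_path')
  flags.mark_flag_as_required('predictions_path')
  app.run(main)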