def main(_):
  """Scores predictions against gold annotations and prints final F1/P/R."""
  gold_annotations = util.read_annotation(
      FLAGS.gold_path, n_threads=FLAGS.num_threads)
  predictions = util.read_prediction_json(FLAGS.predictions_path)

  # Score with the user-supplied confidence thresholds for both tasks.
  long_answer_stats, short_answer_stats = score_answers(
      gold_annotations,
      predictions,
      score_thres_long=FLAGS.score_thres_long,
      score_thres_short=FLAGS.score_thres_short)

  # Reporting results.
  print('*' * 20)
  scores = compute_final_f1(long_answer_stats, short_answer_stats)
  print('*' * 20)
  print('SCORES (n={}):'.format(scores['long-answer-n']))
  print(' F1 / P / R')
  print('Long answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
      scores['long-answer-f1'], scores['long-answer-precision'],
      scores['long-answer-recall']))
  print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
      scores['short-answer-f1'], scores['short-answer-precision'],
      scores['short-answer-recall']))
  print('All answers {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
      scores['all-answer-f1'], scores['all-answer-precision'],
      scores['all-answer-recall']))
def get_metrics_as_dict(gold_path, prediction_path, num_threads=10):
  """Library version of the end-to-end evaluation.

  Arguments:
    gold_path: Path to the gzip JSON data. For multiple files, should be a
      glob pattern (e.g. "/path/to/files-*").
    prediction_path: Path to the JSON prediction data.
    num_threads (10): Number of threads to use when parsing multiple files.

  Returns:
    metrics: A dictionary mapping string names to metric scores.
  """
  gold = util.read_annotation(gold_path, n_threads=num_threads)
  preds = util.read_prediction_json(prediction_path)
  long_stats, short_stats = score_answers(gold, preds)
  return get_metrics_with_answer_stats(long_stats, short_stats)
def main(_):
  """Evaluates predictions against gold data, with optional gold caching.

  Reads the gold annotations (from a pickle cache when enabled and present,
  otherwise from FLAGS.gold_path), scores FLAGS.predictions_path against
  them, and either pretty-prints R@P tables and final scores or emits a
  JSON metrics dict.
  """
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), "cache")
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info("Reading from cache: %s", cache_path)
    # Pickle data is binary; "rb" (not "r") is required on Python 3.
    with open(cache_path, "rb") as cache_file:
      nq_gold_dict = pickle.load(cache_file)
  else:
    nq_gold_dict = util.read_annotation(
        FLAGS.gold_path, n_threads=FLAGS.num_threads)
    if FLAGS.cache_gold_data:
      logging.info("Caching gold data for next time to: %s", cache_path)
      with open(cache_path, "wb") as cache_file:
        pickle.dump(nq_gold_dict, cache_file)

  nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

  long_answer_stats, short_answer_stats = score_answers(
      nq_gold_dict, nq_pred_dict)

  if FLAGS.pretty_print:
    print("*" * 20)
    print("LONG ANSWER R@P TABLE:")
    print_r_at_p_table(long_answer_stats)
    print("*" * 20)
    print("SHORT ANSWER R@P TABLE:")
    print_r_at_p_table(short_answer_stats)

    scores = compute_final_f1(long_answer_stats, short_answer_stats)
    print("*" * 20)
    print("METRICS IGNORING SCORES (n={}):".format(scores["long-answer-n"]))
    print(" F1 / P / R")
    print("Long answer {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
        scores["long-answer-f1"],
        scores["long-answer-precision"],
        scores["long-answer-recall"],
    ))
    print("Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
        scores["short-answer-f1"],
        scores["short-answer-precision"],
        scores["short-answer-recall"],
    ))
  else:
    metrics = get_metrics_with_answer_stats(long_answer_stats,
                                            short_answer_stats)
    print(json.dumps(metrics))
def get_metrics_as_dict(gold_path, prediction_path):
  """Library version of the end-to-end evaluation.

  Arguments:
    gold_path: Path to the simplified JSONL data.
    prediction_path: Path to the JSON prediction data.

  Returns:
    metrics: A dictionary mapping string names to metric scores.
  """
  nq_gold_dict = util.read_simplified_annotation(gold_path)
  nq_pred_dict = util.read_prediction_json(prediction_path)

  long_answer_stats, short_answer_stats = score_answers(
      nq_gold_dict, nq_pred_dict)

  return get_metrics_with_answer_stats(long_answer_stats, short_answer_stats)
def main(_):
  """Evaluates predictions against gold data, with optional gold caching.

  Reads the gold annotations (from a pickle cache when enabled and present,
  otherwise from FLAGS.gold_path), scores FLAGS.predictions_path against
  them, and either pretty-prints R@P tables and final scores or emits a
  JSON metrics dict.
  """
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info('Reading from cache: %s', cache_path)
    # Pickle data is binary; 'rb' (not 'r') is required on Python 3.
    with open(cache_path, 'rb') as cache_file:
      nq_gold_dict = pickle.load(cache_file)
  else:
    nq_gold_dict = util.read_annotation(
        FLAGS.gold_path, n_threads=FLAGS.num_threads)
    if FLAGS.cache_gold_data:
      logging.info('Caching gold data for next time to: %s', cache_path)
      with open(cache_path, 'wb') as cache_file:
        pickle.dump(nq_gold_dict, cache_file)

  nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

  long_answer_stats, short_answer_stats = score_answers(
      nq_gold_dict, nq_pred_dict)

  if FLAGS.pretty_print:
    print('*' * 20)
    print('LONG ANSWER R@P TABLE:')
    print_r_at_p_table(long_answer_stats)
    print('*' * 20)
    print('SHORT ANSWER R@P TABLE:')
    print_r_at_p_table(short_answer_stats)

    scores = compute_final_f1(long_answer_stats, short_answer_stats)
    print('*' * 20)
    print('METRICS IGNORING SCORES (n={}):'.format(scores['long-answer-n']))
    print(' F1 / P / R')
    print('Long answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['long-answer-f1'], scores['long-answer-precision'],
        scores['long-answer-recall']))
    print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['short-answer-f1'], scores['short-answer-precision'],
        scores['short-answer-recall']))
  else:
    metrics = get_metrics_with_answer_stats(long_answer_stats,
                                            short_answer_stats)
    print(json.dumps(metrics))
def get_metrics_as_dict(gold_path, prediction_path):
  """Library version of the end-to-end evaluation.

  Arguments:
    gold_path: Path to a single JSONL data. Could be gzipped or not.
    prediction_path: Path to the JSON prediction data.

  Returns:
    metrics: A dictionary mapping string names to metric scores.
  """
  gold = eval_utils.read_annotation(gold_path)
  preds = eval_utils.read_prediction_json(prediction_path)
  passage_stats, minimal_stats = score_answers(gold, preds)
  return get_metrics_with_answer_stats(passage_stats, minimal_stats)
def main(_):
  """Evaluates predictions against gold data and prints the final F1/P/R.

  Reads the gold annotations (from a pickle cache when enabled and present,
  otherwise from FLAGS.gold_path), reads FLAGS.predictions_path, and prints
  the result of get_f1 over the pair.
  """
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info('Reading from cache: %s', cache_path)
    # Pickle data is binary; 'rb' (not 'r') is required on Python 3.
    with open(cache_path, 'rb') as cache_file:
      nq_gold_dict = pickle.load(cache_file)
  else:
    nq_gold_dict = util.read_annotation(
        FLAGS.gold_path, n_threads=FLAGS.num_threads)
    if FLAGS.cache_gold_data:
      logging.info('Caching gold data for next time to: %s', cache_path)
      with open(cache_path, 'wb') as cache_file:
        pickle.dump(nq_gold_dict, cache_file)

  nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

  ## input: nq_gold_dict, nq_pred_dict
  ## output: long, short score (with optional optimal threshold)
  print('final f1, final_p, final_r', get_f1(nq_gold_dict, nq_pred_dict))
def main(_):
  """Runs the per-language TyDi QA evaluation and prints aggregate scores.

  Reads the gold annotations (from a pickle cache when enabled and present,
  otherwise from FLAGS.gold_path) and the predictions, splits both by
  language, scores each language's passage and minimal-answer tasks, and
  finally prints macro averages over all non-English languages.
  """
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info('Reading from cache: %s', cache_path)
    # Pickle data is binary; 'rb' (not 'r') is required on Python 3.
    with open(cache_path, 'rb') as cache_file:
      tydi_gold_dict = pickle.load(cache_file)
  else:
    tydi_gold_dict = eval_utils.read_annotation(FLAGS.gold_path)
    if FLAGS.cache_gold_data:
      logging.info('Caching gold data for future to: %s', cache_path)
      with open(cache_path, 'wb') as cache_file:
        pickle.dump(tydi_gold_dict, cache_file)

  # Count gold examples that have a minimal answer (bools sum as 0/1).
  total_ans_count = 0
  for ans in tydi_gold_dict.values():
    gold_has_answer = eval_utils.gold_has_minimal_answer(
        ans, FLAGS.minimal_non_null_threshold)
    total_ans_count += gold_has_answer

  logging.info('%d has minimal answer', total_ans_count)
  logging.info('*' * 40)

  tydi_pred_dict = eval_utils.read_prediction_json(FLAGS.predictions_path)

  # Group gold and predicted examples by language.
  per_lang_gold = {}
  per_lang_pred = {}
  for ex_id, ex in tydi_gold_dict.items():
    per_lang_gold.setdefault(ex[0].language, {})[ex_id] = ex
  for ex_id, ex in tydi_pred_dict.items():
    per_lang_pred.setdefault(ex.language, {})[ex_id] = ex

  # (f1s, precisions, recalls) accumulated over non-English languages only.
  macro_avg_passage_scores = ([], [], [])
  macro_avg_minimal_scores = ([], [], [])
  language_list = [
      'english', 'arabic', 'bengali', 'finnish', 'indonesian', 'japanese',
      'swahili', 'korean', 'russian', 'telugu', 'thai'
  ]
  for lang in language_list:
    if lang in per_lang_pred:
      passage_answer_stats, minimal_answer_stats = score_answers(
          per_lang_gold[lang], per_lang_pred[lang])

      # Passage selection task.
      opt_result, _ = compute_pr_curves(passage_answer_stats, targets=[0.5])
      f1, precision, recall, _ = opt_result
      if lang != 'english':
        macro_avg_passage_scores[0].append(f1)
        macro_avg_passage_scores[1].append(precision)
        macro_avg_passage_scores[2].append(recall)
      print('Passage & ' + lang + ' & ' + get_latex_str(f1, precision, recall))

      # Minimal answer span task.
      opt_result, _ = compute_pr_curves(minimal_answer_stats, targets=[0.5])
      f1, precision, recall, _ = opt_result
      if lang != 'english':
        macro_avg_minimal_scores[0].append(f1)
        macro_avg_minimal_scores[1].append(precision)
        macro_avg_minimal_scores[2].append(recall)
      print('Minimal Answer & ' + lang + ' & ' +
            get_latex_str(f1, precision, recall))

      if FLAGS.pretty_print:
        print('*' * 20)
        print(lang)
        print('Language: %s (%d)' % (lang, len(per_lang_gold[lang])))
        print('*' * 20)
        print('PASSAGE ANSWER R@P TABLE:')
        print_r_at_p_table(passage_answer_stats)
        print('*' * 20)
        print('MINIMAL ANSWER R@P TABLE:')
        print_r_at_p_table(minimal_answer_stats)
      else:
        metrics = get_metrics_with_answer_stats(passage_answer_stats,
                                                minimal_answer_stats)
        print(json.dumps(metrics))

  print('Total # examples in gold: %d, # ex. in pred: %d (including english)' %
        (len(tydi_gold_dict), len(tydi_pred_dict)))

  f1_list, precision_list, recall_list = macro_avg_passage_scores
  print('*** Macro Over %d Languages, excluding English **' % len(f1_list))
  avg_passage_f1 = eval_utils.safe_average(f1_list)
  avg_passage_recall = eval_utils.safe_average(recall_list)
  avg_passage_precision = eval_utils.safe_average(precision_list)
  # Fixed '%3f' (width-3, full precision) to the intended '%.3f'.
  print('Passage F1:%.3f P:%.3f R:%.3f' %
        (avg_passage_f1, avg_passage_precision, avg_passage_recall))
  print(get_latex_str(avg_passage_f1, avg_passage_precision,
                      avg_passage_recall))

  f1_list, precision_list, recall_list = macro_avg_minimal_scores
  avg_minimal_f1 = eval_utils.safe_average(f1_list)
  avg_minimal_recall = eval_utils.safe_average(recall_list)
  avg_minimal_precision = eval_utils.safe_average(precision_list)
  print('Minimal F1:%.3f P:%.3f R:%.3f' %
        (avg_minimal_f1, avg_minimal_precision, avg_minimal_recall))
  print(get_latex_str(avg_minimal_f1, avg_minimal_precision,
                      avg_minimal_recall))

  print('*** / Aggregate Scores ****')
  aggregate_metrics = {
      'avg_passage_f1': avg_passage_f1,
      'avg_passage_recall': avg_passage_recall,
      'avg_passage_precision': avg_passage_precision,
      'avg_minimal_f1': avg_minimal_f1,
      'avg_minimal_recall': avg_minimal_recall,
      'avg_minimal_precision': avg_minimal_precision,
  }
  print(json.dumps(aggregate_metrics))