def summarize_type_scores():
    """
    Calculate the overall type scores from the accumulated statistics.
    :return: Three dictionaries mapping each mention type to its precision, recall and F1.
    """
    per_type_precision = {}
    per_type_recall = {}
    per_type_f1 = {}

    for mention_type, num_gold in EvalState.per_type_num_gold.iteritems():
        tp = utils.get_or_else(EvalState.per_type_tp, mention_type, 0)
        num_sys = utils.get_or_else(EvalState.per_type_num_response, mention_type, 0)
        prec = safe_div(tp, num_sys)
        recall = safe_div(tp, num_gold)
        f_score = safe_div(2 * prec * recall, prec + recall)
        per_type_precision[mention_type] = prec
        per_type_recall[mention_type] = recall
        per_type_f1[mention_type] = f_score

    return per_type_precision, per_type_recall, per_type_f1
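
# The divisions above rely on safe_div propagating NaN when a denominator is
# empty; print_eval_results below uses math.isnan() on these values to detect
# documents with no gold or no system mentions. safe_div and compute_f1 are
# defined elsewhere in the scorer; a minimal sketch of the assumed behavior
# (an assumption, not the actual implementation) would be:
#
# def safe_div(numerator, denominator):
#     # Avoid ZeroDivisionError; NaN marks an undefined score.
#     return numerator * 1.0 / denominator if denominator > 0 else float('nan')
#
# def compute_f1(prec, recall):
#     # Harmonic mean of precision and recall; NaN inputs propagate.
#     return safe_div(2 * prec * recall, prec + recall)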
def print_eval_results(mention_eval_out, all_attribute_combinations):
    total_gold_mentions = 0
    total_system_mentions = 0
    valid_docs = 0

    plain_global_scores = [0.0] * 4
    attribute_based_global_scores = [[0.0] * 4 for _ in xrange(len(all_attribute_combinations))]

    doc_id_width = get_cell_width(EvalState.doc_mention_scores)

    mention_eval_out.write("========Document Mention Detection Results==========\n")
    small_header_item = "Prec \tRec \tF1 "
    attribute_header_list = get_combined_attribute_header(all_attribute_combinations, len(small_header_item))
    small_headers = [small_header_item] * (len(all_attribute_combinations) + 1)
    mention_eval_out.write(pad_char_before_until("", doc_id_width) + "\t" + "\t|\t".join(attribute_header_list) + "\n")
    mention_eval_out.write(pad_char_before_until("Doc ID", doc_id_width) + "\t" + "\t|\t".join(small_headers) + "\n")

    for (tp, fp, attribute_based_counts, num_gold_mentions, num_sys_mentions, docId) in EvalState.doc_mention_scores:
        tp *= 100
        fp *= 100
        prec = safe_div(tp, num_sys_mentions)
        recall = safe_div(tp, num_gold_mentions)
        doc_f1 = compute_f1(prec, recall)

        attribute_based_doc_scores = []
        for comb_index, comb in enumerate(all_attribute_combinations):
            counts = attribute_based_counts[comb_index]
            attr_tp = counts[0] * 100
            attr_fp = counts[1] * 100
            attr_prec = safe_div(attr_tp, num_sys_mentions)
            attr_recall = safe_div(attr_tp, num_gold_mentions)
            attr_f1 = compute_f1(attr_prec, attr_recall)
            attribute_based_doc_scores.append("%.2f\t%.2f\t%.2f" % (attr_prec, attr_recall, attr_f1))

            for score_index, score in enumerate([attr_tp, attr_fp, attr_prec, attr_recall]):
                if not math.isnan(score):
                    attribute_based_global_scores[comb_index][score_index] += score

        mention_eval_out.write(
            "%s\t%.2f\t%.2f\t%.2f\t|\t%s\n" % (
                pad_char_before_until(docId, doc_id_width), prec, recall, doc_f1,
                "\t|\t".join(attribute_based_doc_scores)))

        # Compute the denominators:
        # 1. The number of valid documents excludes gold standard files that contain no mentions.
        # 2. Gold and system mention counts are accumulated for computing overall precision and recall.
        if math.isnan(recall):
            # The gold standard produced no mentions; do nothing.
            pass
        elif math.isnan(prec):
            # The system produced no mentions; still accumulate the denominators.
            logger.warning('System produced nothing for document [%s], assigning 0 scores' % docId)
            valid_docs += 1
            total_gold_mentions += num_gold_mentions
        else:
            valid_docs += 1
            total_gold_mentions += num_gold_mentions
            total_system_mentions += num_sys_mentions

            for score_index, score in enumerate([tp, fp, prec, recall]):
                plain_global_scores[score_index] += score

    if len(EvalState.doc_coref_scores) > 0:
        mention_eval_out.write("\n\n========Document Mention Coreference Results (CoNLL Average)==========\n")
        for coref_score, doc_id in EvalState.doc_coref_scores:
            mention_eval_out.write("%s\t%.2f\n" % (doc_id, coref_score))

    per_type_precision, per_type_recall, per_type_f1 = summarize_type_scores()

    mention_eval_out.write("\n\n========Mention Type Results==========\n")
    if len(per_type_f1) > 0:
        max_type_name_width = len(max(per_type_f1.keys(), key=len))
        mention_eval_out.write("%s\tPrec\tRec\tF1\t#Gold\t#Sys\n" % pad_char_before_until("Type", max_type_name_width))
        for mention_type, f1 in sorted(per_type_f1.items()):
            mention_eval_out.write("%s\t%.2f\t%.2f\t%.2f\t%d\t%d\n" % (
                pad_char_before_until(mention_type, max_type_name_width),
                utils.nan_as_zero(utils.get_or_else(per_type_precision, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(per_type_recall, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(per_type_f1, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(EvalState.per_type_num_gold, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(EvalState.per_type_num_response, mention_type, 0))))

    # Use the denominators above to calculate the averages.
    plain_average_scores = get_averages(plain_global_scores, total_gold_mentions, total_system_mentions, valid_docs)

    mention_eval_out.write("\n=======Final Mention Detection Results=========\n")
    max_attribute_name_width = len(max(attribute_header_list, key=len))
    attributes_name_header = pad_char_before_until("Attributes", max_attribute_name_width)

    final_result_big_header = ["Micro Average", "Macro Average"]

    mention_eval_out.write(
        pad_char_before_until("", max_attribute_name_width, " ") + "\t" + "\t".join(
            [pad_char_before_until(h, len(small_header_item)) for h in final_result_big_header]) + "\n")
    mention_eval_out.write(attributes_name_header + "\t" + "\t".join([small_header_item] * 2) + "\n")
    mention_eval_out.write(
        pad_char_before_until(attribute_header_list[0], max_attribute_name_width) + "\t" + "\t".join(
            "%.2f" % f for f in plain_average_scores) + "\n")

    for attr_index, attr_based_score in enumerate(attribute_based_global_scores):
        attr_average_scores = get_averages(attr_based_score, total_gold_mentions, total_system_mentions, valid_docs)
        mention_eval_out.write(
            pad_char_before_until(attribute_header_list[attr_index + 1], max_attribute_name_width) + "\t" + "\t".join(
                "%.2f" % f for f in attr_average_scores) + "\n")

    if len(EvalState.overall_coref_scores) > 0:
        mention_eval_out.write("\n=======Final Mention Coreference Results=========\n")
        conll_sum = 0.0
        num_metric = 0
        for metric, score in EvalState.overall_coref_scores.iteritems():
            formatter = "Metric : %s\tScore\t%.2f\n"
            if metric in Config.skipped_metrics:
                formatter = "Metric : %s\tScore\t%.2f *\n"
            else:
                conll_sum += score
                num_metric += 1
            mention_eval_out.write(formatter % (metric, score))

        mention_eval_out.write("Overall Average CoNLL score\t%.2f\n" % (conll_sum / num_metric))
        mention_eval_out.write("\n* Score not included for final CoNLL score.\n")
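
    # Note on the CoNLL average above: it is the arithmetic mean of the
    # coreference metrics that are not listed in Config.skipped_metrics.
    # For example, if MUC = 70.00, B-cubed = 60.00 and CEAF-E = 50.00 are all
    # included, the overall CoNLL score is (70.00 + 60.00 + 50.00) / 3 = 60.00.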
    if Config.script_result_dir is not None:
        mention_eval_out.write("\n")

        for eval_type in Config.script_types + ["All"]:
            for filename in os.listdir(os.path.join(Config.script_result_dir, eval_type)):
                script_eval_path = os.path.join(Config.script_result_dir, eval_type, filename)
                if os.path.isfile(script_eval_path):
                    if filename == Config.script_out:
                        with open(script_eval_path, 'r') as out:
                            mention_eval_out.write("=======Event Sequencing Results for %s =======\n" % eval_type)
                            for l in out:
                                mention_eval_out.write(l)

                    if Config.eval_cluster_level_links:
                        if filename == Config.script_out_cluster:
                            with open(script_eval_path, 'r') as out:
                                mention_eval_out.write(
                                    "=======Event Sequencing Results for %s (Cluster) =======\n" % eval_type)
                                for l in out:
                                    mention_eval_out.write(l)
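
    # The loop above assumes script results are laid out as
    # <script_result_dir>/<eval_type>/<script_out>, with one subdirectory per
    # script type plus an aggregate "All" directory; files other than
    # Config.script_out (and Config.script_out_cluster, when cluster-level
    # links are evaluated) are ignored.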
    if Config.temporal_result_dir is not None:
        mention_eval_out.write("\n")
        temporal_output = os.path.join(Config.temporal_result_dir, Config.temporal_out)
        with open(temporal_output, 'r') as f:
            for l in f:
                mention_eval_out.write(l)

    if mention_eval_out is not None:
        mention_eval_out.flush()
        if mention_eval_out is not sys.stdout:
            mention_eval_out.close()
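
# get_averages is defined elsewhere in the scorer. From its call sites above, it
# takes the accumulated [tp, fp, prec, recall] sums plus the gold/system mention
# totals and the valid document count, and returns six numbers: micro and macro
# precision, recall and F1. A hedged sketch of that contract (an assumption, not
# the actual implementation):
#
# def get_averages(scores, num_gold, num_sys, num_docs):
#     micro_prec = safe_div(scores[0], num_sys)      # pooled tp over all system mentions
#     micro_recall = safe_div(scores[0], num_gold)   # pooled tp over all gold mentions
#     macro_prec = safe_div(scores[2], num_docs)     # mean of per-document precisions
#     macro_recall = safe_div(scores[3], num_docs)   # mean of per-document recalls
#     return (micro_prec, micro_recall, compute_f1(micro_prec, micro_recall),
#             macro_prec, macro_recall, compute_f1(macro_prec, macro_recall))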