Example #1
def summarize_type_scores():
    """
    Calculate the overall type scores from the accumulated statistics.
    :return: Three dictionaries keyed by mention type: per-type precision, recall and F1.
    """
    per_type_precision = {}
    per_type_recall = {}
    per_type_f1 = {}

    for mention_type, num_gold in EvalState.per_type_num_gold.iteritems():
        tp = utils.get_or_else(EvalState.per_type_tp, mention_type, 0)
        num_sys = utils.get_or_else(EvalState.per_type_num_response, mention_type, 0)
        prec = safe_div(tp, num_sys)
        recall = safe_div(tp, num_gold)
        f_score = safe_div(2 * prec * recall, prec + recall)
        per_type_precision[mention_type] = prec
        per_type_recall[mention_type] = recall
        per_type_f1[mention_type] = f_score

    return per_type_precision, per_type_recall, per_type_f1
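Note: safe_div and utils.get_or_else are helpers from the surrounding scoring module and are not shown in these examples. The sketch below gives minimal, plausible implementations, assuming (as the later math.isnan checks and utils.nan_as_zero calls suggest) that division by zero yields NaN rather than raising; the originals may differ in detail. The examples themselves use Python 2 idioms (iteritems, xrange), while this sketch runs under either Python 2 or 3.

def safe_div(numerator, denominator):
    # Assumed behaviour: return NaN when the denominator is zero, so callers
    # can tell an undefined score apart from a genuine 0.0 via math.isnan
    # (hypothetical sketch, not necessarily the project's actual helper).
    if denominator == 0:
        return float('nan')
    return float(numerator) / denominator


def get_or_else(dictionary, key, default):
    # Dictionary lookup with a fallback value, equivalent to dict.get
    # (hypothetical sketch of utils.get_or_else).
    return dictionary.get(key, default)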
Example #2
def print_eval_results(mention_eval_out, all_attribute_combinations):
    total_gold_mentions = 0
    total_system_mentions = 0
    valid_docs = 0

    plain_global_scores = [0.0] * 4
    attribute_based_global_scores = [
        [0.0] * 4 for _ in xrange(len(all_attribute_combinations))
    ]

    doc_id_width = get_cell_width(EvalState.doc_mention_scores)

    mention_eval_out.write(
        "========Document Mention Detection Results==========\n")
    small_header_item = "Prec  \tRec  \tF1   "
    attribute_header_list = get_combined_attribute_header(
        all_attribute_combinations, len(small_header_item))
    small_headers = [small_header_item] * (len(all_attribute_combinations) + 1)
    mention_eval_out.write(
        pad_char_before_until("", doc_id_width) + "\t" +
        "\t|\t".join(attribute_header_list) + "\n")
    mention_eval_out.write(
        pad_char_before_until("Doc ID", doc_id_width) + "\t" +
        "\t|\t".join(small_headers) + "\n")

    for (tp, fp, attribute_based_counts, num_gold_mentions, num_sys_mentions,
         docId) in EvalState.doc_mention_scores:
        tp *= 100
        fp *= 100
        prec = safe_div(tp, num_sys_mentions)
        recall = safe_div(tp, num_gold_mentions)
        doc_f1 = compute_f1(prec, recall)

        attribute_based_doc_scores = []

        for comb_index, comb in enumerate(all_attribute_combinations):
            counts = attribute_based_counts[comb_index]
            attr_tp = counts[0] * 100
            attr_fp = counts[1] * 100
            attr_prec = safe_div(attr_tp, num_sys_mentions)
            attr_recall = safe_div(attr_tp, num_gold_mentions)
            attr_f1 = compute_f1(attr_prec, attr_recall)

            attribute_based_doc_scores.append(
                "%.2f\t%.2f\t%.2f" % (attr_prec, attr_recall, attr_f1))

            for score_index, score in enumerate(
                [attr_tp, attr_fp, attr_prec, attr_recall]):
                if not math.isnan(score):
                    attribute_based_global_scores[comb_index][
                        score_index] += score

        mention_eval_out.write(
            "%s\t%.2f\t%.2f\t%.2f\t|\t%s\n" %
            (pad_char_before_until(docId, doc_id_width), prec, recall, doc_f1,
             "\t|\t".join(attribute_based_doc_scores)))

        # Compute the denominators:
        # 1. The number of valid docs does not include gold standard files that contain no mentions.
        # 2. Gold and system mention counts are accumulated and used to compute precision and recall.
        if math.isnan(recall):
            # gold produces no mentions, do nothing
            pass
        elif math.isnan(prec):
            # the system produces no mentions; accumulate the denominator
            logger.warning(
                'System produced nothing for document [%s], assigning 0 scores'
                % docId)
            valid_docs += 1
            total_gold_mentions += num_gold_mentions
        else:
            valid_docs += 1
            total_gold_mentions += num_gold_mentions
            total_system_mentions += num_sys_mentions

            for score_index, score in enumerate([tp, fp, prec, recall]):
                plain_global_scores[score_index] += score

    if len(EvalState.doc_coref_scores) > 0:
        mention_eval_out.write(
            "\n\n========Document Mention Corefrence Results (CoNLL Average)==========\n"
        )
        for coref_score, doc_id in EvalState.doc_coref_scores:
            mention_eval_out.write("%s\t%.2f\n" % (doc_id, coref_score))

    per_type_precision, per_type_recall, per_type_f1 = summarize_type_scores()

    mention_eval_out.write("\n\n========Mention Type Results==========\n")
    if len(per_type_f1) > 0:
        max_type_name_width = len(max(per_type_f1.keys(), key=len))
        mention_eval_out.write(
            "%s\tPrec\tRec\tF1\t#Gold\t#Sys\n" %
            pad_char_before_until("Type", max_type_name_width))
        for mention_type, f1 in sorted(per_type_f1.items()):
            mention_eval_out.write(
                "%s\t%.2f\t%.2f\t%.2f\t%d\t%d\n" %
                (pad_char_before_until(mention_type, max_type_name_width),
                 utils.nan_as_zero(
                     utils.get_or_else(per_type_precision, mention_type, 0)),
                 utils.nan_as_zero(
                     utils.get_or_else(per_type_recall, mention_type, 0)),
                 utils.nan_as_zero(
                     utils.get_or_else(per_type_f1, mention_type, 0)),
                 utils.nan_as_zero(
                     utils.get_or_else(EvalState.per_type_num_gold,
                                       mention_type, 0)),
                 utils.nan_as_zero(
                     utils.get_or_else(EvalState.per_type_num_response,
                                       mention_type, 0))))

    # Use the denominators above to calculate the averages.
    plain_average_scores = get_averages(plain_global_scores,
                                        total_gold_mentions,
                                        total_system_mentions, valid_docs)

    mention_eval_out.write(
        "\n=======Final Mention Detection Results=========\n")
    max_attribute_name_width = len(max(attribute_header_list, key=len))
    attributes_name_header = pad_char_before_until("Attributes",
                                                   max_attribute_name_width)

    final_result_big_header = ["Micro Average", "Macro Average"]

    mention_eval_out.write(
        pad_char_before_until("", max_attribute_name_width, " ") + "\t" +
        "\t".join([
            pad_char_before_until(h, len(small_header_item))
            for h in final_result_big_header
        ]) + "\n")
    mention_eval_out.write(attributes_name_header + "\t" +
                           "\t".join([small_header_item] * 2) + "\n")
    mention_eval_out.write(
        pad_char_before_until(attribute_header_list[0],
                              max_attribute_name_width) + "\t" +
        "\t".join("%.2f" % f for f in plain_average_scores) + "\n")
    for attr_index, attr_based_score in enumerate(
            attribute_based_global_scores):
        attr_average_scores = get_averages(attr_based_score,
                                           total_gold_mentions,
                                           total_system_mentions, valid_docs)
        mention_eval_out.write(
            pad_char_before_until(attribute_header_list[attr_index + 1],
                                  max_attribute_name_width) + "\t" +
            "\t".join("%.2f" % f for f in attr_average_scores) + "\n")

    if len(EvalState.overall_coref_scores) > 0:
        mention_eval_out.write(
            "\n=======Final Mention Coreference Results=========\n")
        conll_sum = 0.0
        num_metric = 0
        for metric, score in EvalState.overall_coref_scores.iteritems():
            formatter = "Metric : %s\tScore\t%.2f\n"
            if metric in Config.skipped_metrics:
                formatter = "Metric : %s\tScore\t%.2f *\n"
            else:
                conll_sum += score
                num_metric += 1
            mention_eval_out.write(formatter % (metric, score))
        mention_eval_out.write("Overall Average CoNLL score\t%.2f\n" %
                               (conll_sum / num_metric))
        mention_eval_out.write(
            "\n* Score not included for final CoNLL score.\n")

    if Config.script_result_dir is not None:
        mention_eval_out.write("\n")

        for eval_type in Config.script_types + ["All"]:
            for filename in os.listdir(
                    os.path.join(Config.script_result_dir, eval_type)):
                script_eval_path = os.path.join(Config.script_result_dir,
                                                eval_type, filename)
                if os.path.isfile(script_eval_path):
                    if filename == Config.script_out:
                        with open(script_eval_path, 'r') as out:
                            mention_eval_out.write(
                                "=======Event Sequencing Results for %s =======\n"
                                % eval_type)
                            for l in out:
                                mention_eval_out.write(l)

                    if Config.eval_cluster_level_links:
                        if filename == Config.script_out_cluster:
                            with open(script_eval_path, 'r') as out:
                                mention_eval_out.write(
                                    "=======Event Sequencing Results for %s (Cluster) =======\n"
                                    % eval_type)
                                for l in out:
                                    mention_eval_out.write(l)

    if mention_eval_out is not None:
        mention_eval_out.flush()
    if mention_eval_out is not sys.stdout:
        mention_eval_out.close()
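Note: get_averages is also not shown. The final table prints a Micro Average and a Macro Average block, each as Prec/Rec/F1, and plain_global_scores accumulates [tp, fp, prec, recall] over valid documents, so a plausible sketch follows; the exact return layout of the real function is an assumption, and the F1 computation reuses the safe_div sketch above.

def get_averages(global_scores, total_gold, total_system, valid_docs):
    # global_scores holds [tp, fp, prec, recall] summed over valid documents;
    # tp and fp were already scaled by 100 in the calling loop.
    total_tp, total_fp, prec_sum, recall_sum = global_scores

    # Micro average: pool true positives across documents, then divide by the
    # pooled system / gold mention counts.
    micro_prec = safe_div(total_tp, total_system)
    micro_recall = safe_div(total_tp, total_gold)
    micro_f1 = safe_div(2 * micro_prec * micro_recall, micro_prec + micro_recall)

    # Macro average: average the per-document precision / recall values and
    # derive an F1 from those averages.
    macro_prec = safe_div(prec_sum, valid_docs)
    macro_recall = safe_div(recall_sum, valid_docs)
    macro_f1 = safe_div(2 * macro_prec * macro_recall, macro_prec + macro_recall)

    return (micro_prec, micro_recall, micro_f1,
            macro_prec, macro_recall, macro_f1)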
Example #3
def print_eval_results(mention_eval_out, all_attribute_combinations):
    total_gold_mentions = 0
    total_system_mentions = 0
    valid_docs = 0

    plain_global_scores = [0.0] * 4
    attribute_based_global_scores = [[0.0] * 4 for _ in xrange(len(all_attribute_combinations))]

    doc_id_width = get_cell_width(EvalState.doc_mention_scores)

    mention_eval_out.write("========Document Mention Detection Results==========\n")
    small_header_item = "Prec  \tRec  \tF1   "
    attribute_header_list = get_combined_attribute_header(all_attribute_combinations, len(small_header_item))
    small_headers = [small_header_item] * (len(all_attribute_combinations) + 1)
    mention_eval_out.write(pad_char_before_until("", doc_id_width) + "\t" + "\t|\t".join(attribute_header_list) + "\n")
    mention_eval_out.write(pad_char_before_until("Doc ID", doc_id_width) + "\t" + "\t|\t".join(small_headers) + "\n")

    for (tp, fp, attribute_based_counts, num_gold_mentions, num_sys_mentions, docId) in EvalState.doc_mention_scores:
        tp *= 100
        fp *= 100
        prec = safe_div(tp, num_sys_mentions)
        recall = safe_div(tp, num_gold_mentions)
        doc_f1 = compute_f1(prec, recall)

        attribute_based_doc_scores = []

        for comb_index, comb in enumerate(all_attribute_combinations):
            counts = attribute_based_counts[comb_index]
            attr_tp = counts[0] * 100
            attr_fp = counts[1] * 100
            attr_prec = safe_div(attr_tp, num_sys_mentions)
            attr_recall = safe_div(attr_tp, num_gold_mentions)
            attr_f1 = compute_f1(attr_prec, attr_recall)

            attribute_based_doc_scores.append("%.2f\t%.2f\t%.2f" % (attr_prec, attr_recall, attr_f1))

            for score_index, score in enumerate([attr_tp, attr_fp, attr_prec, attr_recall]):
                if not math.isnan(score):
                    attribute_based_global_scores[comb_index][score_index] += score

        mention_eval_out.write(
            "%s\t%.2f\t%.2f\t%.2f\t|\t%s\n" % (
                pad_char_before_until(docId, doc_id_width), prec, recall, doc_f1,
                "\t|\t".join(attribute_based_doc_scores)))

        # Compute the denominators:
        # 1. The number of valid docs does not include gold standard files that contain no mentions.
        # 2. Gold and system mention counts are accumulated and used to compute precision and recall.
        if math.isnan(recall):
            # gold produces no mentions, do nothing
            pass
        elif math.isnan(prec):
            # the system produces no mentions; accumulate the denominator
            logger.warning('System produced nothing for document [%s], assigning 0 scores' % docId)
            valid_docs += 1
            total_gold_mentions += num_gold_mentions
        else:
            valid_docs += 1
            total_gold_mentions += num_gold_mentions
            total_system_mentions += num_sys_mentions

            for score_index, score in enumerate([tp, fp, prec, recall]):
                plain_global_scores[score_index] += score

    if len(EvalState.doc_coref_scores) > 0:
        mention_eval_out.write("\n\n========Document Mention Coreference Results (CoNLL Average)==========\n")
        for coref_score, doc_id in EvalState.doc_coref_scores:
            mention_eval_out.write("%s\t%.2f\n" % (doc_id, coref_score))

    per_type_precision, per_type_recall, per_type_f1 = summarize_type_scores()

    mention_eval_out.write("\n\n========Mention Type Results==========\n")
    if len(per_type_f1) > 0:
        max_type_name_width = len(max(per_type_f1.keys(), key=len))
        mention_eval_out.write("%s\tPrec\tRec\tF1\t#Gold\t#Sys\n" % pad_char_before_until("Type", max_type_name_width))
        for mention_type, f1 in sorted(per_type_f1.items()):
            mention_eval_out.write("%s\t%.2f\t%.2f\t%.2f\t%d\t%d\n" % (
                pad_char_before_until(mention_type, max_type_name_width),
                utils.nan_as_zero(utils.get_or_else(per_type_precision, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(per_type_recall, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(per_type_f1, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(EvalState.per_type_num_gold, mention_type, 0)),
                utils.nan_as_zero(utils.get_or_else(EvalState.per_type_num_response, mention_type, 0))
            ))

    # Use the denominators above to calculate the averages.
    plain_average_scores = get_averages(plain_global_scores, total_gold_mentions, total_system_mentions, valid_docs)

    mention_eval_out.write("\n=======Final Mention Detection Results=========\n")
    max_attribute_name_width = len(max(attribute_header_list, key=len))
    attributes_name_header = pad_char_before_until("Attributes", max_attribute_name_width)

    final_result_big_header = ["Micro Average", "Macro Average"]

    mention_eval_out.write(
        pad_char_before_until("", max_attribute_name_width, " ") + "\t" + "\t".join(
            [pad_char_before_until(h, len(small_header_item)) for h in final_result_big_header]) + "\n")
    mention_eval_out.write(attributes_name_header + "\t" + "\t".join([small_header_item] * 2) + "\n")
    mention_eval_out.write(pad_char_before_until(attribute_header_list[0], max_attribute_name_width) + "\t" + "\t".join(
        "%.2f" % f for f in plain_average_scores) + "\n")
    for attr_index, attr_based_score in enumerate(attribute_based_global_scores):
        attr_average_scores = get_averages(attr_based_score, total_gold_mentions, total_system_mentions, valid_docs)
        mention_eval_out.write(
            pad_char_before_until(attribute_header_list[attr_index + 1],
                                  max_attribute_name_width) + "\t" + "\t".join(
                "%.2f" % f for f in attr_average_scores) + "\n")

    if len(EvalState.overall_coref_scores) > 0:
        mention_eval_out.write("\n=======Final Mention Coreference Results=========\n")
        conll_sum = 0.0
        num_metric = 0
        for metric, score in EvalState.overall_coref_scores.iteritems():
            formatter = "Metric : %s\tScore\t%.2f\n"
            if metric in Config.skipped_metrics:
                formatter = "Metric : %s\tScore\t%.2f *\n"
            else:
                conll_sum += score
                num_metric += 1
            mention_eval_out.write(formatter % (metric, score))
        mention_eval_out.write(
            "Overall Average CoNLL score\t%.2f\n" % (conll_sum / num_metric))
        mention_eval_out.write("\n* Score not included for final CoNLL score.\n")

    if Config.temporal_result_dir is not None:
        mention_eval_out.write("\n")
        temporal_output = os.path.join(Config.temporal_result_dir, Config.temporal_out)
        with open(temporal_output, 'r') as f:
            for l in f:
                mention_eval_out.write(l)

    if mention_eval_out is not None:
        mention_eval_out.flush()
    if mention_eval_out is not sys.stdout:
        mention_eval_out.close()
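Note: compute_f1, pad_char_before_until and utils.nan_as_zero are the remaining helpers these examples rely on. Example #1 already computes F1 as safe_div(2 * prec * recall, prec + recall), so compute_f1 presumably does the same; the padding helper's semantics are an assumption based on its name and call sites (left-padding a cell to a fixed column width).

import math


def compute_f1(prec, recall):
    # Harmonic mean of precision and recall; NaN inputs simply propagate,
    # which keeps the isnan checks in print_eval_results meaningful.
    return safe_div(2 * prec * recall, prec + recall)


def pad_char_before_until(text, width, pad_char=" "):
    # Left-pad the cell with pad_char until it is at least width characters
    # wide, so table columns line up under their headers (assumed semantics).
    return str(text).rjust(width, pad_char)


def nan_as_zero(value):
    # Print NaN scores as 0 while leaving real scores untouched
    # (hypothetical sketch of utils.nan_as_zero).
    return 0 if math.isnan(value) else value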