Example #1
def main():
    parser = argparse.ArgumentParser(
        description=
        "Event mention scorer, provides support to Event Nugget scoring, Event Coreference and Event "
        "Sequencing scoring.")
    parser.add_argument("-g", "--gold", help="Golden Standard", required=True)
    parser.add_argument("-s", "--system", help="System output", required=True)
    parser.add_argument("-d",
                        "--comparison_output",
                        help="Compare and help show the difference between "
                        "system and gold")
    parser.add_argument(
        "-o",
        "--output",
        help="Optional evaluation result redirects, put eval result to file")
    parser.add_argument(
        "-c",
        "--coref",
        help="Eval Coreference result output, need to put the reference"
        "conll coref scorer in the same folder with this scorer")
    parser.add_argument(
        "-a",
        "--sequencing",
        help="Eval Event sequencing result output (After and Subevent)")
    parser.add_argument("-nv",
                        "--no_script_validation",
                        help="Whether to turn off script validation",
                        action="store_true")
    parser.add_argument(
        "-t",
        "--token_path",
        help=
        "Path to the directory containing the token mappings file, only used in token mode."
    )
    parser.add_argument(
        "-m",
        "--coref_mapping",
        help="Which mapping will be used to perform coreference mapping.",
        type=int)
    parser.add_argument(
        "-of",
        "--offset_field",
        help="A pair of integer indicates which column we should "
        "read the offset in the token mapping file, index starts"
        "at 0, default value will be %s" % Config.default_token_offset_fields)
    parser.add_argument(
        "-te",
        "--token_table_extension",
        help=
        "any extension appended after docid of token table files. Default is [%s], only used in token mode."
        % Config.default_token_file_ext)
    parser.add_argument("-ct",
                        "--coreference_threshold",
                        type=float,
                        help="Threshold for coreference mention mapping")
    parser.add_argument("-b",
                        "--debug",
                        help="turn debug mode on",
                        action="store_true")

    # parser.add_argument("--eval_mode", choices=["char", "token"], default="char",
    #                     help="Use Span or Token mode. The Span mode will take a span as range [start:end], while the "
    #                          "Token mode consider each token is provided as a single id.")

    parser.add_argument(
        "-wl",
        "--type_white_list",
        type=argparse.FileType('r'),
        help=
        "Provide a file, where each line list a mention type subtype pair to be evaluated. Types "
        "that are out of this white list will be ignored.")

    parser.add_argument("-dn",
                        "--doc_id_to_eval",
                        help="Provide one single doc id to evaluate.")

    parser.set_defaults(debug=False)
    args = parser.parse_args()

    if args.debug:
        stream_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.debug("Entered debug mode.")
    else:
        stream_handler.setLevel(logging.INFO)
        logger.setLevel(logging.INFO)

    if args.type_white_list is not None:
        logger.info(
            "Only the following types in the white list will be evaluated.")
        EvalState.white_listed_types = set()
        for line in args.type_white_list:
            logger.info(line.strip())
            EvalState.white_listed_types.add(canonicalize_string(line))

    if args.output is not None:
        out_path = args.output
        utils.create_parent_dir(out_path)
        mention_eval_out = open(out_path, 'w')
        logger.info("Evaluation output will be saved at %s" % out_path)
    else:
        mention_eval_out = sys.stdout
        logger.info("Evaluation output at standard out.")

    if os.path.isfile(args.gold):
        gf = open(args.gold)
    else:
        logger.error("Cannot find gold standard file at " + args.gold)
        sys.exit(1)

    if args.coref is not None:
        Config.conll_out = args.coref
        Config.conll_gold_file = args.coref + "_gold.conll"
        Config.conll_sys_file = args.coref + "_sys.conll"

        logger.info("CoNLL script output will be output at " +
                    Config.conll_out)

        logger.info("Gold and system conll files will generated at " +
                    Config.conll_gold_file + " and " + Config.conll_sys_file)

    if args.sequencing is not None:
        Config.script_result_dir = args.sequencing

        logger.info("Temporal files will be output at " +
                    Config.script_result_dir)
        utils.supermakedirs(Config.script_result_dir)

        logger.info("Will evaluate link type: %s." %
                    ",".join(Config.script_types))
        for t in Config.script_types:
            utils.supermakedirs(os.path.join(Config.script_result_dir, t))

        utils.remove_file_by_extension(Config.script_result_dir, ".tml")

        if args.no_script_validation:
            Config.no_script_validation = True

    if os.path.isfile(args.system):
        sf = open(args.system)
    else:
        logger.error("Cannot find system file at " + args.system)
        sys.exit(1)

    if args.coref_mapping is not None:
        if args.coref_mapping < 4:
            Config.coref_criteria = Config.possible_coref_mapping[
                args.coref_mapping]
        else:
            logger.error(
                "Possible mapping : 0: Span only 1: Mention Type 2: Realis 3 Type and Realis"
            )
            utils.terminate_with_error("Must provide a mapping between 0 to 3")
    else:
        Config.coref_criteria = Config.possible_coref_mapping[1]

    diff_out = None
    if args.comparison_output is not None:
        diff_out_path = args.comparison_output
        utils.create_parent_dir(diff_out_path)
        diff_out = open(diff_out_path, 'w')

    token_dir = "."
    if args.token_path is not None:
        MutableConfig.eval_mode = EvalMethod.Token
        logger.info("Eval mode is set to token.")
        if os.path.isdir(args.token_path):
            logger.debug("Will search token files in " + args.token_path)
            token_dir = args.token_path
        else:
            logger.debug("Cannot find given token directory at [%s], "
                         "will try search for current directory" %
                         args.token_path)
    else:
        MutableConfig.eval_mode = EvalMethod.Char

    token_offset_fields = Config.default_token_offset_fields
    if args.offset_field is not None:
        try:
            token_offset_fields = [
                int(x) for x in args.offset_field.split(",")
            ]
        except ValueError as _:
            logger.error(
                "Token offset argument should be two integer with comma in between, i.e. 2,3"
            )

    if args.coreference_threshold is not None:
        MutableConfig.coref_mention_threshold = args.coreference_threshold

    # Read all documents.
    read_all_doc(gf, sf, args.doc_id_to_eval)

    # Take all attribute combinations, which will be used to produce scores.
    attribute_comb = get_attr_combinations(Config.attribute_names)

    logger.info("Coreference mentions need to match %s before consideration" %
                Config.coref_criteria[0][1])

    while True:
        print('dir is:', token_dir)
        if not evaluate(token_dir, args.coref, attribute_comb,
                        token_offset_fields, args.token_table_extension,
                        diff_out):
            break

    # Run the CoNLL script on the combined files, which is concatenated from the best alignment of all documents.
    if args.coref is not None:
        logger.debug("Running coreference script for the final scores.")
        ConllEvaluator.run_conll_script(Config.conll_gold_file,
                                        Config.conll_sys_file,
                                        Config.conll_out)
        # Get the CoNLL scores from output
        EvalState.overall_coref_scores = ConllEvaluator.get_conll_scores(
            Config.conll_out)

    # Run the TimeML evaluation script.
    if Config.script_result_dir:
        TemporalEval.eval_time_ml()

    print_eval_results(mention_eval_out, attribute_comb)

    # Clean up, close files.
    close_if_not_none(diff_out)

    logger.info("Evaluation Done.")
    return 0
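
A minimal sketch of how this entry point might be invoked; the module name scorer.py and the input file names are assumptions, while the flags mirror the argparse definitions above.

# Hypothetical invocation; gold.tbf, system.tbf and eval_result.txt are placeholder paths.
import sys

sys.argv = ["scorer.py",
            "-g", "gold.tbf",          # gold standard file (required)
            "-s", "system.tbf",        # system output file (required)
            "-o", "eval_result.txt",   # write the evaluation report to a file
            "-c", "coref_out"]         # also score coreference through the CoNLL scorer
main()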
Example #2
def evaluate(token_dir, coref_out, all_attribute_combinations,
             token_offset_fields, token_file_ext, diff_out):
    """
    Conduct the main evaluation steps.
    :param token_dir:
    :param coref_out:
    :param all_attribute_combinations:
    :param token_offset_fields:
    :param token_file_ext:
    :param diff_out:
    :return:
    """
    if EvalState.has_next_doc():
        res, (g_mention_lines, g_relation_lines), (
            s_mention_lines,
            s_relation_lines), doc_id, system_id = get_next_doc()
    else:
        return False

    logger.info("Evaluating Document %s" % doc_id)

    if len(g_mention_lines) == 0:
        logger.warning(
            "[%s] does not contain gold standard mentions. Document level F score will not be valid, but the micro "
            "score will be fine." % doc_id)

    invisible_ids = []
    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token, id2span = read_token_ids(
            token_dir, doc_id, token_file_ext, token_offset_fields)

    # Parse the lines and save them as a table from id to content.
    system_mention_table = []
    gold_mention_table = []

    # Save the raw text for visualization.
    sys_id_2_text = {}
    gold_id_2_text = {}

    logger.debug("Reading gold and response mentions.")

    remaining_sys_ids = set()
    num_system_mentions = 0
    for sl in s_mention_lines:
        parse_result = parse_line(sl, invisible_ids)

        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        num_system_mentions += 1

        sys_attributes = parse_result[1]
        sys_mention_id = parse_result[2]
        text = parse_result[4]

        system_mention_table.append(parse_result)
        EvalState.all_possible_types.add(sys_attributes[0])
        remaining_sys_ids.add(sys_mention_id)
        sys_id_2_text[sys_mention_id] = text

    if num_system_mentions != len(remaining_sys_ids):
        logger.warning(
            "Duplicated mention id for doc %s, one of them is randomly removed."
            % doc_id)

    remaining_gold_ids = set()
    for gl in g_mention_lines:
        parse_result = parse_line(gl, invisible_ids)

        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        gold_attributes = parse_result[1]
        gold_mention_id = parse_result[2]
        text = parse_result[4]

        gold_mention_table.append(parse_result)
        EvalState.all_possible_types.add(gold_attributes[0])
        gold_id_2_text[gold_mention_id] = text
        remaining_gold_ids.add(gold_mention_id)

    num_system_predictions = len(system_mention_table)
    num_gold_predictions = len(gold_mention_table)

    # Store the list of mappings with their scores as a priority queue. Scores are negated so the highest overlap pops first.
    all_gold_system_mapping_scores = []

    # Debug purpose printing.
    print_score_matrix = False

    logger.debug("Computing overlap scores.")
    for system_index, (sys_spans, sys_attributes, sys_mention_id, _,
                       _) in enumerate(system_mention_table):
        if print_score_matrix:
            print("%d %s" % (system_index, sys_mention_id))
        for index, (gold_spans, gold_attributes, gold_mention_id, _,
                    _) in enumerate(gold_mention_table):
            if len(gold_spans) == 0:
                logger.warning(
                    "Found empty span gold standard at doc : %s, mention : %s"
                    % (doc_id, gold_mention_id))
            if len(sys_spans) == 0:
                logger.warning(
                    "Found empty span system at doc : %s, mention : %s" %
                    (doc_id, sys_mention_id))

            overlap = compute_overlap_score(gold_spans, sys_spans)

            if print_score_matrix:
                sys.stdout.write("%.1f " % overlap)

            if overlap > 0:
                # maintaining a max heap based on overlap score
                heapq.heappush(all_gold_system_mapping_scores,
                               (-overlap, system_index, index))
        if print_score_matrix:
            print()

    greedy_tp, greedy_attribute_tps, greedy_mention_only_mapping, greedy_all_attribute_mapping = get_tp_greedy(
        all_gold_system_mapping_scores, all_attribute_combinations,
        gold_mention_table, system_mention_table, doc_id)

    write_if_provided(diff_out, Config.bod_marker + " " + doc_id + "\n")
    if diff_out is not None:
        # If you change the mapping used here, you can see what is wrong at a different level.

        # write_gold_and_system_mappings(doc_id, system_id, greedy_all_attribute_mapping[0], gold_mention_table,
        #                                system_mention_table, diff_out)

        write_gold_and_system_mappings(system_id, greedy_mention_only_mapping,
                                       gold_mention_table,
                                       system_mention_table, diff_out)

    attribute_based_fps = [0.0] * len(all_attribute_combinations)
    for attribute_comb_index, abtp in enumerate(greedy_attribute_tps):
        attribute_based_fps[
            attribute_comb_index] = num_system_predictions - abtp

    # Unmapped system mentions and the partial scores are counted as false positives.
    fp = len(remaining_sys_ids) - greedy_tp

    EvalState.doc_mention_scores.append(
        (greedy_tp, fp, zip(greedy_attribute_tps, attribute_based_fps),
         num_gold_predictions, num_system_predictions, doc_id))

    # Select a computed mapping; we currently select the mapping based on mention type. This means that in order to get
    # coreference right, your mention type should also be right. This can be changed via the Config.coref_criteria
    # settings.
    mention_mapping = None
    type_mapping = None
    for attribute_comb_index, attribute_comb in enumerate(
            all_attribute_combinations):
        if attribute_comb == Config.coref_criteria:
            mention_mapping = greedy_all_attribute_mapping[
                attribute_comb_index]
            logger.debug("Select mapping that matches criteria [%s]" %
                         (Config.coref_criteria[0][1]))
        if attribute_comb[0][1] == "mention_type":
            type_mapping = greedy_all_attribute_mapping[attribute_comb_index]

    if Config.coref_criteria == "span_only":
        mention_mapping = greedy_mention_only_mapping

    if mention_mapping is None:
        # In case when we don't do attribute scoring.
        mention_mapping = greedy_mention_only_mapping

    # Evaluate the performance of each type.
    per_type_eval(system_mention_table, gold_mention_table, type_mapping)

    gold_directed_relations, gold_corefs = utils.parse_relation_lines(
        g_relation_lines, remaining_gold_ids)
    sys_directed_relations, sys_corefs = utils.parse_relation_lines(
        s_relation_lines, remaining_sys_ids)

    if Config.script_result_dir:
        seq_eval = TemporalEval(mention_mapping, gold_mention_table,
                                gold_directed_relations, system_mention_table,
                                sys_directed_relations, gold_corefs,
                                sys_corefs)

        if not Config.no_script_validation:
            if not seq_eval.validate_gold():
                logger.error(
                    "The gold edges cannot form a valid script graph.")
                utils.exit_on_fail()

            if not seq_eval.validate_sys():
                logger.error(
                    "The system edges cannot form a valid script graph.")
                utils.exit_on_fail()

        seq_eval.write_time_ml(doc_id)

    # Evaluate coreference links.
    if coref_out is not None:
        logger.debug("Start preparing coreference files.")

        # Prepare CoNLL style coreference input for this document.
        conll_converter = ConllEvaluator(doc_id, system_id, sys_id_2_text,
                                         gold_id_2_text)
        gold_conll_lines, sys_conll_lines = conll_converter.prepare_conll_lines(
            gold_corefs, sys_corefs, gold_mention_table, system_mention_table,
            mention_mapping, MutableConfig.coref_mention_threshold)

        # The first document claims the write flag and truncates the CoNLL files; later documents append.
        write_mode = 'w' if EvalState.claim_write_flag() else 'a'
        g_conll_out = open(Config.conll_gold_file, write_mode)
        s_conll_out = open(Config.conll_sys_file, write_mode)
        g_conll_out.writelines(gold_conll_lines)
        s_conll_out.writelines(sys_conll_lines)

        if diff_out is not None:
            write_gold_and_system_corefs(diff_out, gold_corefs, sys_corefs,
                                         gold_id_2_text, sys_id_2_text)

    write_if_provided(diff_out, Config.eod_marker + " " + "\n")

    return True
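
The greedy alignment above relies on heapq being a min-heap: each overlap score is pushed negated, so the (system, gold) pair with the largest overlap is popped first. A standalone sketch of that trick with made-up scores:

import heapq

# Made-up (negated overlap, system_index, gold_index) triples for illustration only.
mapping_scores = []
heapq.heappush(mapping_scores, (-0.5, 0, 1))
heapq.heappush(mapping_scores, (-1.0, 1, 2))  # perfect overlap
heapq.heappush(mapping_scores, (-0.8, 2, 0))

neg_overlap, sys_index, gold_index = heapq.heappop(mapping_scores)
print(-neg_overlap, sys_index, gold_index)  # 1.0 1 2 -> the best-overlapping pair comes out first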
Example #3
def evaluate(token_dir, coref_out, all_attribute_combinations, token_offset_fields, token_file_ext, diff_out):
    """
    Conduct the main evaluation steps.
    :param token_dir:
    :param coref_out:
    :param all_attribute_combinations:
    :param token_offset_fields:
    :param token_file_ext:
    :param diff_out:
    :return:
    """
    if EvalState.has_next_doc():
        res, (g_mention_lines, g_relation_lines), (
            s_mention_lines, s_relation_lines), doc_id, system_id = get_next_doc()
    else:
        return False

    logger.info("Evaluating Document %s" % doc_id)

    if len(g_mention_lines) == 0:
        logger.warning(
            "[%s] does not contain gold standard mentions. Document level F score will not be valid, but the micro "
            "score will be fine." % doc_id)

    invisible_ids = []
    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token, id2span = read_token_ids(token_dir, doc_id, token_file_ext, token_offset_fields)

    # Parse the lines and save them as a table from id to content.
    system_mention_table = []
    gold_mention_table = []

    # Save the raw text for visualization.
    sys_id_2_text = {}
    gold_id_2_text = {}

    logger.debug("Reading gold and response mentions.")

    sys_mention_ids = []
    for sl in s_mention_lines:
        parse_result = parse_line(sl, invisible_ids)

        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        # if len(sys_spans) == 0:
        #     # Temporarily ignoring empty mentions.
        #     continue

        sys_attributes = parse_result[1]
        sys_mention_id = parse_result[2]
        text = parse_result[4]

        system_mention_table.append(parse_result)
        EvalState.all_possible_types.add(sys_attributes[0])
        sys_mention_ids.append(sys_mention_id)
        sys_id_2_text[sys_mention_id] = text

    remaining_sys_ids = set(sys_mention_ids)
    if len(sys_mention_ids) != len(remaining_sys_ids):
        logger.error("Duplicated mention id for doc %s" % doc_id)
        return False

    remaining_gold_ids = set()
    for gl in g_mention_lines:
        parse_result = parse_line(gl, invisible_ids)

        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        gold_attributes = parse_result[1]
        gold_mention_id = parse_result[2]
        text = parse_result[4]

        gold_mention_table.append(parse_result)
        EvalState.all_possible_types.add(gold_attributes[0])
        gold_id_2_text[gold_mention_id] = text
        remaining_gold_ids.add(gold_mention_id)

    num_system_predictions = len(system_mention_table)
    num_gold_predictions = len(gold_mention_table)

    # Store the list of mappings with their scores as a priority queue. Scores are negated so the highest overlap pops first.
    all_gold_system_mapping_scores = []

    # Debug purpose printing.
    print_score_matrix = False

    logger.debug("Computing overlap scores.")
    for system_index, (sys_spans, sys_attributes, sys_mention_id, _, _) in enumerate(system_mention_table):
        if print_score_matrix:
            print(system_index, sys_mention_id, end=" ")
        for index, (gold_spans, gold_attributes, gold_mention_id, _, _) in enumerate(gold_mention_table):
            if len(gold_spans) == 0:
                logger.warning("Found empty span gold standard at doc : %s, mention : %s" % (doc_id, gold_mention_id))
            if len(sys_spans) == 0:
                logger.warning("Found empty span system standard at doc : %s, mention : %s" % (doc_id, sys_mention_id))

            overlap = compute_overlap_score(gold_spans, sys_spans)

            if print_score_matrix:
                print "%.1f" % overlap,

            if overlap > 0:
                # maintaining a max heap based on overlap score
                heapq.heappush(all_gold_system_mapping_scores, (-overlap, system_index, index))
        if print_score_matrix:
            print()

    greedy_tp, greedy_attribute_tps, greedy_mention_only_mapping, greedy_all_attribute_mapping = get_tp_greedy(
        all_gold_system_mapping_scores, all_attribute_combinations, gold_mention_table,
        system_mention_table, doc_id)

    write_if_provided(diff_out, Config.bod_marker + " " + doc_id + "\n")
    if diff_out is not None:
        # If you change the mapping used here, you can see what is wrong at a different level.

        # write_gold_and_system_mappings(doc_id, system_id, greedy_all_attribute_mapping[0], gold_mention_table,
        #                                system_mention_table, diff_out)

        write_gold_and_system_mappings(doc_id, system_id, greedy_mention_only_mapping, gold_mention_table,
                                       system_mention_table, diff_out)

    attribute_based_fps = [0.0] * len(all_attribute_combinations)
    for attribute_comb_index, abtp in enumerate(greedy_attribute_tps):
        attribute_based_fps[attribute_comb_index] = num_system_predictions - abtp

    # Unmapped system mentions and the partial scores are counted as false positives.
    fp = len(sys_mention_ids) - greedy_tp

    EvalState.doc_mention_scores.append((greedy_tp, fp, zip(greedy_attribute_tps, attribute_based_fps),
                                         num_gold_predictions, num_system_predictions, doc_id))

    # Select a computed mapping; we currently select the mapping based on mention type. This means that in order to get
    # coreference right, your mention type should also be right. This can be changed via the Config.coref_criteria
    # settings.
    coref_mapping = None
    type_mapping = None
    for attribute_comb_index, attribute_comb in enumerate(all_attribute_combinations):
        if attribute_comb == Config.coref_criteria:
            coref_mapping = greedy_all_attribute_mapping[attribute_comb_index]
            logger.debug("Select mapping that matches criteria [%s]" % (Config.coref_criteria[0][1]))
        if attribute_comb[0][1] == "mention_type":
            type_mapping = greedy_all_attribute_mapping[attribute_comb_index]

    if Config.coref_criteria == "span_only":
        coref_mapping = greedy_mention_only_mapping

    # Evaluate the performance of each type.
    per_type_eval(system_mention_table, gold_mention_table, type_mapping)

    # Parse relations.
    g_relations = [parse_relation(l) for l in g_relation_lines]
    s_relations = [parse_relation(l) for l in s_relation_lines]

    if EvalState.white_listed_types:
        g_relations = filter_relations(g_relations, remaining_gold_ids)
        s_relations = filter_relations(s_relations, remaining_sys_ids)

    if coref_mapping is None:
        # In case when we don't do attribute scoring.
        coref_mapping = greedy_mention_only_mapping

    # Evaluate after links.
    gold_afters = [after for after in g_relations if after[0] == Config.after_relation_name]
    sys_afters = [after for after in s_relations if after[0] == Config.after_relation_name]

    after_eval = TemporalEval(doc_id, coref_mapping, gold_mention_table, gold_afters, system_mention_table, sys_afters)
    after_eval.write_time_ml()

    # Evaluate coreference links.
    if coref_out is not None:
        logger.debug("Start preparing coreference files.")

        gold_corefs = [coref for coref in g_relations if coref[0] == Config.coreference_relation_name]

        sys_corefs = [coref for coref in s_relations if coref[0] == Config.coreference_relation_name]

        # Prepare CoNLL style coreference input for this document.
        conll_converter = ConllEvaluator(doc_id, system_id, sys_id_2_text, gold_id_2_text)
        gold_conll_lines, sys_conll_lines = conll_converter.prepare_conll_lines(gold_corefs, sys_corefs,
                                                                                gold_mention_table,
                                                                                system_mention_table,
                                                                                coref_mapping,
                                                                                MutableConfig.coref_mention_threshold)

        # The first document claims the write flag and truncates the CoNLL files; later documents append.
        write_mode = 'w' if EvalState.claim_write_flag() else 'a'
        g_conll_out = open(Config.conll_gold_file, write_mode)
        s_conll_out = open(Config.conll_sys_file, write_mode)
        g_conll_out.writelines(gold_conll_lines)
        s_conll_out.writelines(sys_conll_lines)

        if diff_out is not None:
            write_gold_and_system_corefs(diff_out, gold_corefs, sys_corefs, gold_id_2_text, sys_id_2_text)

    write_if_provided(diff_out, Config.eod_marker + " " + "\n")

    return True
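
Both versions of evaluate() open the CoNLL files with mode 'w' only once and then append, driven by EvalState.claim_write_flag(). A sketch of the assumed behaviour of that flag (an assumption, not the scorer's actual implementation): the first caller claims it and gets True, every later caller gets False.

class EvalStateSketch(object):
    # Assumed behaviour: the write flag can be claimed exactly once.
    _write_claimed = False

    @classmethod
    def claim_write_flag(cls):
        first_claim = not cls._write_claimed
        cls._write_claimed = True
        return first_claim


print('w' if EvalStateSketch.claim_write_flag() else 'a')  # 'w': the first document truncates the file
print('w' if EvalStateSketch.claim_write_flag() else 'a')  # 'a': later documents append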
Example #4
def validate_next(doc_lengths, possible_types, token_dir, token_offset_fields,
                  token_file_ext):
    global total_mentions
    global unrecognized_relation_count

    success = True

    res, (mention_lines, relation_lines), (_, _), doc_id = get_next_doc()

    max_length = None
    if doc_lengths is not None:
        if doc_id not in doc_lengths:
            logger.error("Document id not listed in evaluation set : %s",
                         doc_id)
            success = False
        else:
            max_length = doc_lengths[doc_id]

    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token_map, id2span_map = read_token_ids(
            token_dir, doc_id, token_file_ext, token_offset_fields)
    else:
        invisible_ids = set()
        id2token_map = {}

    # Parse the lines in file.
    mention_table = []

    mention_ids = []
    remaining_gold_ids = set()

    for l in mention_lines:
        mention_id, spans, attributes = parse_line(l, invisible_ids)

        if found_invalid_range(spans, max_length):
            logger.error(
                "The following mention line exceed the character range %d of document [%s]"
                % (max_length, doc_id))
            logger.error(l)
            success = False

        if possible_types is not None:
            mtype = canonicalize_string(attributes[0])
            if not check_type(possible_types, mtype):
                logger.error(
                    "Submission contains type [%s] that is not in evaluation."
                    % mtype)
                success = False

        mention_table.append((spans, attributes, mention_id))
        mention_ids.append(mention_id)
        all_possible_types.add(attributes[0])
        remaining_gold_ids.add(mention_id)

    total_mentions += len(mention_table)

    if not check_unique(mention_ids):
        logger.error("Duplicated mention id for doc %s" % doc_id)
        success = False

    if MutableConfig.eval_mode == EvalMethod.Token and has_invented_token(
            id2token_map, mention_table):
        logger.error("Invented token id was found for doc %s" % doc_id)
        logger.error("Tokens not in tbf not found in token map : %d" %
                     total_tokens_not_found)
        success = False

    clusters = {}
    cluster_id = 0
    for l in relation_lines:
        relation = utils.parse_relation_line(l)
        if relation[0] == Config.coreference_relation_name:
            clusters[cluster_id] = set(relation[2])
            cluster_id += 1
        elif relation[0] not in Config.all_relations:
            unrecognized_relation_count += 1
            logger.warning(
                "Relation [%s] is not recognized, this task only takes: [%s]",
                relation[0], ";".join(Config.all_relations))

        if has_invented_mentions(relation[2], set(mention_ids)):
            logger.error("This relation was found in file %s" % doc_id)
            success = False

    if unrecognized_relation_count > 10:
        logger.error("Too many unrecognized relations : %d" %
                     unrecognized_relation_count)
        success = False

    if transitive_not_resolved(clusters):
        logger.error(
            "Coreference transitive closure is not resolved! Please resolve before submitting."
        )
        logger.error("Problem was found in file %s" % doc_id)
        success = False

    if MutableConfig.eval_mode == EvalMethod.Char:
        event_mention_id_2_span = get_eid_2_character_span(mention_table)
    else:
        event_mention_id_2_span = get_eid_2_sorted_token_map(mention_table)

    # for cluster_id, cluster in clusters.iteritems():
    #     if invented_mention_check(cluster, event_mention_id_2_span):
    #         logger.error("Found invented id in clusters at doc [%s]" % doc_id)
    #         success = False

    directed_relations, corefs = utils.parse_relation_lines(
        relation_lines, remaining_gold_ids)

    seq_eval = TemporalEval([], mention_table, directed_relations, [], {},
                            corefs, [])
    if not seq_eval.validate_gold():
        logger.error(
            "The edges cannot form a valid script graph at doc [%s]." % doc_id)
        utils.exit_on_fail()

    return success
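
validate_next() rejects submissions whose coreference clusters do not form a transitive closure (transitive_not_resolved). A hypothetical way to spot the problem is to look for a mention that appears in two different clusters; this is only a sketch, not the scorer's implementation:

def clusters_share_mentions(clusters):
    # clusters: dict mapping cluster id -> set of mention ids.
    seen = {}
    for cluster_id, mentions in clusters.items():
        for mention in mentions:
            if mention in seen and seen[mention] != cluster_id:
                return True  # the same mention sits in two clusters: closure not resolved
            seen[mention] = cluster_id
    return False


print(clusters_share_mentions({0: {"E1", "E2"}, 1: {"E2", "E3"}}))  # True
print(clusters_share_mentions({0: {"E1", "E2"}, 1: {"E3", "E4"}}))  # False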
Example #5
def main():
    parser = argparse.ArgumentParser(
        description="Event mention scorer, which conducts token based "
                    "scoring, system and gold standard files should follows "
                    "the token-based format.")
    parser.add_argument("-g", "--gold", help="Golden Standard", required=True)
    parser.add_argument("-s", "--system", help="System output", required=True)
    parser.add_argument("-d", "--comparison_output",
                        help="Compare and help show the difference between "
                             "system and gold")
    parser.add_argument(
        "-o", "--output", help="Optional evaluation result redirects, put eval result to file")
    parser.add_argument(
        "-c", "--coref", help="Eval Coreference result output, need to put the reference"
                              "conll coref scorer in the same folder with this scorer")
    parser.add_argument(
        "-a", "--sequencing", help="Eval Event sequencing result output (After and Subevent)"
    )
    parser.add_argument(
        "-t", "--token_path", help="Path to the directory containing the "
                                   "token mappings file")
    parser.add_argument(
        "-m", "--coref_mapping", help="Which mapping will be used to perform coreference mapping.", type=int
    )
    parser.add_argument(
        "-of", "--offset_field", help="A pair of integer indicates which column we should "
                                      "read the offset in the token mapping file, index starts"
                                      "at 0, default value will be %s" % Config.default_token_offset_fields
    )
    parser.add_argument(
        "-te", "--token_table_extension",
        help="any extension appended after docid of token table files. "
             "Default is [%s]" % Config.default_token_file_ext)
    parser.add_argument("-ct", "--coreference_threshold", type=float, help="Threshold for coreference mention mapping")
    parser.add_argument(
        "-b", "--debug", help="turn debug mode on", action="store_true")

    parser.add_argument("--eval_mode", choices=["char", "token"], default="char",
                        help="Use Span Overlap or Token Overlap mode. The Span Overlap mode will take a span as range "
                             "[start:end], while the Token Overlap mode consider each token is provided as a single "
                             "id.")

    parser.add_argument("-wl", "--type_white_list", type=argparse.FileType('r'),
                        help="Provide a file, where each line list a mention type subtype pair to be evaluated. Types "
                             "that are out of this white list will be ignored.")

    parser.add_argument(
        "-dn", "--doc_id_to_eval", help="Provide one single doc id to evaluate."
    )

    parser.set_defaults(debug=False)
    args = parser.parse_args()

    if args.debug:
        stream_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.debug("Entered debug mode.")
    else:
        stream_handler.setLevel(logging.INFO)
        logger.setLevel(logging.INFO)

    if args.type_white_list is not None:
        logger.info("Only the following types in the white list will be evaluated.")
        EvalState.white_listed_types = set()
        for line in args.type_white_list:
            logger.info(line.strip())
            EvalState.white_listed_types.add(canonicalize_string(line))

    if args.eval_mode == "char":
        MutableConfig.eval_mode = EvalMethod.Char
    else:
        MutableConfig.eval_mode = EvalMethod.Token

    if args.output is not None:
        out_path = args.output
        utils.create_parent_dir(out_path)
        mention_eval_out = open(out_path, 'w')
        logger.info("Evaluation output will be saved at %s" % out_path)
    else:
        mention_eval_out = sys.stdout
        logger.info("Evaluation output at standard out.")

    if os.path.isfile(args.gold):
        gf = open(args.gold)
    else:
        logger.error("Cannot find gold standard file at " + args.gold)
        sys.exit(1)

    if args.coref is not None:
        Config.conll_out = args.coref
        Config.conll_gold_file = args.coref + "_gold.conll"
        Config.conll_sys_file = args.coref + "_sys.conll"

        logger.info("CoNLL script output will be output at " + Config.conll_out)

        logger.info(
            "Gold and system conll files will generated at " + Config.conll_gold_file + " and " + Config.conll_sys_file)
        # if os.path.exists(Config.conll_tmp_marker):
        #     # Clean up the directory to avoid scoring errors.
        #     remove_conll_tmp()
        # supermakedirs(Config.conll_tmp_marker)

    if args.sequencing is not None:
        Config.temporal_result_dir = args.sequencing
        utils.supermakedirs(os.path.join(Config.temporal_result_dir, Config.temporal_gold_dir))
        utils.supermakedirs(os.path.join(Config.temporal_result_dir, Config.temporal_sys_dir))

    if os.path.isfile(args.system):
        sf = open(args.system)
    else:
        logger.error("Cannot find system file at " + args.system)
        sys.exit(1)

    if args.coref_mapping is not None:
        if args.coref_mapping < 4:
            Config.coref_criteria = Config.possible_coref_mapping[args.coref_mapping]
        else:
            logger.error("Possible mapping : 0: Span only 1: Mention Type 2: Realis 3 Type and Realis")
            utils.terminate_with_error("Must provide a mapping between 0 to 3")
    else:
        Config.coref_criteria = Config.possible_coref_mapping[1]

    diff_out = None
    if args.comparison_output is not None:
        diff_out_path = args.comparison_output
        utils.create_parent_dir(diff_out_path)
        diff_out = open(diff_out_path, 'w')

    token_dir = "."
    if args.token_path is not None:
        if args.eval_mode == EvalMethod.Token:
            utils.terminate_with_error("Token table (-t) must be provided in token mode")
        if os.path.isdir(args.token_path):
            logger.debug("Will search token files in " + args.token_path)
            token_dir = args.token_path
        else:
            logger.debug("Cannot find given token directory at [%s], "
                         "will try search for current directory" % args.token_path)

    token_offset_fields = Config.default_token_offset_fields
    if args.offset_field is not None:
        try:
            token_offset_fields = [int(x) for x in args.offset_field.split(",")]
        except ValueError as _:
            logger.error("Token offset argument should be two integer with comma in between, i.e. 2,3")

    if args.coreference_threshold is not None:
        MutableConfig.coref_mention_threshold = args.coreference_threshold

    # Read all documents.
    read_all_doc(gf, sf, args.doc_id_to_eval)

    # Take all attribute combinations, which will be used to produce scores.
    attribute_comb = get_attr_combinations(Config.attribute_names)

    logger.info("Coreference mentions need to match %s before consideration" % Config.coref_criteria[0][1])

    while True:
        if not evaluate(token_dir, args.coref, attribute_comb,
                        token_offset_fields, args.token_table_extension,
                        diff_out):
            break

    # Run the CoNLL script on the combined files, which is concatenated from the best alignment of all documents.
    if args.coref is not None:
        logger.debug("Running coreference script for the final scores.")
        ConllEvaluator.run_conll_script(Config.conll_gold_file, Config.conll_sys_file, Config.conll_out)
        # Get the CoNLL scores from output
        EvalState.overall_coref_scores = ConllEvaluator.get_conll_scores(Config.conll_out)

    # Run the TimeML evaluation script.
    TemporalEval.eval_time_ml()

    print_eval_results(mention_eval_out, attribute_comb)

    # Clean up, close files.
    close_if_not_none(diff_out)

    logger.info("Evaluation Done.")