Example #1
def evaluate(token_dir, coref_out, all_attribute_combinations,
             token_offset_fields, token_file_ext, diff_out):
    """
    Conduct the main evaluation steps.
    :param token_dir:
    :param coref_out:
    :param all_attribute_combinations:
    :param token_offset_fields:
    :param token_file_ext:
    :param diff_out:
    :return:
    """
    if EvalState.has_next_doc():
        res, (g_mention_lines, g_relation_lines), (
            s_mention_lines,
            s_relation_lines), doc_id, system_id = get_next_doc()
    else:
        return False

    logger.info("Evaluating Document %s" % doc_id)

    if len(g_mention_lines) == 0:
        logger.warning(
            "[%s] does not contain gold standard mentions. The document-level F score will not be "
            "valid, but the micro average score will be fine." % doc_id)

    invisible_ids = []
    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token, id2span = read_token_ids(
            token_dir, doc_id, token_file_ext, token_offset_fields)

    # Parse the mention lines and store the parsed fields in tables.
    system_mention_table = []
    gold_mention_table = []

    # Save the raw text for visualization.
    sys_id_2_text = {}
    gold_id_2_text = {}

    logger.debug("Reading gold and response mentions.")

    remaining_sys_ids = set()
    num_system_mentions = 0
    for sl in s_mention_lines:
        parse_result = parse_line(sl, invisible_ids)

        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        num_system_mentions += 1

        sys_attributes = parse_result[1]
        sys_mention_id = parse_result[2]
        text = parse_result[4]

        system_mention_table.append(parse_result)
        EvalState.all_possible_types.add(sys_attributes[0])
        remaining_sys_ids.add(sys_mention_id)
        sys_id_2_text[sys_mention_id] = text

    if num_system_mentions != len(remaining_sys_ids):
        logger.warning(
            "Duplicated mention id for doc %s, one of the duplicates is dropped arbitrarily."
            % doc_id)

    remaining_gold_ids = set()
    for gl in g_mention_lines:
        parse_result = parse_line(gl, invisible_ids)

        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        gold_attributes = parse_result[1]
        gold_mention_id = parse_result[2]
        text = parse_result[4]

        gold_mention_table.append(parse_result)
        EvalState.all_possible_types.add(gold_attributes[0])
        gold_id_2_text[gold_mention_id] = text
        remaining_gold_ids.add(gold_mention_id)

    num_system_predictions = len(system_mention_table)
    num_gold_predictions = len(gold_mention_table)

    # Store candidate (gold, system) mappings in a priority queue keyed by
    # overlap score. Scores are stored negated so that heapq's min-heap pops
    # the highest overlap first.
    all_gold_system_mapping_scores = []

    # Debug-only: print the overlap score matrix.
    print_score_matrix = False

    logger.debug("Computing overlap scores.")
    for system_index, (sys_spans, sys_attributes, sys_mention_id, _,
                       _) in enumerate(system_mention_table):
        if print_score_matrix:
            print("%d %s" % (system_index, sys_mention_id))
        for index, (gold_spans, gold_attributes, gold_mention_id, _,
                    _) in enumerate(gold_mention_table):
            if len(gold_spans) == 0:
                logger.warning(
                    "Found gold standard mention with empty span in doc %s, mention %s"
                    % (doc_id, gold_mention_id))
            if len(sys_spans) == 0:
                logger.warning(
                    "Found system mention with empty span in doc %s, mention %s" %
                    (doc_id, sys_mention_id))

            overlap = compute_overlap_score(gold_spans, sys_spans)

            if print_score_matrix:
                sys.stdout.write("%.1f " % overlap)

            if overlap > 0:
                # Maintain a max heap keyed on overlap score; the score is
                # negated because heapq implements a min-heap.
                heapq.heappush(all_gold_system_mapping_scores,
                               (-overlap, system_index, index))
        if print_score_matrix:
            print()

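    # Greedy matching: pop candidate pairs from the heap in descending overlap
    # order and accept a pair only if neither mention is already mapped,
    # yielding the true-positive counts and mappings used below.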
    greedy_tp, greedy_attribute_tps, greedy_mention_only_mapping, greedy_all_attribute_mapping = get_tp_greedy(
        all_gold_system_mapping_scores, all_attribute_combinations,
        gold_mention_table, system_mention_table, doc_id)

    write_if_provided(diff_out, Config.bod_marker + " " + doc_id + "\n")
    if diff_out is not None:
        # Changing the mapping passed here lets you inspect what went wrong
        # at a different matching level.

        # write_gold_and_system_mappings(doc_id, system_id, greedy_all_attribute_mapping[0], gold_mention_table,
        #                                system_mention_table, diff_out)

        write_gold_and_system_mappings(system_id, greedy_mention_only_mapping,
                                       gold_mention_table,
                                       system_mention_table, diff_out)

    attribute_based_fps = [0.0] * len(all_attribute_combinations)
    for attribute_comb_index, abtp in enumerate(greedy_attribute_tps):
        attribute_based_fps[
            attribute_comb_index] = num_system_predictions - abtp

    # Unmapped system mentions, plus the uncredited share of partial matches,
    # are counted as false positives.
    fp = len(remaining_sys_ids) - greedy_tp

    EvalState.doc_mention_scores.append(
        (greedy_tp, fp, list(zip(greedy_attribute_tps, attribute_based_fps)),
         num_gold_predictions, num_system_predictions, doc_id))

    # Select a computed mapping; we currently use the mapping based on mention
    # type. This means that to get coreference right, the mention type must
    # also be right. This behavior can be changed via Config.coref_criteria.
    mention_mapping = None
    type_mapping = None
    for attribute_comb_index, attribute_comb in enumerate(
            all_attribute_combinations):
        if attribute_comb == Config.coref_criteria:
            mention_mapping = greedy_all_attribute_mapping[
                attribute_comb_index]
            logger.debug("Select mapping that matches criteria [%s]" %
                         (Config.coref_criteria[0][1]))
        if attribute_comb[0][1] == "mention_type":
            type_mapping = greedy_all_attribute_mapping[attribute_comb_index]

    if Config.coref_criteria == "span_only":
        mention_mapping = greedy_mention_only_mapping

    if mention_mapping is None:
        # In case when we don't do attribute scoring.
        mention_mapping = greedy_mention_only_mapping

    # Evaluate the performance of each mention type.
    per_type_eval(system_mention_table, gold_mention_table, type_mapping)

    gold_directed_relations, gold_corefs = utils.parse_relation_lines(
        g_relation_lines, remaining_gold_ids)
    sys_directed_relations, sys_corefs = utils.parse_relation_lines(
        s_relation_lines, remaining_sys_ids)

    if Config.script_result_dir:
        seq_eval = TemporalEval(mention_mapping, gold_mention_table,
                                gold_directed_relations, system_mention_table,
                                sys_directed_relations, gold_corefs,
                                sys_corefs)

        if not Config.no_script_validation:
            if not seq_eval.validate_gold():
                logger.error(
                    "The gold edges cannot form a valid script graph.")
                utils.exit_on_fail()

            if not seq_eval.validate_sys():
                logger.error(
                    "The system edges cannot form a valid script graph.")
                utils.exit_on_fail()

        seq_eval.write_time_ml(doc_id)

    # Evaluate coreference links.
    if coref_out is not None:
        logger.debug("Start preparing coreference files.")

        # Prepare CoNLL style coreference input for this document.
        conll_converter = ConllEvaluator(doc_id, system_id, sys_id_2_text,
                                         gold_id_2_text)
        gold_conll_lines, sys_conll_lines = conll_converter.prepare_conll_lines(
            gold_corefs, sys_corefs, gold_mention_table, system_mention_table,
            mention_mapping, MutableConfig.coref_mention_threshold)

        # 'w' truncates the CoNLL files when the write flag is first claimed;
        # subsequent documents are appended.
        write_mode = 'w' if EvalState.claim_write_flag() else 'a'
        with open(Config.conll_gold_file, write_mode) as g_conll_out, \
                open(Config.conll_sys_file, write_mode) as s_conll_out:
            g_conll_out.writelines(gold_conll_lines)
            s_conll_out.writelines(sys_conll_lines)

        if diff_out is not None:
            write_gold_and_system_corefs(diff_out, gold_corefs, sys_corefs,
                                         gold_id_2_text, sys_id_2_text)

    write_if_provided(diff_out, Config.eod_marker + " " + "\n")

    return True
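
The mapping step above negates each overlap score before pushing it onto the heap because Python's heapq module only provides a min-heap; storing negative scores makes the smallest heap element correspond to the largest overlap. A minimal self-contained sketch of that trick, using toy values rather than the scorer's data structures:

import heapq

# Toy overlap scores as (overlap, system index, gold index).
candidates = [(0.4, 0, 0), (1.0, 0, 1), (0.7, 1, 1)]

heap = []
for overlap, sys_idx, gold_idx in candidates:
    # Negate the score so the min-heap pops the largest overlap first.
    heapq.heappush(heap, (-overlap, sys_idx, gold_idx))

while heap:
    neg_overlap, sys_idx, gold_idx = heapq.heappop(heap)
    print("overlap %.1f: system %d -> gold %d"
          % (-neg_overlap, sys_idx, gold_idx))
# Pairs come out in descending overlap order: 1.0, 0.7, 0.4.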
Example #2
def validate_next(doc_lengths, possible_types, token_dir, token_offset_fields,
                  token_file_ext):
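    """
    Validate the next document in the submission: check that mention spans,
    types and ids are well formed, that relation lines are recognized, and
    that coreference clusters are transitively closed.
    :param doc_lengths: map from document id to its character length, or None.
    :param possible_types: set of allowed mention types, or None to skip the check.
    :param token_dir: directory containing the token files.
    :param token_offset_fields: fields in the token file that hold the offsets.
    :param token_file_ext: file extension of the token files.
    :return: True if the document passes all checks.
    """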
    global total_mentions
    global unrecognized_relation_count

    success = True

    res, (mention_lines, relation_lines), (_, _), doc_id = get_next_doc()

    max_length = None
    if doc_lengths is not None:
        if doc_id not in doc_lengths:
            logger.error("Document id not listed in evaluation set : %s",
                         doc_id)
            success = False
        else:
            max_length = doc_lengths[doc_id]

    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token_map, id2span_map = read_token_ids(
            token_dir, doc_id, token_file_ext, token_offset_fields)
    else:
        invisible_ids = set()
        id2token_map = {}

    # Parse the lines in file.
    mention_table = []

    mention_ids = []
    remaining_gold_ids = set()

    for l in mention_lines:
        mention_id, spans, attributes = parse_line(l, invisible_ids)

        if found_invalid_range(spans, max_length):
            logger.error(
                "The following mention line exceeds the character range %d of document [%s]"
                % (max_length, doc_id))
            logger.error(l)
            success = False

        if possible_types is not None:
            mtype = canonicalize_string(attributes[0])
            if not check_type(possible_types, mtype):
                logger.error(
                    "Submission contains type [%s] that is not in evaluation."
                    % mtype)
                success = False

        mention_table.append((spans, attributes, mention_id))
        mention_ids.append(mention_id)
        all_possible_types.add(attributes[0])
        remaining_gold_ids.add(mention_id)

    total_mentions += len(mention_table)

    if not check_unique(mention_ids):
        logger.error("Duplicated mention id for doc %s" % doc_id)
        success = False

    if MutableConfig.eval_mode == EvalMethod.Token and has_invented_token(
            id2token_map, mention_table):
        logger.error("Invented token id was found for doc %s" % doc_id)
        logger.error("Tokens not in tbf not found in token map : %d" %
                     total_tokens_not_found)
        success = False

    clusters = {}
    cluster_id = 0
    for l in relation_lines:
        relation = utils.parse_relation_line(l)
        if relation[0] == Config.coreference_relation_name:
            clusters[cluster_id] = set(relation[2])
            cluster_id += 1
        elif relation[0] not in Config.all_relations:
            unrecognized_relation_count += 1
            logger.warning(
                "Relation [%s] is not recognized, this task only takes: [%s]",
                relation[0], ";".join(Config.all_relations))

        if has_invented_mentions(relation[2], set(mention_ids)):
            logger.error(
                "Relation [%s] references mention ids that do not exist in file %s"
                % (relation[0], doc_id))
            success = False

    if unrecognized_relation_count > 10:
        logger.error("Too many unrecognized relations : %d" %
                     unrecognized_relation_count)
        success = False

    if transitive_not_resolved(clusters):
        logger.error(
            "Coreference transitive closure is not resolved! Please resolve before submitting."
        )
        logger.error("Problem was found in file %s" % doc_id)
        success = False

    if MutableConfig.eval_mode == EvalMethod.Char:
        event_mention_id_2_span = get_eid_2_character_span(mention_table)
    else:
        event_mention_id_2_span = get_eid_2_sorted_token_map(mention_table)

    # for cluster_id, cluster in clusters.items():
    #     if invented_mention_check(cluster, event_mention_id_2_span):
    #         logger.error("Found invented id in clusters at doc [%s]" % doc_id)
    #         success = False

    directed_relations, corefs = utils.parse_relation_lines(
        relation_lines, remaining_gold_ids)

    seq_eval = TemporalEval([], mention_table, directed_relations, [], {},
                            corefs, [])
    if not seq_eval.validate_gold():
        logger.error(
            "The edges cannot form a valid script graph at doc [%s]." % doc_id)
        utils.exit_on_fail()

    return success
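
The transitive_not_resolved check above enforces that submitted coreference clusters are already closed under transitivity. The project's implementation is not shown here, but the idea can be sketched with a hypothetical stand-in: if two clusters share a mention, transitivity would require them to be one cluster, so any overlap means the closure was not taken.

from itertools import combinations

def clusters_not_transitively_closed(clusters):
    """Hypothetical stand-in for transitive_not_resolved: returns True if
    any two clusters share a mention id, i.e. closure was not resolved."""
    for (_, a), (_, b) in combinations(clusters.items(), 2):
        if a & b:
            return True
    return False

# "m2" sits in two clusters, so the closure is unresolved.
print(clusters_not_transitively_closed(
    {0: {"m1", "m2"}, 1: {"m2", "m3"}}))  # True
print(clusters_not_transitively_closed(
    {0: {"m1", "m2"}, 1: {"m3"}}))        # False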