def evaluate(token_dir, coref_out, all_attribute_combinations,
             token_offset_fields, token_file_ext, diff_out):
    """
    Conduct the main evaluation steps.
    :param token_dir: Directory containing the token files.
    :param coref_out: Output path for the coreference evaluation, or None.
    :param all_attribute_combinations: Attribute combinations to score on.
    :param token_offset_fields: Fields in the token file that hold the offsets.
    :param token_file_ext: File extension of the token files.
    :param diff_out: Output stream for gold/system comparisons, or None.
    :return: True if a document was evaluated, False when no documents remain.
    """
    if EvalState.has_next_doc():
        res, (g_mention_lines, g_relation_lines), (
            s_mention_lines, s_relation_lines), doc_id, system_id = get_next_doc()
    else:
        return False

    logger.info("Evaluating Document %s" % doc_id)

    if len(g_mention_lines) == 0:
        logger.warning(
            "[%s] does not contain gold standard mentions. Document level F "
            "score will not be valid, but the micro score will be fine." % doc_id)

    invisible_ids = []
    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token, id2span = read_token_ids(
            token_dir, doc_id, token_file_ext, token_offset_fields)

    # Parse the lines and save them as a table from id to content.
    system_mention_table = []
    gold_mention_table = []

    # Save the raw text for visualization.
    sys_id_2_text = {}
    gold_id_2_text = {}

    logger.debug("Reading gold and response mentions.")

    remaining_sys_ids = set()
    num_system_mentions = 0
    for sl in s_mention_lines:
        parse_result = parse_line(sl, invisible_ids)

        # If the parse result is rejected, ignore this line.
        if not parse_result:
            continue

        num_system_mentions += 1

        sys_attributes = parse_result[1]
        sys_mention_id = parse_result[2]
        text = parse_result[4]

        system_mention_table.append(parse_result)
        EvalState.all_possible_types.add(sys_attributes[0])
        remaining_sys_ids.add(sys_mention_id)
        sys_id_2_text[sys_mention_id] = text

    if num_system_mentions != len(remaining_sys_ids):
        logger.warning(
            "Duplicated mention id for doc %s, one of them is randomly removed."
            % doc_id)

    remaining_gold_ids = set()
    for gl in g_mention_lines:
        parse_result = parse_line(gl, invisible_ids)

        # If the parse result is rejected, ignore this line.
        if not parse_result:
            continue

        gold_attributes = parse_result[1]
        gold_mention_id = parse_result[2]
        text = parse_result[4]

        gold_mention_table.append(parse_result)
        EvalState.all_possible_types.add(gold_attributes[0])
        gold_id_2_text[gold_mention_id] = text
        remaining_gold_ids.add(gold_mention_id)

    num_system_predictions = len(system_mention_table)
    num_gold_predictions = len(gold_mention_table)

    # Store the candidate mappings in a priority queue. Scores are stored
    # negated for easy sorting.
    all_gold_system_mapping_scores = []
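    # Note on the ordering: Python's heapq is a min-heap, so pushing
    # (-overlap, system_index, gold_index) tuples makes heappop() return the
    # pair with the highest overlap first; ties break on the lower system
    # index, then the lower gold index. A quick illustration:
    #
    #   heap = []
    #   heapq.heappush(heap, (-0.5, 1, 0))
    #   heapq.heappush(heap, (-1.0, 0, 2))
    #   heapq.heappop(heap)  # -> (-1.0, 0, 2), the highest-overlap pair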
    # Debug purpose printing.
    print_score_matrix = False

    logger.debug("Computing overlap scores.")
    for system_index, (sys_spans, sys_attributes, sys_mention_id, _, _) in \
            enumerate(system_mention_table):
        if print_score_matrix:
            print("%d %s" % (system_index, sys_mention_id))
        for index, (gold_spans, gold_attributes, gold_mention_id, _, _) in \
                enumerate(gold_mention_table):
            if len(gold_spans) == 0:
                logger.warning(
                    "Found empty gold standard span at doc : %s, mention : %s"
                    % (doc_id, gold_mention_id))
            if len(sys_spans) == 0:
                logger.warning(
                    "Found empty system span at doc : %s, mention : %s"
                    % (doc_id, sys_mention_id))

            overlap = compute_overlap_score(gold_spans, sys_spans)

            if print_score_matrix:
                sys.stdout.write("%.1f " % overlap)

            if overlap > 0:
                # Maintain a max heap based on the overlap score by pushing
                # the negated score.
                heapq.heappush(all_gold_system_mapping_scores,
                               (-overlap, system_index, index))
        if print_score_matrix:
            print()

    greedy_tp, greedy_attribute_tps, greedy_mention_only_mapping, \
        greedy_all_attribute_mapping = get_tp_greedy(
            all_gold_system_mapping_scores, all_attribute_combinations,
            gold_mention_table, system_mention_table, doc_id)

    write_if_provided(diff_out, Config.bod_marker + " " + doc_id + "\n")
    if diff_out is not None:
        # If you change the mapping used here, you can inspect what went wrong
        # at a different attribute level.
        # write_gold_and_system_mappings(doc_id, system_id,
        #                                greedy_all_attribute_mapping[0],
        #                                gold_mention_table,
        #                                system_mention_table, diff_out)
        write_gold_and_system_mappings(system_id, greedy_mention_only_mapping,
                                       gold_mention_table, system_mention_table,
                                       diff_out)

    attribute_based_fps = [0.0] * len(all_attribute_combinations)
    for attribute_comb_index, abtp in enumerate(greedy_attribute_tps):
        attribute_based_fps[attribute_comb_index] = num_system_predictions - abtp

    # Unmapped system mentions and the partial scores are counted as false
    # positives.
    fp = len(remaining_sys_ids) - greedy_tp

    EvalState.doc_mention_scores.append(
        (greedy_tp, fp, list(zip(greedy_attribute_tps, attribute_based_fps)),
         num_gold_predictions, num_system_predictions, doc_id))

    # Select a computed mapping; we currently select the mapping based on
    # mention type. This means that in order to get coreference right, your
    # mention type should also be right. This can be changed through the
    # Config.coref_criteria settings.
    mention_mapping = None
    type_mapping = None
    for attribute_comb_index, attribute_comb in enumerate(
            all_attribute_combinations):
        if attribute_comb == Config.coref_criteria:
            mention_mapping = greedy_all_attribute_mapping[attribute_comb_index]
            logger.debug("Select mapping that matches criteria [%s]"
                         % (Config.coref_criteria[0][1]))
        if attribute_comb[0][1] == "mention_type":
            type_mapping = greedy_all_attribute_mapping[attribute_comb_index]

    if Config.coref_criteria == "span_only":
        mention_mapping = greedy_mention_only_mapping

    if mention_mapping is None:
        # Fall back to the mention-only mapping when attribute scoring is not
        # performed.
        mention_mapping = greedy_mention_only_mapping

    # Evaluate the performance of each mention type.
    per_type_eval(system_mention_table, gold_mention_table, type_mapping)

    gold_directed_relations, gold_corefs = utils.parse_relation_lines(
        g_relation_lines, remaining_gold_ids)
    sys_directed_relations, sys_corefs = utils.parse_relation_lines(
        s_relation_lines, remaining_sys_ids)

    if Config.script_result_dir:
        seq_eval = TemporalEval(mention_mapping, gold_mention_table,
                                gold_directed_relations, system_mention_table,
                                sys_directed_relations, gold_corefs, sys_corefs)

        if not Config.no_script_validation:
            if not seq_eval.validate_gold():
                logger.error("The gold edges cannot form a valid script graph.")
                utils.exit_on_fail()

            if not seq_eval.validate_sys():
                logger.error("The system edges cannot form a valid script graph.")
                utils.exit_on_fail()

        seq_eval.write_time_ml(doc_id)

    # Evaluate coreference links.
    if coref_out is not None:
        logger.debug("Start preparing coreference files.")

        # Prepare CoNLL style coreference input for this document.
        conll_converter = ConllEvaluator(doc_id, system_id, sys_id_2_text,
                                         gold_id_2_text)
        gold_conll_lines, sys_conll_lines = conll_converter.prepare_conll_lines(
            gold_corefs, sys_corefs, gold_mention_table, system_mention_table,
            mention_mapping, MutableConfig.coref_mention_threshold)

        # Write a fresh file for the first document, then append for the rest.
        write_mode = 'w' if EvalState.claim_write_flag() else 'a'
        with open(Config.conll_gold_file, write_mode) as g_conll_out, \
                open(Config.conll_sys_file, write_mode) as s_conll_out:
            g_conll_out.writelines(gold_conll_lines)
            s_conll_out.writelines(sys_conll_lines)

        if diff_out is not None:
            write_gold_and_system_corefs(diff_out, gold_corefs, sys_corefs,
                                         gold_id_2_text, sys_id_2_text)

    write_if_provided(diff_out, Config.eod_marker + " " + "\n")

    return True
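
# A minimal sketch of how evaluate() is typically driven: it returns False
# once EvalState has no more documents, so a caller can simply loop until
# exhaustion. The helper name below is hypothetical, for illustration only;
# the argument values are whatever the caller has configured.
def _example_evaluation_loop(token_dir, coref_out, all_attribute_combinations,
                             token_offset_fields, token_file_ext, diff_out):
    doc_count = 0
    # Each call consumes one document and reports True on success.
    while evaluate(token_dir, coref_out, all_attribute_combinations,
                   token_offset_fields, token_file_ext, diff_out):
        doc_count += 1
    logger.info("Evaluated %d documents." % doc_count)
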
def validate_next(doc_lengths, possible_types, token_dir, token_offset_fields,
                  token_file_ext):
    global total_mentions
    global unrecognized_relation_count

    success = True

    res, (mention_lines, relation_lines), (_, _), doc_id = get_next_doc()

    max_length = None
    if doc_lengths is not None:
        if doc_id not in doc_lengths:
            logger.error("Document id not listed in evaluation set : %s", doc_id)
            success = False
        else:
            max_length = doc_lengths[doc_id]

    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token_map, id2span_map = read_token_ids(
            token_dir, doc_id, token_file_ext, token_offset_fields)
    else:
        invisible_ids = set()
        id2token_map = {}

    # Parse the lines in the file.
    mention_table = []
    mention_ids = []
    remaining_gold_ids = set()

    for l in mention_lines:
        mention_id, spans, attributes = parse_line(l, invisible_ids)

        if found_invalid_range(spans, max_length):
            logger.error(
                "The following mention line exceeds the character range %d of "
                "document [%s]" % (max_length, doc_id))
            logger.error(l)
            success = False

        if possible_types is not None:
            mtype = canonicalize_string(attributes[0])
            if not check_type(possible_types, mtype):
                logger.error(
                    "Submission contains type [%s] that is not in evaluation."
                    % mtype)
                success = False

        mention_table.append((spans, attributes, mention_id))
        mention_ids.append(mention_id)
        all_possible_types.add(attributes[0])
        remaining_gold_ids.add(mention_id)

    total_mentions += len(mention_table)

    if not check_unique(mention_ids):
        logger.error("Duplicated mention id for doc %s" % doc_id)
        success = False

    if MutableConfig.eval_mode == EvalMethod.Token and has_invented_token(
            id2token_map, mention_table):
        logger.error("Invented token id was found for doc %s" % doc_id)
        logger.error("Tokens in tbf not found in token map : %d"
                     % total_tokens_not_found)
        success = False

    clusters = {}
    cluster_id = 0
    for l in relation_lines:
        relation = utils.parse_relation_line(l)
        if relation[0] == Config.coreference_relation_name:
            clusters[cluster_id] = set(relation[2])
            cluster_id += 1
        elif relation[0] not in Config.all_relations:
            unrecognized_relation_count += 1
            logger.warning(
                "Relation [%s] is not recognized, this task only takes: [%s]",
                relation[0], ";".join(Config.all_relations))

        if has_invented_mentions(relation[2], set(mention_ids)):
            logger.error("A relation refers to invented mention ids in file %s"
                         % doc_id)
            success = False

    if unrecognized_relation_count > 10:
        logger.error("Too many unrecognized relations : %d"
                     % unrecognized_relation_count)
        success = False

    if transitive_not_resolved(clusters):
        logger.error("Coreference transitive closure is not resolved! Please "
                     "resolve before submitting.")
        logger.error("Problem was found in file %s" % doc_id)
        success = False

    if MutableConfig.eval_mode == EvalMethod.Char:
        event_mention_id_2_span = get_eid_2_character_span(mention_table)
    else:
        event_mention_id_2_span = get_eid_2_sorted_token_map(mention_table)

    # for cluster_id, cluster in clusters.items():
    #     if invented_mention_check(cluster, event_mention_id_2_span):
    #         logger.error("Found invented id in clusters at doc [%s]" % doc_id)
    #         success = False

    directed_relations, corefs = utils.parse_relation_lines(
        relation_lines, remaining_gold_ids)

    seq_eval = TemporalEval([], mention_table, directed_relations, [], {},
                            corefs, [])

    if not seq_eval.validate_gold():
        logger.error("The edges cannot form a valid script graph at doc [%s]."
                     % doc_id)
        utils.exit_on_fail()

    return success
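
# For reference, the transitive-closure requirement validated above can be
# stated simply: no two coreference clusters may share a mention, since
# sharing one implies the clusters should have been merged. The function
# below is an illustrative sketch of that check, not the scorer's actual
# transitive_not_resolved implementation.
def _clusters_overlap(clusters):
    """Return True if any two clusters share a mention id."""
    seen = set()
    for cluster in clusters.values():
        if seen & cluster:
            # A mention appears in two clusters, so the closure is unresolved.
            return True
        seen |= cluster
    return False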