def _count_cluster_pairs(matrix, clusters, other_mentions, total_key,
                         correct_key, sents, trees, heads):
    """Count mention pairs inside each cluster, bucketed by mention types.

    For every unordered pair of mentions in the same cluster, increments
    ``matrix[type_later, type_earlier, total_key]``, where "later" and
    "earlier" refer to the position in the sorted cluster.  If both mentions
    are keys of ``other_mentions`` and map to equal values there, also
    increments ``matrix[type_later, type_earlier, correct_key]``.

    Args:
        matrix: mutable mapping updated in place with ``+=``; it must supply
            a default for unseen keys (presumably a defaultdict/Counter --
            confirm against the caller).
        clusters: mapping whose values are collections of sortable mentions.
        other_mentions: mapping from mention to a value compared with ``==``
            (presumably the cluster id on the other side -- confirm).
        total_key: counter name used for every pair.
        correct_key: counter name used for pairs also linked on the other side.
        sents, trees, heads: passed through to ``coref.mention_type``.
    """
    for cluster in clusters.values():
        # A cluster needs at least two mentions to contain a pair.
        if len(cluster) < 2:
            continue
        ordered = sorted(cluster)
        # Resolve each mention's type once instead of once per pair.
        # NOTE(review): assumes coref.mention_type is side-effect free.
        types = [coref.mention_type(mention, sents, trees, heads)
                 for mention in ordered]
        for i, later in enumerate(ordered):
            for j in range(i):
                earlier = ordered[j]
                matrix[types[i], types[j], total_key] += 1
                if (later in other_mentions
                        and earlier in other_mentions
                        and other_mentions[later] == other_mentions[earlier]):
                    matrix[types[i], types[j], correct_key] += 1


def calc_matrix(matrix, gold_mentions, pred_mentions, gold_clusters,
                pred_clusters, sents, trees, heads):
    """Accumulate pairwise link counts per mention-type pair into ``matrix``.

    Gold clusters contribute to ``'total_gold_pairs'`` and, when the same two
    mentions share a value in ``pred_mentions``, to ``'correct_gold_pairs'``.
    Predicted clusters contribute symmetrically to ``'total_pred_pairs'`` and
    ``'correct_pred_pairs'``, checked against ``gold_mentions``.

    ``matrix`` is modified in place; nothing is returned.
    """
    _count_cluster_pairs(matrix, gold_clusters, pred_mentions,
                         'total_gold_pairs', 'correct_gold_pairs',
                         sents, trees, heads)
    _count_cluster_pairs(matrix, pred_clusters, gold_mentions,
                         'total_pred_pairs', 'correct_pred_pairs',
                         sents, trees, heads)
def evaluate(data, eval_by_types=False):
    """Score predicted mentions against gold mentions and log the result.

    Walks ``data[doc][part]`` for every document and part.  Each part is
    expected to provide the keys ``'text'``, ``'parses'``, ``'heads'``,
    ``'mentions'`` (gold) and ``'pred_mentions'``.  A predicted mention
    counts as correct when it is ``in`` the gold mentions.

    Args:
        data: mapping ``doc -> part -> part record`` as described above.
        eval_by_types: when true, also call ``print_rpf1_by_types`` with the
            per-type counters.

    Returns:
        Always ``True``.
    """
    def new_counts():
        # Numerators/denominators for recall and precision.
        return {'num_rec': 0.0, 'den_rec': 0.0, 'num_pre': 0.0, 'den_pre': 0.0}

    data_stat = {'files': 0, 'docs': 0, 'sents': 0, 'words': 0}
    stat = new_counts()
    # Indexed by the value returned from coreference.mention_type; any other
    # return value would raise KeyError below.
    type_stat = {'name': new_counts(),
                 'nominal': new_counts(),
                 'pronoun': new_counts()}

    for doc in data:
        data_stat['files'] += 1
        for part in data[doc]:
            part_data = data[doc][part]
            sents = part_data['text']
            trees = part_data['parses']
            heads = part_data['heads']
            gold = part_data['mentions']

            data_stat['words'] += sum(len(sent) for sent in sents)
            data_stat['docs'] += 1
            data_stat['sents'] += len(sents)

            for m in part_data['pred_mentions']:
                stat['den_pre'] += 1
                mtype = coreference.mention_type(m, sents, trees, heads)
                type_stat[mtype]['den_pre'] += 1
                if m in gold:
                    # A matched prediction is a hit for both precision and
                    # recall.
                    type_stat[mtype]['num_pre'] += 1
                    type_stat[mtype]['num_rec'] += 1
                    stat['num_pre'] += 1
                    stat['num_rec'] += 1

            for g in gold:
                mtype = coreference.mention_type(g, sents, trees, heads)
                type_stat[mtype]['den_rec'] += 1
                stat['den_rec'] += 1

    r, p, f1 = error_analyzer.calc_rpf1(stat)
    logger.info("Data statistics:\n "
                "files = %d, docs = %d, sentences = %d, words = %d" % (
                    data_stat['files'], data_stat['docs'],
                    data_stat['sents'], data_stat['words']))
    if eval_by_types:
        print_rpf1_by_types(type_stat, stat)
    logger.info("Performance of mention detection:\n "
                "P = %2.2lf%% (%d/%d), R = %2.2lf%% (%d/%d), F1 = %2.2lf%%" % (
                    (p * 100), stat['num_pre'], stat['den_pre'],
                    (r * 100), stat['num_rec'], stat['den_rec'],
                    (f1 * 100)))
    return True
def _mention_attributes(ment, sents, trees, heads, sner, speakers):
    """Assemble the attribute dict for a single mention.

    The steps run in a fixed order because later helpers receive the
    partially filled dict (presumably reading fields set earlier -- the
    helpers are defined elsewhere).
    """
    info = {}
    mtype = coref.mention_type(ment, sents, trees, heads)
    info["type"] = my_constant.MAP_MTYPES[mtype]
    info["surface"] = coref.mention_text(ment, sents).lower()
    set_head(info, ment, sents, trees, heads)
    set_first_word(info, ment, sents, trees, heads)
    set_ner(info, ment, sner)
    info["relaxed_surface"] = remove_phrase_after_head(
        info, ment, sents, trees, heads)
    info["word_list"] = extract_word_list(info)
    info["modifiers"] = extract_modifiers(info, ment, sents, trees, heads)
    extract_properties(info, ment, sents)
    set_speaker(info, ment, speakers)
    info["pleonastic"] = is_pleonastic(info, ment, sents)
    return info


def init(doc_ments, sents, trees, heads, sner, speakers):
    """Return a dict mapping every mention in ``doc_ments`` to its attributes.

    ``doc_ments`` is iterated as a sequence of per-sentence mention
    collections; each mention is used as a dict key, so it must be hashable.
    """
    result = {}
    for mentions_in_sentence in doc_ments:
        for mention in mentions_in_sentence:
            result[mention] = _mention_attributes(
                mention, sents, trees, heads, sner, speakers)
    return result
def init(doc_ments, sents, trees, heads, sner, speakers):
    """Compute an attribute dictionary for each mention of a document.

    Args:
        doc_ments: iterable of per-sentence mention collections.
        sents, trees, heads: document data forwarded to the attribute helpers.
        sner: forwarded to ``set_ner``.
        speakers: forwarded to ``set_speaker``.

    Returns:
        dict keyed by mention; each value is the attribute dict built below.
    """
    attrs_by_mention = {}
    # Flatten the per-sentence grouping; order of mentions is unchanged.
    all_mentions = (m for per_sentence in doc_ments for m in per_sentence)
    for mention in all_mentions:
        features = {}
        raw_type = coref.mention_type(mention, sents, trees, heads)
        features['type'] = my_constant.MAP_MTYPES[raw_type]
        surface_text = coref.mention_text(mention, sents)
        features['surface'] = surface_text.lower()
        # The helpers below take the partially built dict, so keep this order.
        set_head(features, mention, sents, trees, heads)
        set_first_word(features, mention, sents, trees, heads)
        set_ner(features, mention, sner)
        features['relaxed_surface'] = remove_phrase_after_head(
            features, mention, sents, trees, heads)
        features['word_list'] = extract_word_list(features)
        features['modifiers'] = extract_modifiers(
            features, mention, sents, trees, heads)
        extract_properties(features, mention, sents)
        set_speaker(features, mention, speakers)
        features['pleonastic'] = is_pleonastic(features, mention, sents)
        attrs_by_mention[mention] = features
    return attrs_by_mention