def get_cluster_info(cluster, gold_doc):
    """Gather the NER, number, person and gender values seen in a cluster.

    For every mention, its lower-cased text is handed to
    coreference.pronoun_properties_text, and each value that comes back
    other than 'unknown' is recorded.  NER labels are read from
    gold_doc['ner'] for mentions that have an entry there.

    Args:
        cluster: iterable of mentions.
        gold_doc: mapping that provides 'text' and 'ner' entries.

    Returns:
        A (ner, number, person, gender) tuple of sets.
    """
    doc_text = gold_doc['text']
    ner_labels = gold_doc['ner']
    ner, number, person, gender = set(), set(), set(), set()

    for mention in cluster:
        lowered = coreference_rendering.mention_text(doc_text, mention).lower()
        tgender, tnumber, tperson = coreference.pronoun_properties_text(lowered)
        # Pair each reported value with the set that accumulates it.
        for value, bucket in ((tgender, gender),
                              (tnumber, number),
                              (tperson, person)):
            if value != 'unknown':
                bucket.add(value)
        if mention in ner_labels:
            ner.add(ner_labels[mention])

    return ner, number, person, gender
def print_pre_change_info(out, auto, gold, auto_mentions, gold_mention_set, text, parses, heads, gold_clusters, gold_mentions, gold_doc, auto_clusters):
    """Write one "Cataphoric properties" line per pronoun mention.

    Each pronoun found in the gold or auto clusters gets a three-slot list:
      [0] whether it precedes the mention returned by min_non_pronoun for
          its gold cluster,
      [1] the same test against its auto cluster,
      [2] for pronouns where both [0] and [1] are true, whether
          min_non_pronoun gives the same result for the gold and the auto
          cluster; None otherwise.
    A slot stays None when the pronoun was not seen on that side.

    Output goes to out['out'].  gold_mention_set and gold_doc are accepted
    but not used here.
    """
    # Cataphora
    flags_by_mention = defaultdict(lambda: [None, None, None])

    # Slot 0 is filled from the gold clusters first, then slot 1 from the
    # system (auto) clusters.
    for slot, clusters in ((0, gold), (1, auto)):
        for cluster in clusters:
            first_non_pronoun = min_non_pronoun(cluster, text, parses, heads)
            for mention in cluster:
                kind = coreference.mention_type(mention, text, parses, heads)
                if kind != 'pronoun':
                    continue
                flags_by_mention[mention][slot] = bool(
                    first_non_pronoun is not None
                    and mention < first_non_pronoun)

    cataphoric_in_both = [
        mention for mention in flags_by_mention
        if flags_by_mention[mention][0] and flags_by_mention[mention][1]
    ]

    for mention in cataphoric_in_both:
        auto_cluster = auto_clusters[auto_mentions[mention]]
        gold_cluster = gold_clusters[gold_mentions[mention]]
        auto_first = min_non_pronoun(auto_cluster, text, parses, heads)
        gold_first = min_non_pronoun(gold_cluster, text, parses, heads)
        flags_by_mention[mention][2] = bool(auto_first == gold_first)

    for mention in flags_by_mention:
        mtext = coreference_rendering.mention_text(text, mention).lower()
        print >> out['out'], "Cataphoric properties", flags_by_mention[mention], mtext
def mention_error_properties(mention, cluster, text, parses, heads, gold_doc):
    """Describe a mention relative to the other mentions of its cluster.

    Returns a list with, in order:
      * the mention type from coreference.mention_type,
      * the lower-cased mention text with whitespace replaced by '_',
      * 'text_match' / 'no_text_match': another mention has the same text,
      * 'head_match' / 'no_head_match': another mention has the same head,
      * 'not_nested' / 'nested_outside' / 'nested_inside' / 'nested_both',
      * whether the mention equals min(cluster),
      * whether the mention equals max(cluster),
      * whether it precedes the result of min_non_pronoun for the cluster,
      * for each of ner, number, person, gender: '<name>_unknown',
        '<name>_matches' or '<name>_does_not_match'.
    """
    others = cluster.difference({mention})
    props = []

    # Type of mention
    props.append(coreference.mention_type(mention, text, parses, heads))

    # Text of mention
    own_text = coreference_rendering.mention_text(text, mention).lower()
    props.append('_'.join(own_text.split()))

    # String match with anything else in the cluster?
    same_text = any(
        coreference_rendering.mention_text(text, other).lower() == own_text
        for other in others)
    props.append('text_match' if same_text else 'no_text_match')

    # Head match with anything else in the cluster?
    own_head = coreference.mention_head(mention, text, parses, heads)[1].lower()
    same_head = any(
        own_head == coreference.mention_head(other, text, parses, heads)[1].lower()
        for other in others)
    props.append('head_match' if same_head else 'no_head_match')

    # Nesting relative to other mentions whose first field is the same.
    # Both boundary comparisons are strict.
    encloses_other = False
    enclosed_by_other = False
    for other in others:
        if other[0] == mention[0]:
            if mention[1] < other[1] and other[2] < mention[2]:
                encloses_other = True
            if other[1] < mention[1] and mention[2] < other[2]:
                enclosed_by_other = True
            if encloses_other and enclosed_by_other:
                # Nothing further can change the outcome.
                break
    nesting_labels = {
        (False, False): 'not_nested',
        (True, False): 'nested_outside',
        (False, True): 'nested_inside',
        (True, True): 'nested_both',
    }
    props.append(nesting_labels[(encloses_other, enclosed_by_other)])

    # First / last in the cluster?
    props.append(mention == min(cluster))
    props.append(mention == max(cluster))

    # Cataphora?
    first_non_pronoun = min_non_pronoun(cluster, text, parses, heads)
    props.append(first_non_pronoun is not None and mention < first_non_pronoun)

    # Agreement of NER, number, person and gender with the rest of the cluster
    rest_info = get_cluster_info(others, gold_doc)
    own_info = get_cluster_info({mention}, gold_doc)
    labels = ('ner', 'number', 'person', 'gender')
    for label, own_values, rest_values in zip(labels, own_info, rest_info):
        if not own_values or not rest_values:
            suffix = '_unknown'
        elif own_values.intersection(rest_values):
            suffix = '_matches'
        else:
            suffix = '_does_not_match'
        props.append(label + suffix)

    return props
def cluster_error_properties(cluster, text, parses, heads, gold_doc):
    """Summarise a cluster as a flat list of properties.

    Returns a list with, in order:
      * the cluster size,
      * the number of 'name', 'nominal' and 'pronoun' mentions (three ints),
      * the lower-cased pronoun text when the cluster holds exactly one
        pronoun and exactly one name-or-nominal, otherwise None,
      * the number of pronouns that compare less than the value returned by
        min_non_pronoun(cluster, text, parses, heads, True),
      * the sorted list of distinct NER labels found in gold_doc['ner'],
      * whether all mentions share one lower-cased text,
      * whether all mentions share one lower-cased head.
    """
    props = [len(cluster)]

    # Counts of each type in the cluster; other types are not counted.
    tally = {'name': 0, 'nominal': 0, 'pronoun': 0}
    for mention in cluster:
        kind = coreference.mention_type(mention, text, parses, heads)
        if kind in tally:
            tally[kind] += 1
    names, nominals, pronouns = tally['name'], tally['nominal'], tally['pronoun']
    props += [names, nominals, pronouns]

    # One pronoun plus one other mention: record which pronoun it is.
    pronoun_text = None
    if names + nominals == 1 and pronouns == 1:
        pronoun = None
        for mention in cluster:
            if coreference.mention_type(mention, text, parses, heads) == 'pronoun':
                pronoun = mention
        pronoun_text = coreference_rendering.mention_text(text, pronoun).lower()
    props.append(pronoun_text)

    # Number of cataphoric pronouns
    first_non_pronoun = min_non_pronoun(cluster, text, parses, heads, True)
    props.append(sum(
        1 for mention in cluster
        if mention < first_non_pronoun
        and coreference.mention_type(mention, text, parses, heads) == 'pronoun'))

    # NER types
    props.append(sorted({
        gold_doc['ner'][mention]
        for mention in cluster
        if mention in gold_doc['ner']
    }))

    # Are all the mention texts the same?
    distinct_texts = {
        coreference_rendering.mention_text(text, mention).lower()
        for mention in cluster
    }
    props.append(len(distinct_texts) == 1)

    # Are all the heads the same?
    distinct_heads = {
        coreference.mention_head(mention, text, parses, heads)[1].lower()
        for mention in cluster
    }
    props.append(len(distinct_heads) == 1)

    return props
def split_merge_properties(part, cluster, auto, gold, text, parses, heads, gold_mentions, gold_clusters, auto_mentions, gold_doc):
    """Describe a split/merge of `part` relative to the rest of `cluster`.

    `rest` is cluster minus part.  The returned list holds, by position:
       0  size of part
       1  size of rest
       2  '_'-joined lower-cased text when part has one mention, else None
       3  "<n>_cataphoric": pronouns in part (and in auto_mentions) that
          precede the min_non_pronoun result for the cluster's mentions
          found in auto_mentions, counted only when that result is not
          itself in part
       4-6   name / nominal / pronoun counts in part
       7-9   name / nominal / pronoun counts in rest
      10  True when no mention of part is in gold_mentions
      11  True when no mention of rest is in gold_mentions
      12  'string_match' / 'no_string_match' between non-pronouns of part
          and rest
      13  'head_match' / 'no_head_match' between non-pronouns of part and rest
      14  'delete', 'merge' or 'nothing' (judged against the gold side)
      15  'introduce', 'split' or 'nothing' (judged against the auto side)
      16+ for each of NER, number, person, gender: whether the value sets
          of rest and part are equal, then 'part_<values>' and
          'cluster_<values>' with the values sorted and '_'-joined.

    `gold` is accepted but not used here.  A mention type other than
    'pronoun', 'name' or 'nominal' raises KeyError, and an empty part
    raises StopIteration.
    """
    def is_pronoun(m):
        return coreference.mention_type(m, text, parses, heads) == 'pronoun'

    def lowered_text(m):
        return coreference_rendering.mention_text(text, m).lower()

    def lowered_head(m):
        return coreference.mention_head(m, text, parses, heads)[1].lower()

    def tally_types(group):
        # [name, nominal, pronoun] counts for the mentions in group.
        tally = {'pronoun': 0, 'name': 0, 'nominal': 0}
        for m in group:
            tally[coreference.mention_type(m, text, parses, heads)] += 1
        return [tally['name'], tally['nominal'], tally['pronoun']]

    def non_pronoun_match(describe):
        # True as soon as a non-pronoun in part and a non-pronoun in rest
        # share the same description.
        for p in part:
            if is_pronoun(p):
                continue
            for r in rest:
                if is_pronoun(r):
                    continue
                if describe(p) == describe(r):
                    return True
        return False

    rest = cluster.difference(part)
    props = [len(part), len(rest)]

    # If part has a single mention, record its text
    sole_text = None
    if len(part) == 1:
        sole_text = '_'.join(lowered_text(next(iter(part))).split())
    props.append(sole_text)

    # Cataphoric pronouns in part
    in_auto = {m for m in cluster if m in auto_mentions}
    first_non_pronoun = min_non_pronoun(in_auto, text, parses, heads)
    cataphoric = 0
    if first_non_pronoun is not None and first_non_pronoun not in part:
        for m in part:
            if m not in auto_mentions or not (m < first_non_pronoun):
                continue
            if is_pronoun(m):
                cataphoric += 1
    props.append("%d_cataphoric" % cataphoric)

    # Names, nominals and pronouns in part, then in rest
    props.extend(tally_types(part))
    props.extend(tally_types(rest))

    # Is part entirely extra?  Is rest entirely extra?
    props.append(not any(m in gold_mentions for m in part))
    props.append(not any(m in gold_mentions for m in rest))

    # Exact string / head match between part and rest (pronouns excluded)
    props.append('string_match' if non_pronoun_match(lowered_text)
                 else 'no_string_match')
    props.append('head_match' if non_pronoun_match(lowered_head)
                 else 'no_head_match')

    # What has happened, or will happen
    example = next(iter(part))
    if example not in gold_mentions:
        gold_action = 'delete'
    elif part != set(gold_clusters[gold_mentions[example]]):
        gold_action = 'merge'
    else:
        gold_action = 'nothing'
    props.append(gold_action)

    auto_action = 'nothing'
    if example not in auto_mentions:
        auto_action = 'introduce'
    else:
        for candidate in auto:
            if example in candidate:
                if candidate != part:
                    auto_action = 'split'
                break
    props.append(auto_action)

    # NER, number, person, gender
    rest_info = get_cluster_info(rest, gold_doc)
    part_info = get_cluster_info(part, gold_doc)
    for rest_values, part_values in zip(rest_info, part_info):
        props.append(rest_values == part_values)
        props.append('part_' + '_'.join(sorted(part_values)))
        props.append('cluster_' + '_'.join(sorted(rest_values)))

    return props