def min_non_pronoun(cluster, text, parses, heads, check_head=False):
    """Return the smallest mention in ``cluster`` that is not a pronoun.

    Mentions are compared with the ordinary comparison operators, so the
    result is the minimum of the mentions that survive the filters below.

    Args:
        cluster: iterable of mentions.
        text, parses, heads: passed through unchanged to
            ``coreference.mention_type`` and ``coreference.mention_head``.
        check_head: when true, also skip mentions whose head span is itself
            classified as a pronoun.

    Returns:
        The smallest surviving mention, or None when ``cluster`` is empty or
        every mention was filtered out.
    """
    ans = None
    for mention in cluster:
        if coreference.mention_type(mention, text, parses, heads) == 'pronoun':
            continue
        if check_head:
            head = coreference.mention_head(mention, text, parses, heads,
                                            default_last=True)
            # Other callers of mention_head in this file treat None as a
            # possible result.  Without a head there is no evidence that the
            # mention is pronoun-headed, so keep it as a candidate rather
            # than failing on ``None[0]``.
            if head is not None:
                # head[0] is indexed as a (start, end) pair; together with
                # the mention's first field it forms a mention-shaped tuple.
                head_span = (mention[0], head[0][0], head[0][1])
                head_type = coreference.mention_type(head_span, text, parses,
                                                     heads)
                if head_type == 'pronoun':
                    continue
        if ans is None or ans > mention:
            ans = mention
    return ans
def cluster_error_properties(cluster, text, parses, heads, gold_doc):
    """Summarise a cluster as a flat list of properties.

    Args:
        cluster: collection of mentions (must support ``len`` and repeated
            iteration).
        text, parses, heads: passed through to the ``coreference`` and
            ``coreference_rendering`` helpers.
        gold_doc: mapping whose ``'ner'`` entry maps mentions to NER labels.

    Returns:
        A list laid out as follows:
            0    number of mentions in the cluster
            1-3  counts of 'name', 'nominal' and 'pronoun' mentions
            4    lower-cased text of the pronoun when the cluster holds
                 exactly one pronoun and exactly one name/nominal, else None
            5    number of pronouns that sort before the first non-pronoun
                 mention (cataphoric pronouns)
            6    sorted list of the distinct NER labels of the mentions
            7    True when all mentions share the same lower-cased text
            8    True when all mentions share the same lower-cased head word
    """
    # Classify every mention once; the type is needed in several places.
    typed = [(mention, coreference.mention_type(mention, text, parses, heads))
             for mention in cluster]

    # How big is the cluster
    ans = [len(cluster)]

    # Counts of each type in the cluster (other types are ignored)
    slot = {'name': 0, 'nominal': 1, 'pronoun': 2}
    counts = [0, 0, 0]
    for _, mtype in typed:
        if mtype in slot:
            counts[slot[mtype]] += 1
    ans += counts

    # If it is one pronoun and something else, more info on the pronoun
    if counts[0] + counts[1] == 1 and counts[2] == 1:
        pronoun = None
        for mention, mtype in typed:
            if mtype == 'pronoun':
                pronoun = mention
        ans.append(coreference_rendering.mention_text(text, pronoun).lower())
    else:
        ans.append(None)

    # Number of cataphoric pronouns
    cataphora = 0
    non_pronoun = min_non_pronoun(cluster, text, parses, heads, True)
    # A cluster made only of pronouns has no anchor mention.  Guard against
    # None explicitly (as the sibling functions do) instead of relying on
    # Python 2's "None sorts first" comparison, which raises on Python 3.
    if non_pronoun is not None:
        for mention, mtype in typed:
            if mention < non_pronoun and mtype == 'pronoun':
                cataphora += 1
    ans.append(cataphora)

    # NER types
    ner_table = gold_doc['ner']
    ans.append(sorted(set(ner_table[mention] for mention in cluster
                          if mention in ner_table)))

    # Are all the mentions the same?
    texts = set(coreference_rendering.mention_text(text, mention).lower()
                for mention in cluster)
    ans.append(len(texts) == 1)

    # Are all the heads the same?
    head_words = set(
        coreference.mention_head(mention, text, parses, heads)[1].lower()
        for mention in cluster)
    ans.append(len(head_words) == 1)

    return ans
def mention_error_properties(mention, cluster, text, parses, heads, gold_doc):
    """Describe one mention relative to the rest of its cluster.

    Args:
        mention: the mention to describe; indexed as
            ``(sentence, start, end)``.
        cluster: set of mentions that contains ``mention``.
        text, parses, heads: passed through to the ``coreference`` and
            ``coreference_rendering`` helpers.
        gold_doc: passed through to ``get_cluster_info``.

    Returns:
        A list laid out as follows:
            0     mention type as reported by ``coreference.mention_type``
            1     lower-cased mention text with whitespace replaced by '_'
            2     'text_match' / 'no_text_match' against the other mentions
            3     'head_match' / 'no_head_match' against the other mentions
            4     'not_nested', 'nested_outside' (strictly contains another
                  mention of the cluster), 'nested_inside' (strictly
                  contained in one) or 'nested_both'
            5     True when it is the smallest mention of the cluster
            6     True when it is the largest mention of the cluster
            7     True when it sorts before the first non-pronoun mention
            8-11  '<prop>_unknown', '<prop>_matches' or
                  '<prop>_does_not_match' for ner, number, person, gender
    """
    ans = []
    rest = cluster.difference({mention})

    # Type of mention
    ans.append(coreference.mention_type(mention, text, parses, heads))

    # Text of mention
    mtext = coreference_rendering.mention_text(text, mention).lower()
    ans.append('_'.join(mtext.split()))

    # Does it have a string match with something in the cluster?
    text_match = any(
        coreference_rendering.mention_text(text, other).lower() == mtext
        for other in rest)
    ans.append('text_match' if text_match else 'no_text_match')

    # Does it have a head match with something in the cluster?
    mhead = coreference.mention_head(mention, text, parses, heads)[1].lower()
    head_match = any(
        coreference.mention_head(other, text, parses, heads)[1].lower() == mhead
        for other in rest)
    ans.append('head_match' if head_match else 'no_head_match')

    # Is it nested within another mention in the cluster.  Both comparisons
    # are strict, so mentions sharing a boundary do not count as nested.
    nested = 'not_nested'
    for other in rest:
        if other[0] != mention[0]:
            continue
        if mention[1] < other[1] and other[2] < mention[2]:
            if nested == 'nested_inside':
                nested = 'nested_both'
                break
            nested = 'nested_outside'
        if other[1] < mention[1] and mention[2] < other[2]:
            if nested == 'nested_outside':
                nested = 'nested_both'
                break
            nested = 'nested_inside'
    ans.append(nested)

    # Was it first in the cluster?
    ans.append(mention == min(cluster))

    # Was it last in the cluster?
    ans.append(mention == max(cluster))

    # Is it a case of cataphora?
    non_pronoun = min_non_pronoun(cluster, text, parses, heads)
    ans.append(non_pronoun is not None and mention < non_pronoun)

    # Do NER, number, person, or gender of mention and cluster match?
    cluster_properties = get_cluster_info(rest, gold_doc)
    mention_properties = get_cluster_info({mention}, gold_doc)
    # enumerate() instead of the Python-2-only xrange(4).
    for i, word in enumerate(('ner', 'number', 'person', 'gender')):
        mine = mention_properties[i]
        theirs = cluster_properties[i]
        if len(mine) == 0 or len(theirs) == 0:
            ans.append(word + '_unknown')
        elif len(mine.intersection(theirs)) > 0:
            ans.append(word + '_matches')
        else:
            ans.append(word + '_does_not_match')

    return ans
def match_boundaries(gold_mention_set, auto_mention_set, auto_mentions,
                     auto_clusters, text, parses, heads):
    """Snap system mention boundaries onto gold boundaries where they nearly agree.

    Two passes are made over the mentions that appear on only one side:

    1. Mentions that become identical once a leading "the", a trailing "'s"
       and single-character non-letter tokens are stripped from both ends.
    2. Mentions in the same sentence whose head (from
       ``coreference.mention_head``) is the same, provided exactly one
       system and one gold mention share that head.

    ``auto_mention_set``, ``auto_mentions`` and ``auto_clusters`` are updated
    in place; ``gold_mention_set`` is only read.

    Args:
        gold_mention_set: set of gold ``(sentence, start, end)`` mentions.
        auto_mention_set: set of system mentions (mutated).
        auto_mentions: dict from system mention to cluster id (mutated).
        auto_clusters: mapping from cluster id to a list of mentions
            (the lists are mutated).
        text: ``text[sentence]`` is the list of tokens of that sentence.
        parses: ``parses[sentence]`` must offer ``get_spanning_nodes`` and
            ``get_nodes``; also passed to ``coreference.mention_head``.
        heads: passed to ``coreference.mention_head``.

    Returns:
        A list with one tuple per changed mention:
        ``(system mention, gold mention, "in the parse" | "not in the parse",
        pre_extra_text, pre_missing_text, post_extra_text, post_missing_text,
        pre_extra_nodes, pre_missing_nodes, post_extra_nodes,
        post_missing_nodes)``.  Text/node entries are None when that side of
        the span did not change.
    """
    changed = set()

    def is_symbol(token):
        # ascii_letters rather than the Python-2-only, locale-dependent
        # string.letters.
        return len(token) == 1 and token[0] not in string.ascii_letters

    def trim(sentence, start, end):
        # Strip ignorable edge tokens, always keeping at least one token.
        tokens = text[sentence]
        while end - start > 1 and (tokens[start] == "the"
                                   or is_symbol(tokens[start])):
            start += 1
        while end - start > 1 and (tokens[end - 1] == "'s"
                                   or is_symbol(tokens[end - 1])):
            end -= 1
        return start, end

    def apply_mapping(mapping):
        # Replace each system mention by its gold counterpart everywhere.
        for old, new in mapping.items():
            auto_mention_set.remove(old)
            auto_mention_set.add(new)
            cluster_id = auto_mentions.pop(old)
            auto_mentions[new] = cluster_id
            auto_clusters[cluster_id].remove(old)
            auto_clusters[cluster_id].append(new)
            changed.add((old, new))

    def span_note(sentence, start, end):
        # Lower-cased words and parse node labels covering a token span.
        words = ' '.join(text[sentence][start:end]).lower()
        nodes = parses[sentence].get_spanning_nodes(start, end)
        return words, ' '.join([node.label for node in nodes])

    # Apply changes for cases where the difference is only leading or
    # trailing punctuation
    unique_to_gold = gold_mention_set.difference(auto_mention_set)
    unique_to_auto = auto_mention_set.difference(gold_mention_set)

    # Trim every gold mention once and index it by its trimmed span.
    gold_by_span = defaultdict(list)
    for gmention in unique_to_gold:
        gsentence, gstart, gend = gmention
        gstart, gend = trim(gsentence, gstart, gend)
        gold_by_span[(gsentence, gstart, gend)].append(gmention)

    mapping = {}
    for amention in unique_to_auto:
        sentence, astart, aend = amention
        astart, aend = trim(sentence, astart, aend)
        candidates = gold_by_span.get((sentence, astart, aend))
        if candidates:
            # Consume exactly one gold mention per system mention, so the
            # remaining candidates stay available for other system mentions.
            mapping[amention] = candidates.pop()
    apply_mapping(mapping)

    # Create a mapping based on heads
    head_dict = defaultdict(lambda: {'auto': [], 'gold': []})
    for mention in auto_mention_set.difference(gold_mention_set):
        head = coreference.mention_head(mention, text, parses, heads,
                                        default_last=True)
        # This will default to last word if the mention is not a
        # constituent, is there an alternative?
        if head is not None:
            head_dict[(mention[0], head[0])]['auto'].append(mention)
    for mention in gold_mention_set.difference(auto_mention_set):
        head = coreference.mention_head(mention, text, parses, heads,
                                        default_last=True)
        if head is not None:
            head_dict[(mention[0], head[0])]['gold'].append(mention)

    mapping = {}
    for sides in head_dict.values():
        # Only unambiguous one-to-one pairings are accepted.
        if len(sides['auto']) == 1 and len(sides['gold']) == 1:
            mapping[sides['auto'][0]] = sides['gold'][0]
    apply_mapping(mapping)

    # Add notes
    nchanges = []
    for smention, gmention in changed:
        # Both passes only pair mentions of the same sentence.
        sentence = smention[0]
        pre_extra = pre_missing = post_extra = post_missing = (None, None)
        if smention[1] < gmention[1]:
            pre_extra = span_note(sentence, smention[1], gmention[1])
        elif smention[1] > gmention[1]:
            pre_missing = span_note(sentence, gmention[1], smention[1])
        if smention[2] < gmention[2]:
            post_missing = span_note(sentence, smention[2], gmention[2])
        elif smention[2] > gmention[2]:
            post_extra = span_note(sentence, gmention[2], smention[2])

        snode = parses[sentence].get_nodes('lowest', smention[1], smention[2])
        in_parse = "in the parse" if snode is not None else "not in the parse"
        nchanges.append((
            smention, gmention, in_parse,
            pre_extra[0], pre_missing[0], post_extra[0], post_missing[0],
            pre_extra[1], pre_missing[1], post_extra[1], post_missing[1],
        ))

    return nchanges
def split_merge_properties(part, cluster, auto, gold, text, parses, heads,
                           gold_mentions, gold_clusters, auto_mentions,
                           gold_doc):
    """Describe a sub-part of a cluster relative to the rest of that cluster.

    Args:
        part: non-empty set of mentions being split off / merged in.
        cluster: set of mentions containing ``part``.
        auto: iterable of system clusters.
        gold: unused by this function.
        text, parses, heads: passed through to the ``coreference`` and
            ``coreference_rendering`` helpers.
        gold_mentions: mapping from gold mention to gold cluster id.
        gold_clusters: mapping from gold cluster id to its mentions.
        auto_mentions: container of system mentions (membership tests only).
        gold_doc: passed through to ``get_cluster_info``.

    Returns:
        A list laid out as follows:
            0      size of ``part``
            1      size of the rest of the cluster
            2      '_'-joined lower-cased text when ``part`` is a single
                   mention, else None
            3      '<n>_cataphoric': system pronouns of ``part`` that sort
                   before the first non-pronoun system mention of the
                   cluster (0 when that mention belongs to ``part``)
            4-6    name / nominal / pronoun counts in ``part``
            7-9    name / nominal / pronoun counts in the rest
            10     True when no mention of ``part`` is a gold mention
            11     True when no mention of the rest is a gold mention
            12     'string_match' / 'no_string_match' between non-pronouns
                   of ``part`` and of the rest
            13     'head_match' / 'no_head_match', likewise
            14     'delete', 'merge' or 'nothing' (relative to gold)
            15     'introduce', 'split' or 'nothing' (relative to system)
            16-27  for each of the four ``get_cluster_info`` properties:
                   equality flag, 'part_<values>', 'cluster_<values>'

    Raises:
        StopIteration: if ``part`` is empty.
        KeyError: if a mention type other than 'pronoun', 'name' or
            'nominal' is encountered.
    """
    ans = []
    rest = cluster.difference(part)

    # Classify every mention once; the types are reused below.
    part_types = dict(
        (mention, coreference.mention_type(mention, text, parses, heads))
        for mention in part)
    rest_types = dict(
        (mention, coreference.mention_type(mention, text, parses, heads))
        for mention in rest)

    # Size of part
    ans.append(len(part))  # 0

    # Size of rest
    ans.append(len(rest))  # 1

    # If size 1, what the text is
    mtext = None
    if len(part) == 1:
        # next(iter(...)) instead of the Python-2-only iter(...).next()
        mention = next(iter(part))
        mtext = '_'.join(
            coreference_rendering.mention_text(text, mention).lower().split())
    ans.append(mtext)  # 2

    # Does this part have any cataphoric pronouns
    count = 0
    auto_part_of_cluster = set(
        mention for mention in cluster if mention in auto_mentions)
    non_pronoun = min_non_pronoun(auto_part_of_cluster, text, parses, heads)
    if non_pronoun is not None and non_pronoun not in part:
        for mention in part:
            if (mention in auto_mentions and mention < non_pronoun
                    and part_types[mention] == 'pronoun'):
                count += 1
    ans.append("%d_cataphoric" % count)  # 3

    # Number of pronouns, nominals, names present in it
    type_counts = {'pronoun': 0, 'name': 0, 'nominal': 0}
    for mention in part:
        type_counts[part_types[mention]] += 1
    ans.append(type_counts['name'])  # 4
    ans.append(type_counts['nominal'])  # 5
    ans.append(type_counts['pronoun'])  # 6

    # Number of pronouns, nominals, names, in rest
    type_counts = {'pronoun': 0, 'name': 0, 'nominal': 0}
    for mention in rest:
        type_counts[rest_types[mention]] += 1
    ans.append(type_counts['name'])  # 7
    ans.append(type_counts['nominal'])  # 8
    ans.append(type_counts['pronoun'])  # 9

    # Whether this is extra
    ans.append(not any(mention in gold_mentions for mention in part))  # 10

    # Whether the rest is all extra
    ans.append(not any(mention in gold_mentions for mention in rest))  # 11

    part_non_pronouns = [m for m in part if part_types[m] != 'pronoun']
    rest_non_pronouns = [m for m in rest if rest_types[m] != 'pronoun']

    # Whether there is an exact string match between a mention in the part
    # and cluster (excluding pronouns)
    part_texts = set(coreference_rendering.mention_text(text, m).lower()
                     for m in part_non_pronouns)
    rest_texts = set(coreference_rendering.mention_text(text, m).lower()
                     for m in rest_non_pronouns)
    if part_texts.isdisjoint(rest_texts):
        ans.append('no_string_match')  # 12
    else:
        ans.append('string_match')  # 12

    # Whether there is a head match between a mention in the part and
    # cluster (excluding pronouns)
    part_heads = set(
        coreference.mention_head(m, text, parses, heads)[1].lower()
        for m in part_non_pronouns)
    rest_heads = set(
        coreference.mention_head(m, text, parses, heads)[1].lower()
        for m in rest_non_pronouns)
    if part_heads.isdisjoint(rest_heads):
        ans.append('no_head_match')  # 13
    else:
        ans.append('head_match')  # 13

    # What has happened, or will happen
    example = next(iter(part))
    action = 'nothing'
    if example not in gold_mentions:
        action = 'delete'
    elif part != set(gold_clusters[gold_mentions[example]]):
        action = 'merge'
    ans.append(action)  # 14

    action = 'nothing'
    if example not in auto_mentions:
        action = 'introduce'
    else:
        for system_cluster in auto:
            if example in system_cluster:
                # NOTE(review): unlike the gold comparison above, the system
                # cluster is not converted with set(); if ``auto`` holds
                # lists or tuples this is always unequal - confirm the
                # element type against the caller.
                if system_cluster != part:
                    action = 'split'
                break
    ans.append(action)  # 15

    # NER, number, person, gender
    cproperties = get_cluster_info(rest, gold_doc)
    pproperties = get_cluster_info(part, gold_doc)
    for prop in range(4):
        ans.append(cproperties[prop] == pproperties[prop])
        ans.append('part_' + '_'.join(sorted(pproperties[prop])))
        ans.append('cluster_' + '_'.join(sorted(cproperties[prop])))

    return ans
def match_boundaries(gold_mention_set, auto_mention_set, auto_mentions,
                     auto_clusters, auto_cluster_set, text, parses, heads):
    """Snap system mention boundaries onto gold boundaries where they nearly agree.

    Two passes are made over the mentions that appear on only one side:

    1. Mentions that become identical once a leading "the", a trailing "'s"
       and single-character non-letter tokens are stripped from both ends.
    2. Mentions in the same sentence whose head (from
       ``coreference.mention_head``) is the same, provided exactly one
       system and one gold mention share that head.

    All system-side containers are updated in place; nothing is returned.

    Args:
        gold_mention_set: set of gold ``(sentence, start, end)`` mentions
            (only read).
        auto_mention_set: set of system mentions (mutated).
        auto_mentions: dict from system mention to cluster id (mutated).
        auto_clusters: mapping from cluster id to a list of mentions
            (the lists are mutated).
        auto_cluster_set: set of clusters stored as tuples of mentions; the
            tuple holding a changed mention is replaced by an updated tuple
            with the same order.
        text: ``text[sentence]`` is the list of tokens of that sentence.
        parses, heads: passed to ``coreference.mention_head``.
    """
    def is_symbol(token):
        return len(token) == 1 and token[0] not in string.ascii_letters

    def trim(sentence, start, end):
        # Strip ignorable edge tokens, always keeping at least one token.
        tokens = text[sentence]
        while end - start > 1 and (tokens[start] == "the"
                                   or is_symbol(tokens[start])):
            start += 1
        while end - start > 1 and (tokens[end - 1] == "'s"
                                   or is_symbol(tokens[end - 1])):
            end -= 1
        return start, end

    def apply_mapping(mapping):
        # Replace each system mention by its gold counterpart everywhere.
        for old, new in mapping.items():
            auto_mention_set.remove(old)
            auto_mention_set.add(new)
            cluster_id = auto_mentions.pop(old)
            auto_mentions[new] = cluster_id
            auto_clusters[cluster_id].remove(old)
            auto_clusters[cluster_id].append(new)

            stale = None
            for cluster in auto_cluster_set:
                if old in cluster:
                    stale = cluster
                    break
            if stale is None:
                # No tuple holds this mention: nothing to rewrite (the
                # previous code failed here with KeyError(None)).
                continue
            auto_cluster_set.remove(stale)
            auto_cluster_set.add(
                tuple(new if member == old else member for member in stale))

    # Apply changes for cases where the difference is only leading or
    # trailing punctuation
    unique_to_gold = gold_mention_set.difference(auto_mention_set)
    unique_to_auto = auto_mention_set.difference(gold_mention_set)

    # Trim every gold mention once and index it by its trimmed span.
    gold_by_span = defaultdict(list)
    for gmention in unique_to_gold:
        gsentence, gstart, gend = gmention
        gstart, gend = trim(gsentence, gstart, gend)
        gold_by_span[(gsentence, gstart, gend)].append(gmention)

    mapping = {}
    for amention in unique_to_auto:
        sentence, astart, aend = amention
        astart, aend = trim(sentence, astart, aend)
        candidates = gold_by_span.get((sentence, astart, aend))
        if candidates:
            # Consume exactly one gold mention per system mention, so the
            # remaining candidates stay available for other system mentions.
            mapping[amention] = candidates.pop()
    apply_mapping(mapping)

    # Create a mapping based on heads
    head_dict = defaultdict(lambda: {'auto': [], 'gold': []})
    for mention in auto_mention_set.difference(gold_mention_set):
        head = coreference.mention_head(mention, text, parses, heads,
                                        default_last=True)
        # This will default to last word if the mention is not a
        # constituent, is there an alternative?
        if head is not None:
            head_dict[(mention[0], head[0])]['auto'].append(mention)
    for mention in gold_mention_set.difference(auto_mention_set):
        head = coreference.mention_head(mention, text, parses, heads,
                                        default_last=True)
        if head is not None:
            head_dict[(mention[0], head[0])]['gold'].append(mention)

    mapping = {}
    for sides in head_dict.values():
        # Only unambiguous one-to-one pairings are accepted.
        if len(sides['auto']) == 1 and len(sides['gold']) == 1:
            mapping[sides['auto'][0]] = sides['gold'][0]
    apply_mapping(mapping)