def sort_mentions_for_pronoun(ment, ant_list, ment_attr, sents, trees, heads):
    # Order candidate antecedents for a pronoun: antecedents contained in
    # closer enclosing clauses (S nodes above the pronoun's head) come first.
    head_span, head_word, head_pos = \
        coref.mention_head(ment, sents, trees, heads)
    node = trees[ment[0]].get_nodes('lowest', head_span[0], head_span[1])
    if node is None:
        return ant_list
    tree = node
    nodes_to_keep = {}
    idx = 0
    while tree is not None:
        if tree.label.startswith('S') or tree.parent is None:
            for a in ant_list:
                if (a not in nodes_to_keep and
                        tree.span[0] <= a[1] and a[2] <= tree.span[1]):
                    nodes_to_keep[a] = idx
                    idx += 1
        tree = tree.parent
    sorted_ant_list = [x[0] for x in
                       sorted(nodes_to_keep.items(), key=lambda x: x[1])]
    if len(sorted_ant_list) != len(ant_list):
        logger.error("Inconsistent size 'ant_list': %d != %d"
                     % (len(sorted_ant_list), len(ant_list)))
        sys.exit(1)
    return sorted_ant_list

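# The functions in this file lean on a parse-tree node object exposing
# `label`, `span`, `parent`, pre-order iteration and a
# get_nodes('lowest', start, end) lookup. The real class lives elsewhere in
# the repository; the sketch below only illustrates the interface these
# helpers appear to assume and is not the project's implementation.
class _ParseNodeSketch(object):
    def __init__(self, label, span, children=None):
        self.label = label        # constituent label, e.g. 'NP', 'S'
        self.span = span          # (start_token, end_token) in the sentence
        self.parent = None
        self.children = children or []
        for child in self.children:
            child.parent = self

    def __iter__(self):
        # Pre-order traversal, matching the `for node in trees[i]` loops.
        yield self
        for child in self.children:
            for node in child:
                yield node

    def get_nodes(self, which, start, end):
        # Return the lowest node whose span is exactly (start, end), or None;
        # mirrors the get_nodes('lowest', ...) call above.
        match = None
        for node in self:
            if node.span == (start, end):
                match = node      # later (deeper) matches win in pre-order
        return match
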
def check_ordered_mentions(self):
    # Check all mentions are in tree-traversal order
    data = self.data
    for doc in data:
        for part in data[doc]:
            sents = data[doc][part]['text']
            trees = data[doc][part]['parses']
            heads = data[doc][part]['heads']
            for i in xrange(len(sents)):
                sent_ments = data[doc][part]['doc_mentions'][i]
                idx = 0
                tree_order = {}
                for node in trees[i]:
                    tree_order[node.span] = idx
                    idx += 1
                prev_idx = -1
                for ment in sent_ments:
                    print ment[0] == i
                    span = (ment[1], ment[2])
                    idx = -1
                    if span in tree_order:
                        idx = tree_order[span]
                    else:
                        head_span, head_word, head_pos = \
                            coreference.mention_head(
                                ment, sents, trees, heads)
                        idx = tree_order[head_span]
                    print idx >= prev_idx
                    prev_idx = idx

def extract_named_entitiy_mentions(clusters_by_head, sent_idx, sents, trees,
                                   heads, sner):
    # Register every named-entity span from the tagger output as a candidate
    # mention, keyed by its head span.
    for ment in sner[sent_idx].keys():
        head_span, head_word, head_pos = \
            coref.mention_head(ment, sents, trees, heads)
        clusters_by_head[head_span][ment] = True

def remove_phrase_after_head(attr, ment, sents, trees, heads):
    comma_idx = -1
    wh_idx = -1
    sent_idx, start_idx, end_idx = ment
    head_idx = attr["head_idx"]
    surface = attr["surface"]
    if head_idx + 2 > end_idx:
        # Unlikely to have a phrase after the head
        return surface
    tmp = (sent_idx, head_idx + 1, head_idx + 2)
    span, word, pos = coref.mention_head(tmp, sents, trees, heads)
    if comma_idx == -1 and pos == ",":
        comma_idx = head_idx + 1
    if wh_idx == -1 and pos.startswith("W"):
        wh_idx = head_idx + 1
    ret = surface
    if comma_idx != -1 and head_idx < comma_idx:
        ret = " ".join(sents[sent_idx][start_idx:comma_idx])
    if comma_idx == -1 and wh_idx != -1 and head_idx < wh_idx:
        ret = " ".join(sents[sent_idx][start_idx:wh_idx])
    return ret.lower()

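# Hedged worked example for remove_phrase_after_head (the tokens are
# hypothetical, not taken from the repository): for a mention
# "Barack Obama , the president" whose attr["head_idx"] points at "Obama",
# the token right after the head is a comma, so the surface is cut at the
# comma and "barack obama" is returned. If neither a comma nor a WH-word
# follows the head, the full surface string is returned lowercased.
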
def remove_conll_spurious_mentions(ments, sents, trees, heads, sner, params):
    ments_to_remove = []
    for ment in ments.keys():
        if is_generic(ment, sents, trees, heads):
            ments_to_remove.append(ment)
        surface = coref.mention_text(ment, sents).lower()
        if (ment in sner[ment[0]] and sner[ment[0]][ment] == 'GPE'
                and surface in dictionaries.gpe_acronyms):
            ments_to_remove.append(ment)
        if surface in dictionaries.stop_words:
            ments_to_remove.append(ment)
        if start_with_stop_prefixes(surface):
            ments_to_remove.append(ment)
        if end_with_stop_suffixes(surface):
            ments_to_remove.append(ment)
        head_span, head_word, head_pos = \
            coref.mention_head(ment, sents, trees, heads)
        tmp = (ment[0], head_span[0], head_span[1])
        if (tmp in sner[ment[0]]
                and sner[ment[0]][tmp] in {'PERCENT', 'MONEY'}):
            ments_to_remove.append(ment)
    for r in ments_to_remove:
        if r in ments:
            ments.pop(r)

def is_generic(ment, sents, trees, heads):
    surface_len = ment[2] - ment[1]
    head_span, head_word, head_pos = \
        coref.mention_head(ment, sents, trees, heads)
    first_span, first_word, first_pos = \
        coref.mention_head((ment[0], ment[1], ment[1] + 1),
                           sents, trees, heads)
    if (head_pos == 'NN' and head_word not in dictionaries.temporals
            and (surface_len == 1 or first_pos in {'JJ', 'RB'})):
        return True
    if first_word in dictionaries.quantifiers:
        return True
    return False

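# Hedged illustration of is_generic (example strings are hypothetical): a
# single-token mention with a bare singular common-noun head such as "oil"
# (POS 'NN', not in dictionaries.temporals) is flagged as generic, as is a
# mention whose first word appears in dictionaries.quantifiers (e.g. "some
# analysts", assuming "some" is listed there). Mentions with plural or
# proper-noun heads fall through to the quantifier check and otherwise
# return False.
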
def keep_subpatterns(nodes_to_keep, head_span, ments, largest, sents, trees,
                     heads):
    for ment in ments:
        if largest[1] == ment[1] and ment[2] == largest[2]:
            continue
        if largest[1] == ment[1] and ment[2] < largest[2]:
            tmp = (ment[0], ment[2], ment[2] + 1)
            span, word, pos = coref.mention_head(tmp, sents, trees, heads)
            if pos == ',' or pos == 'CC':
                nodes_to_keep[ment] = True
        if largest[1] < ment[1] and ment[2] == largest[2]:
            tmp = (ment[0], ment[1] - 1, ment[1])
            span, word, pos = coref.mention_head(tmp, sents, trees, heads)
            if pos == 'CC':
                nodes_to_keep[ment] = True

def extract_parse_mentions(clusters_by_head, tree_order, i, sents, trees,
                           heads, sner):
    idx = 0
    for node in trees[i]:
        if node.label in my_constant.PARSE_TYPES_TO_KEEP:
            ment = (i, node.span[0], node.span[1])
            if not inside_named_entities(ment, sner[i]):
                head_span, head_word, head_pos = \
                    coref.mention_head(ment, sents, trees, heads)
                clusters_by_head[head_span][ment] = True
        # Record the pre-order index of every node so mentions can later be
        # sorted in tree-traversal order.
        tree_order[node.span] = idx
        idx += 1

def extract_modifiers(attr, ment, sents, trees, heads):
    ret = set()
    for i in xrange(ment[1], ment[2]):
        span, word, pos = \
            coref.mention_head((ment[0], i, i + 1), sents, trees, heads)
        word = word.lower()
        if (not (pos.startswith('N') or pos.startswith('V')
                 or pos.startswith('JJ') or pos == 'CD')
                or word == attr['head_word']):
            continue
        ret.add(word)
    return ret

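# Hedged usage note for extract_modifiers (hypothetical example): for a
# mention spanning "the big red car" with attr['head_word'] == "car", only
# tokens tagged as nouns, verbs, adjectives or cardinal numbers other than
# the head itself are kept, so the returned set would be {'big', 'red'};
# the determiner "the" is skipped because 'DT' matches none of the POS
# prefixes checked above.
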
def sort_by_tree_traversal_order(nodes_to_keep, tree_order, sents, trees,
                                 heads):
    for node in nodes_to_keep.keys():
        span = (node[1], node[2])
        if span in tree_order:
            nodes_to_keep[node] = tree_order[span]
        else:
            head_span, head_word, head_pos = \
                coref.mention_head(node, sents, trees, heads)
            if head_span in tree_order:
                nodes_to_keep[node] = tree_order[head_span]
            else:
                logger.error("Cannot find node '(%d, %d, %d)' in tree"
                             % (node[0], node[1], node[2]))
                sys.exit(1)
    return [x[0] for x in sorted(nodes_to_keep.items(), key=lambda x: x[1])]

def set_first_word(attr, ment, sents, trees, heads):
    ment_ = (ment[0], ment[1], ment[1] + 1)
    first_span, first_word, first_pos = coref.mention_head(ment_, sents,
                                                           trees, heads)
    attr["first_word"] = first_word.lower()
    attr["first_pos"] = first_pos

def match_boundaries(gold_mention_set, auto_mention_set, auto_mentions,
                     auto_clusters, auto_cluster_set, text, parses, heads):
    # Apply changes for cases where the difference is only leading or
    # trailing punctuation
    mapping = {}
    used_gold = set()
    unique_to_gold = gold_mention_set.difference(auto_mention_set)
    unique_to_auto = auto_mention_set.difference(gold_mention_set)
    for amention in unique_to_auto:
        sentence, astart, aend = amention
        while (aend - astart > 1
               and (text[sentence][astart] == "the"
                    or (len(text[sentence][astart]) == 1
                        and text[sentence][astart][0] not in string.letters))):
            astart += 1
        while (aend - astart > 1
               and (text[sentence][aend - 1] == "'s"
                    or (len(text[sentence][aend - 1]) == 1
                        and text[sentence][aend - 1][0] not in string.letters))):
            aend -= 1
        for gmention in unique_to_gold:
            gsentence, gstart, gend = gmention
            if sentence != gsentence or gmention in used_gold:
                continue
            while (gend - gstart > 1
                   and (text[sentence][gstart] == "the"
                        or (len(text[sentence][gstart]) == 1
                            and text[sentence][gstart][0] not in string.letters))):
                gstart += 1
            while (gend - gstart > 1
                   and (text[sentence][gend - 1] == "'s"
                        or (len(text[sentence][gend - 1]) == 1
                            and text[sentence][gend - 1][0] not in string.letters))):
                gend -= 1
            if astart == gstart and aend == gend:
                mapping[amention] = gmention
                used_gold.add(gmention)

    # Apply mapping to create new auto_mention_set
    for mention in mapping:
        auto_mention_set.remove(mention)
        auto_mention_set.add(mapping[mention])
        cluster_id = auto_mentions.pop(mention)
        auto_mentions[mapping[mention]] = cluster_id
        auto_clusters[cluster_id].remove(mention)
        auto_clusters[cluster_id].append(mapping[mention])
        to_remove = None
        for cluster in auto_cluster_set:
            if mention in cluster:
                to_remove = cluster
        auto_cluster_set.remove(to_remove)
        ncluster = []
        for mention2 in to_remove:
            if mention2 == mention:
                mention2 = mapping[mention]
            ncluster.append(mention2)
        ncluster = tuple(ncluster)
        auto_cluster_set.add(ncluster)

    # Create a mapping based on heads
    head_dict = defaultdict(lambda: {'auto': [], 'gold': []})
    for mention in auto_mention_set.difference(gold_mention_set):
        sentence, start, end = mention
        head = coreference.mention_head(mention, text, parses, heads,
                                        default_last=True)
        # This will default to the last word if the mention is not a
        # constituent; is there an alternative?
        if head is not None:
            head = (mention[0], head[0])
            head_dict[head]['auto'].append(mention)
    for mention in gold_mention_set.difference(auto_mention_set):
        sentence, start, end = mention
        head = coreference.mention_head(mention, text, parses, heads,
                                        default_last=True)
        if head is not None:
            head = (mention[0], head[0])
            head_dict[head]['gold'].append(mention)

    mapping = {}
    for head in head_dict:
        amentions = head_dict[head]['auto']
        gmentions = head_dict[head]['gold']
        if len(amentions) == 1 and len(gmentions) == 1:
            mapping[amentions[0]] = gmentions[0]

    # Apply mapping to create new auto_mention_set
    for mention in mapping:
        auto_mention_set.remove(mention)
        auto_mention_set.add(mapping[mention])
        cluster_id = auto_mentions.pop(mention)
        auto_mentions[mapping[mention]] = cluster_id
        auto_clusters[cluster_id].remove(mention)
        auto_clusters[cluster_id].append(mapping[mention])
        to_remove = None
        for cluster in auto_cluster_set:
            if mention in cluster:
                to_remove = cluster
        auto_cluster_set.remove(to_remove)
        ncluster = []
        for mention2 in to_remove:
            if mention2 == mention:
                mention2 = mapping[mention]
            ncluster.append(mention2)
        ncluster = tuple(ncluster)
        auto_cluster_set.add(ncluster)

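# The four while-loops above repeat one trimming rule: drop a leading "the"
# or a leading one-character non-letter token, and drop a trailing "'s" or a
# trailing one-character non-letter token, while keeping at least one token.
# The helper below is a self-contained sketch of that rule in isolation; it
# is not part of the repository (match_boundaries inlines the logic rather
# than calling a helper).
import string


def _strip_boundary_tokens_sketch(tokens, start, end):
    # Trim leading "the" / punctuation-like tokens.
    while (end - start > 1
           and (tokens[start] == "the"
                or (len(tokens[start]) == 1
                    and tokens[start][0] not in string.letters))):
        start += 1
    # Trim trailing "'s" / punctuation-like tokens.
    while (end - start > 1
           and (tokens[end - 1] == "'s"
                or (len(tokens[end - 1]) == 1
                    and tokens[end - 1][0] not in string.letters))):
        end -= 1
    return start, end


# Example with hypothetical tokens: both spans reduce to "White House", so
# match_boundaries would map the auto mention onto the gold one.
#   _strip_boundary_tokens_sketch(["the", "White", "House", "'s"], 0, 4)
#   -> (1, 3)
#   _strip_boundary_tokens_sketch(["White", "House"], 0, 2) -> (0, 2)
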
def set_head(attr, ment, sents, trees, heads):
    head_span, head_word, head_pos = \
        coref.mention_head(ment, sents, trees, heads)
    attr['head_idx'] = head_span[0]
    attr['head_word'] = head_word.lower()
    attr['head_pos'] = head_pos

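# Hedged usage note: set_head, set_first_word and extract_modifiers all share
# one per-mention attribute dict, and extract_modifiers reads
# attr['head_word'], so a caller (the actual driver is not shown here) would
# presumably run them in roughly this order:
#
#     attr = {'surface': coref.mention_text(ment, sents)}
#     set_head(attr, ment, sents, trees, heads)   # adds head_idx/head_word/head_pos
#     set_first_word(attr, ment, sents, trees, heads)
#     attr['modifiers'] = extract_modifiers(attr, ment, sents, trees, heads)
#
# The 'modifiers' key is an illustrative assumption; the functions above only
# set head_idx, head_word, head_pos, first_word and first_pos themselves
# ('surface' is read by remove_phrase_after_head).
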