def __get_span_to_id(column):
    """Map mention spans to coreference set ids for one CoNLL coref column.

    Args:
        column (list(str)): The coreference column of a CoNLL document, one
            entry per token (e.g. "(2", "-", "2)", "(3)", "(1|2)").

    Returns:
        dict(Span, int): A mapping of mention spans to their set ids.
    """
    span_to_id = {}

    # for each set id, the stack of positions where a mention opened
    open_positions = defaultdict(list)

    for position, entry in enumerate(column):
        if entry == "-":
            continue

        for annotation in entry.split("|"):
            opens = annotation.startswith("(")
            closes = annotation.endswith(")")

            if opens and closes:
                # single-token mention, e.g. "(3)"
                span_to_id[spans.Span(position, position)] = \
                    int(annotation[1:-1])
            elif opens:
                # mention opens here, e.g. "(3" -- remember the position
                open_positions[annotation[1:]].append(position)
            elif closes:
                # mention closes here, e.g. "3)" -- pop the matching open
                set_id = annotation[:-1]
                begin = open_positions[set_id].pop()
                span_to_id[spans.Span(begin, position)] = int(set_id)

    return span_to_id
def __extract_sentence_spans(in_sentence_ids):
    """Compute document-level sentence spans from in-sentence token ids.

    A new sentence starts wherever the in-sentence id fails to increase
    compared to the previous token (ids restart at 0 for each sentence).

    Args:
        in_sentence_ids (list(int)): For each token, its index within its
            sentence.

    Returns:
        list(Span): One span per sentence, covering the sentence's tokens.
    """
    extracted = []
    begin = 0
    previous = None

    for position, current in enumerate(in_sentence_ids):
        if previous is not None and current <= previous:
            # id did not increase -> previous token ended a sentence
            extracted.append(spans.Span(begin, position - 1))
            begin = position
        previous = current

    # close the final sentence
    extracted.append(spans.Span(begin, len(in_sentence_ids) - 1))

    return extracted
def get_relevant_subtree(span, document):
    """ Get the fragment of the parse tree and the input span.

    Args:
        span (Span): A span in a document.
        document (CoNLLDocument): A document.

    Returns:
        nltk.ParentedTree: The fragment of the parse tree at the span in the
        document.
    """
    # translate the document-level span into in-sentence token indices
    in_sentence_ids = document.in_sentence_ids[span.begin:span.end + 1]
    in_sentence_span = spans.Span(in_sentence_ids[0], in_sentence_ids[-1])

    sentence_id, sentence_span = document.get_sentence_id_and_span(span)

    sentence_tree = document.parse[sentence_id]

    spanning_leaves = sentence_tree.treeposition_spanning_leaves(
        in_sentence_span.begin, in_sentence_span.end + 1)
    mention_subtree = sentence_tree[spanning_leaves]

    # for a single-token span the tree position addresses the bare leaf
    # string; climb two levels (leaf -> preterminal -> parent) to obtain
    # an actual tree node
    if mention_subtree in sentence_tree.leaves():
        mention_subtree = sentence_tree[spanning_leaves[:-2]]

    return mention_subtree
def __init__(self, identifier, sentences, coref):
    """ Construct a document from sentence and coreference information.

    Args:
        identifier (str): A unique identifier for the document.
        sentences (list): A list of sentence information. The ith item
            contains information about the ith sentence. We assume that
            each ``sentences[i]`` is a 6-tuple
            ``tokens, pos, ner, speakers, parse, dep``, where

            * tokens (list(str)): All tokens in the sentence.
            * pos (list(str)): All part-of-speech tags in the sentence.
            * ner (list(str)): All named entity tags in the sentence (if a
              token does not have a tag, the tag is set to NONE).
            * speakers (list(str)): All speaker ids in the sentence.
            * parse (str): A string representation of the sentence's parse
              tree (should be readable by nltk)
            * dep (list(StanfordDependencies.CoNLL.Token): All dependencies
              in the sentence represented as lists of tokens with label
              information and pointers to heads.
        coref (dict(span, int)): A mapping of mention spans to their
            coreference set id.
    """
    self.identifier = identifier

    # token-level attributes are stored flat (document-level), one entry
    # per token; parse and dep are stored per sentence
    self.in_sentence_ids = []
    self.sentence_spans = []
    self.tokens = []
    self.pos = []
    self.ner = []
    self.parse = []
    self.dep = []
    self.speakers = []

    self.coref = coref

    for sentence in sentences:
        tokens, pos, ner, speakers, parse, dep = sentence

        # document-level offset of this sentence's first token
        offset = len(self.tokens)

        self.in_sentence_ids += list(range(0, len(tokens)))
        self.sentence_spans.append(spans.Span(
            offset, offset + len(tokens) - 1
        ))

        self.tokens += tokens
        self.pos += pos
        self.ner += ner
        self.parse.append(nltk.ParentedTree.fromstring(parse))
        self.dep.append(dep)
        self.speakers += speakers

    # gold mentions derived from ``coref``; system mentions are filled in
    # later by mention extraction
    self.annotated_mentions = self.__get_annotated_mentions()
    self.system_mentions = []
def test_adjust_head_for_nam(self):
    """Check head-adjustment heuristics for named-entity mentions.

    adjust_head_for_nam is called with (tokens, pos, ner_type) and returns
    the in-mention span of the adjusted head plus the head tokens.
    """
    # GPE: head stops before the preposition "in"
    self.assertEqual(
        (spans.Span(0, 1), ["Khan", "Younes"]),
        head_finders.HeadFinder.adjust_head_for_nam(
            ["Khan", "Younes", "in", "the", "southern", "Ghaza", "Strip"],
            ["NNP", "NNS", "IN", "DT", "JJ", "NNP", "NNP"],
            "GPE"))

    # PERSON: full proper name is retained as head
    self.assertEqual(
        (spans.Span(0, 1), ["Walter", "Sisulu"]),
        head_finders.HeadFinder.adjust_head_for_nam(
            ["Walter", "Sisulu"],
            ["NNP", "NNP"],
            "PERSON"))

    # PERSON: leading adjective is skipped, title nouns are kept
    self.assertEqual(
        (spans.Span(1, 5), ['vice', 'president', 'Robert', 'W.', 'Reedy']),
        head_finders.HeadFinder.adjust_head_for_nam(
            ['former', 'vice', 'president', 'Robert', 'W.', 'Reedy'],
            ["JJ", "NN", "NN", "NNP", "NNP", "NNP"],
            "PERSON"))

    # PERSON: head stops at the comma before the appositive
    self.assertEqual(
        (spans.Span(0, 1), ['Michael', 'Wolf']),
        head_finders.HeadFinder.adjust_head_for_nam(
            ['Michael', 'Wolf', ',', 'a', 'contributing', 'editor'],
            ["NNP", "NNP", ",", "DT", "VBG", "NN"],
            "PERSON"))

    # NONE: fallback rule also accepts proper-name heads
    self.assertEqual(
        (spans.Span(0, 1), ['Mr.', 'Clinton']),
        head_finders.HeadFinder.adjust_head_for_nam(
            ['Mr.', 'Clinton'],
            ["NNP", "NNP"],
            "NONE"))

    # GPE: trailing possessive marker is dropped from the head
    self.assertEqual(
        (spans.Span(0, 0), ['Taiwan']),
        head_finders.HeadFinder.adjust_head_for_nam(
            ['Taiwan', "'s"],
            ["NNP", "POS"],
            "GPE"))

    # ORG: in-mention possessive is kept when more tokens follow
    self.assertEqual(
        (spans.Span(0, 2), ["Jim", "'s", "Tools"]),
        head_finders.HeadFinder.adjust_head_for_nam(
            ['Jim', "'s", "Tools"],
            ["NNP", "POS", "NNP"],
            "ORG"))

    # NONE: whole possessive name phrase is kept as head
    self.assertEqual(
        (spans.Span(0, 3), ["Taiwan", "'s", "False", "Cypresses"]),
        head_finders.HeadFinder.adjust_head_for_nam(
            ["Taiwan", "'s", "False", "Cypresses"],
            ["NNP", "POS", "JJ", "NNP"],
            "NONE"))
def __extract_system_mention_spans(document):
    """Extract system mention spans from every sentence of a document.

    Runs per-sentence mention-span extraction on each sentence's parse
    tree and NER tags, then shifts the resulting in-sentence spans to
    document-level offsets.

    Args:
        document (CoNLLDocument): The document to extract spans from.

    Returns:
        list(Span): All extracted mention spans, sorted.
    """
    collected = []

    for sentence_index, sent_span in enumerate(document.sentence_spans):
        tree = document.parse[sentence_index]
        sentence_ner = document.ner[sent_span.begin:sent_span.end + 1]

        for in_sentence in __extract_mention_spans_for_sentence(
                tree, sentence_ner):
            # shift the in-sentence span by the sentence's document offset
            collected.append(
                spans.Span(sent_span.begin + in_sentence.begin,
                           sent_span.begin + in_sentence.end))

    return sorted(collected)
def __get_in_tree_span(parented_tree):
    """Compute the leaf-index span covered by a parse subtree.

    Args:
        parented_tree (nltk.ParentedTree): A subtree of a sentence parse
            tree.

    Returns:
        Span: The span of leaf indices covered by the subtree, relative to
        the leaves of the whole tree.
    """
    start = 0

    current_tree = parented_tree

    # walk up to the root; at each level add the leaf counts of all
    # siblings to the left of the current node
    while current_tree.parent() is not None:
        for child in current_tree.parent():
            if child == current_tree:
                break
            else:
                start += len(child.leaves())

        current_tree = current_tree.parent()

    end = start + len(parented_tree.leaves()) - 1

    return spans.Span(start, end)
def get_modifier(mention):
    """Collect the lowercased modifier tokens of a mention.

    Every token outside the mention's head span counts as a modifier,
    except determiners, possessive markers (POS) and prepositions (IN).

    Args:
        mention (Mention): A mention with "tokens", "pos" and "head_span"
            attributes.

    Returns:
        set(str): The lowercased modifier tokens.
    """
    # head span relative to the mention's own token indices
    offset = mention.span.begin
    head_begin = mention.attributes["head_span"].begin - offset
    head_end = mention.attributes["head_span"].end - offset

    determiners = ["the", "this", "that", "those", "these", "a", "an"]

    return {
        token.lower()
        for position, (token, pos) in enumerate(
            zip(mention.attributes["tokens"], mention.attributes["pos"]))
        if token.lower() not in determiners
        and pos not in ["POS", "IN"]
        and not head_begin <= position <= head_end
    }
def __get_span_from_ner(pos, ner):
    """Extract token spans of named entities from NER annotations.

    Consecutive tokens carrying the same (non-NONE) NER tag form one span;
    a possessive marker directly following a span is included in it.

    Args:
        pos (list(str)): Part-of-speech tags, one per token.
        ner (list(str)): NER tags, one per token ("NONE" for no entity).

    Returns:
        list(Span): All named-entity spans, sorted.
    """
    i = 0
    spans_from_ner = []
    while i < len(ner):
        current_tag = ner[i]
        if current_tag != "NONE":
            start = i

            # extend the span while the identical tag continues
            while i + 1 < len(ner) and ner[i + 1] != "NONE" and ner[i] == ner[
                    i + 1]:
                i += 1

            # swallow a directly following possessive marker ("'s")
            if i + 1 < len(pos) and pos[i + 1] == "POS":
                i += 1

            spans_from_ner.append(spans.Span(start, i))

        i += 1

    return sorted(spans_from_ner)
def __generate_html_for_errors(self, document, mentions):
    """Render a document as HTML, marking gold and system mentions.

    Builds an ordered list of sentences in which every mention is wrapped
    in a <span> carrying its chain id, type, head and span as data
    attributes. Also fills ``self.navi["gold"]`` / ``self.navi["system"]``
    with navigation lists of the encountered entity chains, and assigns a
    colour from ``self.colours_panel`` to each new chain.

    Args:
        document (CoNLLDocument): The document to render.
        mentions (list(Mention)): Mentions sorted by span; each is matched
            against ``document.annotated_mentions`` to decide whether it is
            rendered as a gold or a system mention.

    Returns:
        str: The HTML for the document body.
    """
    document_html = "\n\t\t\t<ol class=\"text\">\n" \
                    "\t\t\t\t<li class=\"sentence\">"
    self.navi["gold"] = "\n\t\t\t\t<div class=\"goldNavi\">" \
                        "<h3>Reference Entities</h3>" \
                        "<span class=\"tease\">show all</span>" \
                        "\n\t\t\t\t\t<ul>"
    self.navi["system"] = "\n\t\t\t\t<div class=\"systemNavi\">" \
                          "<h3>System Entities</h3>" \
                          "<span class=\"tease\">show all</span>" \
                          "\n\t\t\t\t\t<ul>"
    chains = set()
    index = 0
    sentence_id, sentence_span = document.get_sentence_id_and_span(
        spans.Span(0, 0))
    annotated_mentions = set(document.annotated_mentions)

    for token in document.tokens:
        token = html_escape(token, True)
        mention_id = 0
        mention_text = ""
        processed_gold_mentions = set()

        # find all mentions covering the current token position
        for mention in mentions:
            if mention.span.begin > index:
                # mentions are sorted: nothing further can cover ``index``
                break
            if mention.span.end < index:
                mention_id += 1
                continue

            mention_tokens = html_escape(
                " ".join(mention.attributes['tokens']), True)
            mention_head = html_escape(
                " ".join(mention.attributes['head']), True)
            # NOTE(review): "type" appears to be a string, so "".join is
            # presumably an identity operation -- confirm
            mention_type = html_escape("".join(mention.attributes['type']),
                                       True)
            mention_span = str(mention.span)

            # a mention matching a gold mention is shown as "gold" once;
            # further occurrences are shown as "system"
            if mention in annotated_mentions and \
                    mention not in processed_gold_mentions:
                system = "gold"
                processed_gold_mentions.add(mention)
            else:
                system = "system"

            chain_id = system + str(mention.attributes['annotated_set_id'])

            if chain_id not in chains:
                self.navi[system] += "\n\t\t\t\t\t\t<li class=\"" + \
                                     chain_id +\
                                     "\">" + mention_tokens + "</li>"
                chains.add(chain_id)

                if chain_id not in self.chain_to_colour.keys():
                    # cycle through the fixed colour panel (the random
                    # colour generation below was disabled)
                    while True:
                        # r = lambda: randint(170, 255)
                        # colour = '#%02X%02X%02X' % (r(), r(), r())
                        colour = self.colours_panel[self.colour_id]
                        self.colour_id += 1
                        if self.colour_id == len(self.colours_panel):
                            self.colour_id = 0
                        # if colour not in self.colours:
                        #     self.colours.append(colour)
                        #     break
                        break
                    self.chain_to_colour[chain_id] = colour

            span_id = document.get_html_friendly_identifier() + "_" + \
                str(mention_id)

            temp_text = "<span " \
                        "id=\"" + span_id + "\" " \
                        "class=\"" + chain_id + " mention\" " \
                        "data-mentiontype=\"" + mention_type + "\" " \
                        "data-mentionhead=\"" + mention_head + "\" " \
                        "data-span=\"" + mention_span + "\">"

            # wrap the token depending on where the mention begins/ends
            if mention.span.begin == index and mention.span.end == index:
                if mention_text.endswith("</span> "):
                    mention_text = temp_text + mention_text.strip() + \
                        "</span> "
                elif mention_text == "":
                    mention_text = temp_text + token + "</span> "
            elif mention.span.begin == index:
                if mention_text == "":
                    mention_text = temp_text + token + " "
                else:
                    mention_text = temp_text + mention_text
            elif mention.span.end == index:
                if mention_text == "":
                    mention_text = token + "</span> "
                else:
                    mention_text = mention_text.strip() + "</span> "

            mention_id += 1

        if mention_text == "":
            mention_text = token + " "

        # start a new <li> whenever the sentence changes
        token_span = spans.Span(index, index)
        if document.get_sentence_id_and_span(token_span) is None or \
                sentence_span != document.get_sentence_id_and_span(
                    token_span)[1]:
            mention_text = "</li>\n" \
                           "\t\t\t\t<li class=\"sentence\">" + mention_text
            sentence_id, sentence_span = document.get_sentence_id_and_span(
                token_span)

        document_html += mention_text
        index += 1

    # NOTE(review): no-op -- str.strip() returns a new string and the
    # result is discarded here
    document_html.strip()

    return document_html + "</li>\n\t\t\t</ol>"
def from_document(span, document, first_in_gold_entity=False):
    """ Create a mention from a span in a document.

    All attributes of the mention are computed from the linguistic
    information found in the document. For information about the
    attributes, see the class documentation.

    Args:
        span (Span): The span of the mention in the document.
        document (CoNLLDocument): The document the mention belongs to.
        first_in_gold_entity (bool): Whether this mention is the first
            mention of its gold entity. Defaults to False.

    Returns:
        Mention: A mention extracted from the input span in the input
        document.
    """
    i, sentence_span = document.get_sentence_id_and_span(span)

    # surface-level attributes taken directly from the document
    attributes = {
        "tokens": document.tokens[span.begin:span.end + 1],
        "pos": document.pos[span.begin:span.end + 1],
        "ner": document.ner[span.begin:span.end + 1],
        "sentence_id": i,
        "parse_tree": mention_property_computer.get_relevant_subtree(
            span, document),
        "speaker": document.speakers[span.begin],
        "antecedent": None,
        "set_id": None,
        "first_in_gold_entity": first_in_gold_entity
    }

    if span in document.coref:
        attributes["annotated_set_id"] = document.coref[span]
    else:
        attributes["annotated_set_id"] = None

    attributes["is_apposition"] = \
        mention_property_computer.is_apposition(attributes)

    attributes["grammatical_function"] = \
        mention_property_computer.get_grammatical_function(attributes)

    (head, in_mention_span, head_index) = \
        mention_property_computer.compute_head_information(attributes)

    attributes["head"] = head
    # shift the in-mention head span to document-level offsets
    attributes["head_span"] = spans.Span(
        span.begin + in_mention_span.begin,
        span.begin + in_mention_span.end
    )
    attributes["head_index"] = head_index

    attributes["type"] = mention_property_computer.get_type(attributes)
    attributes["fine_type"] = mention_property_computer.get_fine_type(
        attributes)

    # citation form (e.g. "he" for "him") only exists for pronouns
    if attributes["type"] == "PRO":
        attributes["citation_form"] = \
            mention_property_computer.get_citation_form(
                attributes)

    attributes["number"] = \
        mention_property_computer.compute_number(attributes)
    attributes["gender"] = \
        mention_property_computer.compute_gender(attributes)

    attributes["semantic_class"] = \
        mention_property_computer.compute_semantic_class(attributes)

    attributes["head_as_lowercase_string"] = " ".join(attributes[
        "head"]).lower()

    attributes["tokens_as_lowercase_string"] = " ".join(attributes[
        "tokens"]).lower()

    # dependency information: find the head token's position within its
    # sentence and look up its governor in the dependency tree
    dep_tree = document.dep[i]

    index = span.begin + head_index - sentence_span.begin

    governor_id = dep_tree[index].head - 1

    if governor_id == -1:
        # head of the dependency tree (root) has no governor
        attributes["governor"] = "NONE"
    else:
        attributes["governor"] = dep_tree[governor_id].form.lower()

    attributes["ancestry"] = Mention._get_ancestry(dep_tree, index)

    attributes["deprel"] = dep_tree[index].deprel

    return Mention(document, span, attributes)
def __generate_html_for_raw(self, document, mentions):
    """Render a raw (system-output-only) document as HTML.

    Like the error-based rendering, but all mentions are treated as
    "system" mentions, chain colours are generated randomly, and
    singleton chains (count <= 1) get neither a background colour nor a
    navigation entry.

    Args:
        document (CoNLLDocument): The document to render.
        mentions (list(Mention)): Mentions sorted by span.

    Returns:
        str: The HTML for the document body.
    """
    document_html = "\n\t\t\t<ol class=\"text\">\n" \
                    "\t\t\t\t<li class=\"sentence\">"
    self.navi["gold"] = "\n\t\t\t\t<div class=\"goldNavi\">" \
                        "<h3>Reference Entities</h3>" \
                        "<span class=\"tease\">show all</span>" \
                        "\n\t\t\t\t\t<ul>"
    self.navi["system"] = "\n\t\t\t\t<div class=\"systemNavi\">" \
                          "<h3>System Entities</h3>" \
                          "<span class=\"tease\">show all</span>" \
                          "\n\t\t\t\t\t<ul>"
    chains = set()
    index = 0
    sentence_id, sentence_span = document.get_sentence_id_and_span(
        spans.Span(0, 0))
    annotated_mentions = set(document.annotated_mentions)

    # navigation entries are buffered and only emitted at the end for
    # chains with more than one mention
    temp_navi = {"gold": {}, "system": {}}
    chain_counter = {
        "gold": collections.Counter(),
        "system": collections.Counter()
    }
    for system in ["gold", "system"]:
        for mention in annotated_mentions:
            chain_counter[system].update(
                [system + str(mention.attributes["annotated_set_id"])])

    for token in document.tokens:
        token = html_escape(token, True)
        mention_id = 0
        mention_text = ""
        processed_gold_mentions = set()

        # find all mentions covering the current token position
        for mention in mentions:
            if mention.span.begin > index:
                # mentions are sorted: nothing further can cover ``index``
                break
            if mention.span.end < index:
                mention_id += 1
                continue

            mention_tokens = html_escape(
                " ".join(mention.attributes['tokens']), True)
            mention_head = html_escape(
                " ".join(mention.attributes['head']), True)
            mention_type = html_escape("".join(mention.attributes['type']),
                                       True)
            mention_span = str(mention.span)

            # raw rendering: everything counts as system output
            system = "system"

            chain_id = system + str(mention.attributes['annotated_set_id'])

            if chain_id not in chains:
                temp_navi[system][chain_id] = "\n\t\t\t\t\t\t<li " \
                                              "class=\"" + \
                                              chain_id +\
                                              "\">" + mention_tokens + "</li>"
                chains.add(chain_id)

                if chain_id not in self.chain_to_colour.keys():
                    # draw a fresh random light colour not used before
                    while True:
                        r = lambda: randint(170, 255)
                        colour = '#%02X%02X%02X' % (r(), r(), r())
                        if colour not in self.colours:
                            self.colours.append(colour)
                            break
                    self.chain_to_colour[chain_id] = colour

            span_id = document.get_html_friendly_identifier() + "_" + \
                str(mention_id)

            # only multi-mention chains get a background colour
            style = ""
            if chain_counter[system][chain_id] > 1:
                style = "style=\"background-color:" + self.chain_to_colour[
                    chain_id] + "\" "

            temp_text = "<span " \
                        "id=\"" + span_id + "\" " \
                        "class=\"" + chain_id + " mention\" " + \
                        style + \
                        "data-mentiontype=\"" + mention_type + "\" " \
                        "data-mentionhead=\"" + mention_head + "\" " \
                        "data-span=\"" + mention_span + "\">"

            # wrap the token depending on where the mention begins/ends
            if mention.span.begin == index and mention.span.end == index:
                if mention_text.endswith("</span> "):
                    mention_text = temp_text + mention_text.strip() + \
                        "</span> "
                elif mention_text == "":
                    mention_text = temp_text + token + "</span> "
            elif mention.span.begin == index:
                if mention_text == "":
                    mention_text = temp_text + token + " "
                else:
                    mention_text = temp_text + mention_text
            elif mention.span.end == index:
                if mention_text == "":
                    mention_text = token + "</span> "
                else:
                    mention_text = mention_text.strip() + "</span> "

            mention_id += 1

        if mention_text == "":
            mention_text = token + " "

        # start a new <li> whenever the sentence changes
        token_span = spans.Span(index, index)
        if document.get_sentence_id_and_span(token_span) is None or \
                sentence_span != document.get_sentence_id_and_span(
                    token_span)[1]:
            mention_text = "</li>\n" \
                           "\t\t\t\t<li class=\"sentence\">" + mention_text
            sentence_id, sentence_span = document.get_sentence_id_and_span(
                token_span)

        document_html += mention_text
        index += 1

    # NOTE(review): no-op -- str.strip() returns a new string and the
    # result is discarded here
    document_html.strip()

    # emit buffered navigation entries for non-singleton system chains.
    # NOTE(review): chain_counter keys come from annotated mentions while
    # temp_navi is filled from ``mentions`` -- presumably these agree;
    # otherwise temp_navi[system][key] raises KeyError. TODO confirm.
    for system in ["system"]:
        for key, val in chain_counter[system].items():
            if val > 1:
                self.navi[system] += temp_navi[system][key]

    return document_html + "</li>\n\t\t\t</ol>"
def adjust_head_for_nam(tokens, pos, ner_type, in_mention_span_old_head,
                        old_head):
    """ Adjust head for proper names via heuristics.

    Based on heuristics depending on the named entity type (person,
    organization, ...) and part-of-speech tags, adjust the head of a named
    entity mention to a meaningful extent useful for coreference
    resolution.

    For example, for the mention "Khan Younes in Southern Gaza Strip",
    this function will compute "Khan Younes" as the head.

    Args:
        tokens (list(str)): The tokens of the mention.
        pos (list(str)): The part-of-speech tags of the mention.
        ner_type (str): The named entity type of the mention. Should be
            one of PERSON, ORG, GPE, FAC, NORP, PRODUCT, EVENT, MONEY,
            WORK_OF_ART, LOC, LAW, LANGUAGE, DATE, TIME, ORDINAL,
            CARDINAL, QUANTITY, PERCENT or NONE.
        in_mention_span_old_head (spans.Span): The in-mention span of the
            old head.
        old_head (list(str)): The tokens of the old head.

    Returns:
        (Span, list(str)): The in-mention span of the adjusted head and
        the tokens of the adjusted head.
    """
    # TODO: get rid of this ugly hack
    if len(pos) == 0:
        # NOTE(review): returns the plain string "NOHEAD" rather than the
        # documented list(str); kept for backward compatibility.
        return spans.Span(0, 0), "NOHEAD"

    # Fix: use raw strings for regexes -- "\." in a non-raw literal is an
    # invalid escape sequence (DeprecationWarning since Python 3.6,
    # SyntaxWarning since 3.12). Patterns are unchanged.
    stop_regex = re.compile(r"CC|,|\.|:|;|V.*|IN|W.*|ADVP|NN$")

    if re.match("ORG.*|GPE.*|FAC.*|NORP.*|PRODUCT|EVENT|MONEY|" +
                "WORK_OF_ART|LOC.*|LAW|LANGUAGE", ner_type):
        start_regex = re.compile(r"NN(S)?|NNP(S)?")
        stop_regex = re.compile(r"V.*|IN|W.*|ADVP|,|-LRB-")
    elif ner_type == "PERSON":
        start_regex = re.compile(r"NN(S)?|NNP(S)?")
        stop_regex = re.compile(r"IN|CC|,|\.|:|;|V.*|W.*|-LRB-")
    elif re.match("DATE|TIME", ner_type):
        start_regex = re.compile(r"NN(S)?|NNP(S)?|CD")
    elif re.match("ORDINAL", ner_type):
        start_regex = re.compile(r"NN|JJ|RB")
    elif re.match("CARDINAL", ner_type):
        start_regex = re.compile(r"CD")
    elif re.match("QUANTITY|PERCENT", ner_type):
        start_regex = re.compile(r"CD|JJ|NN")
    elif ner_type == "NONE":
        start_regex = re.compile(r"NN(S)?|NNP(S)?|CD")
    else:
        # unknown NER class: keep the old head unchanged
        logger.warning("No head adjustment rule defined for NER class " +
                       ner_type + ".")
        return in_mention_span_old_head, old_head

    head_start = -1
    position = 0

    # scan left to right: the head starts at the first start-tag match and
    # ends just before the first subsequent stop-tag match
    for i in range(0, len(tokens)):
        position = i
        if head_start == -1 and start_regex.match(pos[i]):
            head_start = i
        elif head_start >= 0 and stop_regex.match(pos[i]):
            return spans.Span(head_start, i - 1), tokens[head_start:i]

    if head_start == -1:
        head_start = 0

    # drop a trailing possessive marker ("'s") from the head
    if pos[position] == "POS" and position == len(pos) - 1:
        position -= 1

    return spans.Span(head_start, position), tokens[head_start:position + 1]
def test_post_process_same_head_largest_span(self):
    """Among mentions sharing a head span, only the largest span survives."""
    # Span(0, 3) and Span(0, 6) share head span (3, 3): keep Span(0, 6)
    all_mentions = {
        mentions.Mention(
            None, spans.Span(0, 3), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(3, 3)
            }),
        mentions.Mention(
            None, spans.Span(0, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(3, 3)
            }),
        mentions.Mention(
            None, spans.Span(0, 2), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(1, 1)
            }),
        mentions.Mention(
            None, spans.Span(5, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(5, 6)
            }),
        mentions.Mention(
            None, spans.Span(0, 0), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(0, 0)
            })
    }

    expected_mentions = sorted([
        mentions.Mention(
            None, spans.Span(0, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(3, 3)
            }),
        mentions.Mention(
            None, spans.Span(0, 2), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(1, 1)
            }),
        mentions.Mention(
            None, spans.Span(5, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(5, 6)
            }),
        mentions.Mention(
            None, spans.Span(0, 0), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(0, 0)
            })
    ])

    self.assertEqual(
        expected_mentions,
        mention_extractor.post_process_same_head_largest_span(
            all_mentions))

    # NAM mentions: "Taiwan 's" subsumes "Taiwan", "the CCP" subsumes "CCP"
    all_mentions_2 = {
        mentions.Mention(
            None, spans.Span(0, 1), {
                "tokens": ["Taiwan", "'s"],
                "type": "NAM",
                "head_index": 0,
                "head_span": spans.Span(0, 0)
            }),
        mentions.Mention(
            None, spans.Span(0, 0), {
                "tokens": ["Taiwan"],
                "type": "NAM",
                "head_index": 0,
                "head_span": spans.Span(0, 0)
            }),
        mentions.Mention(
            None, spans.Span(2, 3), {
                "tokens": ["the", "CCP"],
                "type": "NAM",
                "head_index": 1,
                "head_span": spans.Span(3, 3)
            }),
        mentions.Mention(
            None, spans.Span(3, 3), {
                "tokens": ["CCP"],
                "type": "NAM",
                "head_index": 0,
                "head_span": spans.Span(3, 3)
            })
    }

    expected_mentions_2 = sorted([
        mentions.Mention(
            None, spans.Span(0, 1), {
                "tokens": ["Taiwan", "'s"],
                "type": "NAM",
                "head_index": 0,
                "head_span": spans.Span(0, 0)
            }),
        mentions.Mention(
            None, spans.Span(2, 3), {
                "tokens": ["the", "CCP"],
                "type": "NAM",
                "head_index": 1,
                "head_span": spans.Span(3, 3)
            }),
    ])

    self.assertEqual(
        expected_mentions_2,
        mention_extractor.post_process_same_head_largest_span(
            all_mentions_2))
def compute_head_information(attributes):
    """ Compute the head of the mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention, must
            contain values for "tokens", "parse_tree", "pos", "ner",
            "is_apposition"

    Returns:
        (list(str), Span, int): The head, the head span (in the document)
        and the starting index of the head (in the mention).
    """
    mention_subtree = attributes["parse_tree"]

    head_finder = head_finders.HeadFinder()

    # default: first token is the head
    head_index = 0
    head = [attributes["tokens"][0]]

    # only run the head finder when the subtree exactly covers the mention
    if len(mention_subtree.leaves()) == len(attributes["tokens"]):
        head_tree = head_finder.get_head(mention_subtree)
        head_index = get_head_index(head_tree, mention_subtree.pos())
        head = [head_tree[0]]

    in_mention_span = spans.Span(head_index, head_index)

    if attributes["pos"][head_index].startswith("NNP"):
        # proper name: widen the head via NER-based heuristics
        in_mention_span, head = \
            head_finders.HeadFinder.adjust_head_for_nam(
                attributes["tokens"],
                attributes["pos"],
                attributes["ner"][head_index])

        # proper name mention: head index last word of head
        # (e.g. "Obama" in "Barack Obama")
        head_index = in_mention_span.end

    # special handling for appositions
    if attributes["is_apposition"]:
        # "Secretary of State Madeleine Albright"
        # => take "Madeleine Albright" as head
        if len(mention_subtree) == 2:
            head_tree = mention_subtree[1]
            head = head_tree.leaves()
            in_mention_span = spans.Span(len(mention_subtree[0].leaves()),
                                         len(attributes["tokens"]) - 1)
            head_index = in_mention_span.end
        else:
            # more than two children: take the first child whose head is
            # a proper noun and adjust it via the NER heuristics
            start = 0
            for child in mention_subtree:
                if __head_pos_starts_with(child, "NNP"):
                    end = min([
                        start + len(child.leaves()),
                        len(attributes["tokens"])
                    ])
                    head_index = end - 1
                    in_mention_span, head = \
                        head_finders.HeadFinder.adjust_head_for_nam(
                            attributes["tokens"][start:end],
                            attributes["pos"][start:end],
                            attributes["ner"][head_index])
                    break

                start += len(child.leaves())

    return head, in_mention_span, head_index
def setUp(self):
    """Build gold clusters (with mention types) and system outputs."""
    # gold entity 0: six mentions of mixed types
    self.gold_first_cluster = [
        mentions.Mention(None, spans.Span(0, 0), {
            "tokens": ["a"],
            "type": "NOM",
            "annotated_set_id": 0
        }),
        mentions.Mention(None, spans.Span(1, 1), {
            "tokens": ["US"],
            "type": "NAM",
            "annotated_set_id": 0
        }),
        mentions.Mention(
            None, spans.Span(2, 3), {
                "tokens": ["angry", "salesman"],
                "type": "PRO",
                "annotated_set_id": 0
            }),
        mentions.Mention(None, spans.Span(4, 5), {
            "tokens": ["the", "rainbow"],
            "type": "NAM",
            "annotated_set_id": 0
        }),
        mentions.Mention(None, spans.Span(5, 6), {
            "tokens": ["and", "far"],
            "type": "NOM",
            "annotated_set_id": 0
        }),
        mentions.Mention(None, spans.Span(7, 7), {
            "tokens": ["neypmd"],
            "type": "NOM",
            "annotated_set_id": 0
        }),
    ]

    # gold entity 1: three mentions (no token information needed)
    self.gold_second_cluster = [
        mentions.Mention(None, spans.Span(7, 8), {
            "type": "NOM",
            "annotated_set_id": 1
        }),
        mentions.Mention(None, spans.Span(9, 9), {
            "type": "NAM",
            "annotated_set_id": 1
        }),
        mentions.Mention(None, spans.Span(10, 10), {
            "type": "PRO",
            "annotated_set_id": 1
        }),
    ]

    # system output 1: three predicted sets, partially matching the gold
    self.system1_mentions = [
        mentions.Mention(None, spans.Span(0, 0), {"set_id": 0}),
        mentions.Mention(None, spans.Span(2, 3), {"set_id": 0}),
        mentions.Mention(None, spans.Span(4, 5), {"set_id": 2}),
        mentions.Mention(None, spans.Span(5, 6), {"set_id": 2}),
        mentions.Mention(None, spans.Span(3, 4), {"set_id": 1}),
        mentions.Mention(None, spans.Span(7, 8), {"set_id": 1}),
    ]

    # system output 2: one set with explicit antecedent links
    self.system2_cluster = [
        mentions.Mention(None, spans.Span(0, 0), {
            "tokens": ["a"],
            "set_id": 0
        }),
        mentions.Mention(None, spans.Span(2, 3), {
            "tokens": ["angry", "salesman"],
            "set_id": 0
        }),
        mentions.Mention(None, spans.Span(7, 8), {
            "tokens": ["snafu", "foo"],
            "set_id": 0
        }),
        mentions.Mention(None, spans.Span(9, 9), {
            "tokens": ["bar"],
            "set_id": 0
        }),
    ]

    # antecedent chain: 1 -> 0, 2 -> 0, 3 -> 2
    self.system2_cluster[1].attributes["antecedent"] = \
        self.system2_cluster[0]
    self.system2_cluster[2].attributes["antecedent"] = \
        self.system2_cluster[0]
    self.system2_cluster[3].attributes["antecedent"] = \
        self.system2_cluster[2]

    self.maxDiff = None
def run_on_doc(self, doc_file, name=None): if self.with_coref: soup = bs4.BeautifulSoup(doc_file.read()) preprocessed = self.proc.parse_doc(soup.text) else: data = doc_file.read() preprocessed = self.proc.parse_doc(data) sentences = [] for sentence in preprocessed["sentences"]: processed_ner = [] for ner in sentence["ner"]: if ner == "O" or ner == "MISC": processed_ner.append("NONE") else: processed_ner.append(ner) processed_dep = [] index_to_dep_info = {} for dep_info in sentence["deps_basic"]: label, head, in_sent_index = dep_info index_to_dep_info[in_sent_index] = label, head for i in range(0, len(sentence["tokens"])): if i in index_to_dep_info.keys(): label, head = index_to_dep_info[i] processed_dep.append( CoNLL.Token( form=sentence["tokens"][i], lemma=sentence["lemmas"][i], pos=sentence["pos"][i], index=i+1, head=head+1, deprel=label, cpos=None, feats=None, phead=None, pdeprel=None, extra=None ) ) else: processed_dep.append( CoNLL.Token( form=sentence["tokens"][i], lemma=sentence["lemmas"][i], pos=sentence["pos"][i], index=i+1, head=0, deprel="punc", cpos=None, feats=None, phead=None, pdeprel=None, extra=None ) ) sentences.append( (sentence["tokens"], sentence["pos"], processed_ner, ["-"]*len(sentence["tokens"]), sentence["parse"], processed_dep, ) ) if not name: name = doc_file.name if self.with_coref: antecedent_decisions = {} coref = {} mention_id_to_spans = {} max_entity = 0 for mention in soup.findAll("mention"): if mention.get("entity"): max_entity = max(max_entity, int(mention.get("entity"))) for mention in soup.findAll("mention"): mention_id = int(mention.get("id")) span = spans.Span(int(mention.get("span_start")), int(mention.get("span_end"))) mention_id_to_spans[mention_id] = span if mention.get("entity"): annotated_set_id = int(mention.get("entity")) else: annotated_set_id = max_entity + 1 + mention_id coref[span] = annotated_set_id if mention.get("antecedent"): antecedent_decisions[span] = mention_id_to_spans[ int(mention.get("antecedent")) ] doc 
= documents.Document( name, sentences, coref) spans_to_annotated_mentions = {} for mention in doc.annotated_mentions: spans_to_annotated_mentions[mention.span] = mention for span in antecedent_decisions: ante_span = antecedent_decisions[span] ana = spans_to_annotated_mentions[span] ante = spans_to_annotated_mentions[ante_span] ana.attributes["antecedent"] = ante else: doc = documents.Document( name, sentences, {}) return doc
def test_post_process_embedded_head_largest_span(self):
    """Mentions whose head is embedded in a larger mention's head are
    dropped in favour of the mention with the larger span."""
    # head (3, 3) is embedded in head (2, 3): Span(0, 3) is removed,
    # Span(0, 6) survives
    all_mentions_1 = {
        mentions.Mention(
            None, spans.Span(0, 3), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(3, 3)
            }),
        mentions.Mention(
            None, spans.Span(0, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(2, 3)
            }),
        mentions.Mention(
            None, spans.Span(0, 2), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(1, 1)
            }),
        mentions.Mention(
            None, spans.Span(5, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(5, 6)
            })
    }

    expected_mentions_1 = sorted([
        mentions.Mention(
            None, spans.Span(0, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(2, 3)
            }),
        mentions.Mention(
            None, spans.Span(0, 2), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(1, 1)
            }),
        mentions.Mention(
            None, spans.Span(5, 6), {
                "tokens": [],
                "type": "NOM",
                "head_index": 0,
                "head_span": spans.Span(5, 6)
            })
    ])

    self.assertEqual(
        expected_mentions_1,
        mention_extractor.post_process_embedded_head_largest_span(
            all_mentions_1))
def setUp(self):
    """Build two gold clusters and one partially-matching system output."""
    # gold entity 0
    self.first_cluster = [
        mentions.Mention(
            None, spans.Span(0, 0),
            {"tokens": ["a"], "annotated_set_id": 0}),
        mentions.Mention(
            None, spans.Span(1, 1),
            {"tokens": ["b"], "annotated_set_id": 0}),
        mentions.Mention(
            None, spans.Span(2, 3),
            {"tokens": ["c", "d"], "annotated_set_id": 0}),
        mentions.Mention(
            None, spans.Span(4, 5),
            {"tokens": ["e", "f"], "annotated_set_id": 0}),
        mentions.Mention(
            None, spans.Span(5, 6),
            {"tokens": ["f", "g"], "annotated_set_id": 0}),
        mentions.Mention(
            None, spans.Span(7, 7),
            {"tokens": ["h"], "annotated_set_id": 0}),
    ]

    # gold entity 1
    self.second_cluster = [
        mentions.Mention(
            None, spans.Span(3, 4),
            {"tokens": ["d", "e"], "annotated_set_id": 1}),
        mentions.Mention(
            None, spans.Span(7, 8),
            {"tokens": ["h", "i"], "annotated_set_id": 1}),
        mentions.Mention(
            None, spans.Span(10, 10),
            {"tokens": ["k"], "annotated_set_id": 1})
    ]

    # system output: splits gold entity 0 into sets 0 and 2
    self.system_cluster = [
        mentions.Mention(
            None, spans.Span(0, 0),
            {"tokens": ["a"], "annotated_set_id": 0}),
        mentions.Mention(
            None, spans.Span(2, 3),
            {"tokens": ["c", "d"], "annotated_set_id": 0}),
        mentions.Mention(
            None, spans.Span(4, 5),
            {"tokens": ["e", "f"], "annotated_set_id": 2}),
        mentions.Mention(
            None, spans.Span(5, 6),
            {"tokens": ["f", "g"], "annotated_set_id": 2}),
        mentions.Mention(
            None, spans.Span(7, 7),
            {"tokens": ["h"], "annotated_set_id": 1}),
        mentions.Mention(
            None, spans.Span(10, 10),
            {"tokens": ["k"], "annotated_set_id": 1})
    ]

    self.maxDiff = None
def test_extract_system_mentions(self):
    """Mention extraction yields the expected sorted spans.

    The ``[1:]`` slice skips the first extracted mention, which is
    presumably a dummy mention prepended by extract_system_mentions --
    TODO confirm.
    """
    # unfiltered extraction on the first document
    expected_spans = sorted([
        spans.Span(0, 1),
        spans.Span(0, 5),
        spans.Span(3, 5),
        spans.Span(5, 5),
        spans.Span(8, 10),
        spans.Span(8, 11),
        spans.Span(13, 16),
        spans.Span(13, 20),
        spans.Span(14, 14),
        spans.Span(18, 20),
        spans.Span(22, 23),
        spans.Span(25, 25),
        spans.Span(33, 34)
    ])

    self.assertEqual(expected_spans, [
        mention.span
        for mention in mention_extractor.extract_system_mentions(
            self.real_document, filter_mentions=False)[1:]
    ])

    # unfiltered extraction on the second document
    expected_spans = sorted([
        spans.Span(2, 2),
        spans.Span(4, 4),
        spans.Span(6, 7),
        spans.Span(6, 11),
        spans.Span(9, 10),
        spans.Span(9, 11)
    ])

    self.assertEqual(expected_spans, [
        mention.span
        for mention in mention_extractor.extract_system_mentions(
            self.another_real_document, filter_mentions=False)[1:]
    ])

    # with filtering enabled, Span(6, 7) is removed
    expected_spans = sorted([
        spans.Span(2, 2),
        spans.Span(4, 4),
        spans.Span(6, 11),
        spans.Span(9, 10),
        spans.Span(9, 11)
    ])

    self.assertEqual(expected_spans, [
        mention.span
        for mention in mention_extractor.extract_system_mentions(
            self.another_real_document, filter_mentions=True)[1:]
    ])
def test_post_process_appositions(self):
    """Children of apposition constructions are filtered from the output.

    For a three-child apposition (appositive set off by commas), the
    direct apposition parts are removed but embedded mentions are kept;
    for a two-child apposition, all children are removed.
    """
    three_children_tree = nltk.ParentedTree.fromstring(
        "(NP (NP (NP (NP (DT The) (NNP ROC) (POS 's)) (NN ambassador)) "
        "(PP (IN to) (NP (NNP Nicaragua)))) (, ,) (NP (NNP Antonio) "
        "(NNP Tsai)) (, ,))")

    three_children_all_mentions = {
        # the whole apposition
        mentions.Mention(
            None, spans.Span(0, 6), {
                "tokens": [
                    "The", "ROC", "'s", "ambassador", "to", "Nicaragua",
                    ",", "Antonio", "Tsai"
                ],
                "is_apposition": True,
                "type": "NAM",
                "parse_tree": three_children_tree
            }),
        # first apposition part (to be filtered)
        mentions.Mention(
            None, spans.Span(0, 4), {
                "tokens":
                ["The", "ROC", "'s", "ambassador", "to", "Nicaragua"],
                "is_apposition": False,
                "type": "NOM",
                "parse_tree": three_children_tree[0]
            }),
        mentions.Mention(
            None, spans.Span(0, 3), {
                "tokens": ["The", "ROC", "'s", "ambassador"],
                "is_apposition": False,
                "type": "NOM",
                "parse_tree": three_children_tree[0][0]
            }),
        mentions.Mention(
            None, spans.Span(0, 2), {
                "tokens": ["The", "ROC", "'s"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": three_children_tree[0][0][0]
            }),
        mentions.Mention(
            None, spans.Span(4, 4), {
                "tokens": ["Nicaragua"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": three_children_tree[0][1][1]
            }),
        # second apposition part (to be filtered)
        mentions.Mention(
            None, spans.Span(5, 6), {
                "tokens": ["Antonio", "Tsai"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": three_children_tree[2]
            })
    }

    three_children_expected = sorted([
        mentions.Mention(
            None, spans.Span(0, 6), {
                "tokens": [
                    "The", "ROC", "'s", "ambassador", "to", "Nicaragua",
                    ",", "Antonio", "Tsai"
                ],
                "is_apposition": True,
                "type": "NAM",
                "parse_tree": three_children_tree
            }),
        mentions.Mention(
            None, spans.Span(0, 3), {
                "tokens": ["The", "ROC", "'s", "ambassador"],
                "is_apposition": False,
                "type": "NOM",
                "parse_tree": three_children_tree[0][0]
            }),
        mentions.Mention(
            None, spans.Span(0, 2), {
                "tokens": ["The", "ROC", "'s"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": three_children_tree[0][0][0]
            }),
        mentions.Mention(
            None, spans.Span(4, 4), {
                "tokens": ["Nicaragua"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": three_children_tree[0][1][1]
            }),
    ])

    self.assertEqual(
        three_children_expected,
        mention_extractor.post_process_appositions(
            three_children_all_mentions))

    # two-child apposition without commas
    two_children_tree = nltk.ParentedTree.fromstring(
        "(NP (NP (NP (NNP Secretary)) (PP (IN of) (NP (NNP State)))) "
        "(NP (NNP Madeleine) (NNP Albright)))")

    # NOTE(review): the tokens below contain "Sate" -- presumably a typo
    # for "State" in the fixture; left unchanged since it is test data.
    two_children_all_mentions = {
        mentions.Mention(
            None, spans.Span(0, 4), {
                "tokens":
                ["Secretary", "of", "Sate", "Madeleine", "Albright"],
                "is_apposition": True,
                "type": "NAM",
                "parse_tree": two_children_tree
            }),
        mentions.Mention(
            None, spans.Span(0, 0), {
                "tokens": ["Secretary"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": two_children_tree[0][0]
            }),
        mentions.Mention(
            None, spans.Span(0, 2), {
                "tokens": ["Secretary", "of", "State"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": two_children_tree[0]
            }),
        mentions.Mention(
            None, spans.Span(2, 2), {
                "tokens": ["State"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": two_children_tree[0][1][1]
            }),
        # NOTE(review): same span (2, 2) as the "State" mention above but
        # with tokens ["Madeleine", "Albright"] -- looks like it should be
        # Span(3, 4); if Mention equality is span-based the set may
        # collapse these two entries. TODO confirm.
        mentions.Mention(
            None, spans.Span(2, 2), {
                "tokens": ["Madeleine", "Albright"],
                "is_apposition": False,
                "type": "NAM",
                "parse_tree": two_children_tree[1]
            })
    }

    two_children_expected = sorted([
        mentions.Mention(
            None, spans.Span(0, 4), {
                "tokens":
                ["Secretary", "of", "Sate", "Madeleine", "Albright"],
                "is_apposition": True,
                "type": "NAM",
                "parse_tree": two_children_tree
            })
    ])

    self.assertEqual(
        two_children_expected,
        mention_extractor.post_process_appositions(
            two_children_all_mentions))