def test_get_head_index(self):
    """Check the head index of ``(WHNP (WP who))`` within its mention subtree.

    Builds a ``CoNLLDocument`` from an inline fixture, extracts the relevant
    subtree for ``Span(29, 34)`` and asserts that
    ``mention_property_computer.get_head_index`` returns ``0`` for the head
    tree ``(WHNP (WP who))``.
    """
    # One token record per source line; adjacent literals are concatenated
    # by the compiler, so the resulting value is a single string.
    # NOTE(review): the fixture text contains exactly one newline (inside the
    # record for token 21) and single spaces everywhere else. CoNLL files
    # normally hold one record per line -- confirm against the original data.
    conll_text = (
        "#begin document (bn/voa/02/voa_0220); part 000 "
        "bn/voa/02/voa_0220 0 0 Unidentified JJ (TOP(S(NP(NP* - - - - * - "
        "bn/voa/02/voa_0220 0 1 gunmen NNS *) - - - - * - "
        "bn/voa/02/voa_0220 0 2 in IN (PP* - - - - * - "
        "bn/voa/02/voa_0220 0 3 north JJ (NP(ADJP* - - - - * - "
        "bn/voa/02/voa_0220 0 4 western JJ *) - - - - * - "
        "bn/voa/02/voa_0220 0 5 Colombia NNP *))) - - - - (GPE) - "
        "bn/voa/02/voa_0220 0 6 have VBP (VP* have - - - * - "
        "bn/voa/02/voa_0220 0 7 massacred VBN (VP* massacre - - - * - "
        "bn/voa/02/voa_0220 0 8 at IN (NP(QP(ADVP* - - - - (CARDINAL* - "
        "bn/voa/02/voa_0220 0 9 least JJS *) - - - - * - "
        "bn/voa/02/voa_0220 0 10 twelve CD *) - - - - *) - "
        "bn/voa/02/voa_0220 0 11 peasants NNS *) - - - - * - "
        "bn/voa/02/voa_0220 0 12 in IN (PP* - - - - * - "
        "bn/voa/02/voa_0220 0 13 the DT (NP(NP* - - - - * (0 "
        "bn/voa/02/voa_0220 0 14 second JJ * - - - - (ORDINAL) - "
        "bn/voa/02/voa_0220 0 15 such JJ * - - - - * - "
        "bn/voa/02/voa_0220 0 16 incident NN *) incident - 2 - * - "
        "bn/voa/02/voa_0220 0 17 in IN (PP* - - - - * - "
        "bn/voa/02/voa_0220 0 18 as RB (NP(QP* - - - - (DATE* - "
        "bn/voa/02/voa_0220 0 19 many JJ *) - - - - * - "
        "bn/voa/02/voa_0220 0 20 days NNS *)))))) day - 4 - *) 0) "
        "bn/voa/02/voa_0220 0 21 . . \n*)) - - - - * - "
        "bn/voa/02/voa_0220 0 0 Local JJ (TOP(S(NP* - - - - * (ARG0* * - "
        "bn/voa/02/voa_0220 0 1 police NNS *) police - - - * *) * - "
        "bn/voa/02/voa_0220 0 2 say VBP (VP* say 01 1 - * (V*) * - "
        "bn/voa/02/voa_0220 0 3 it PRP (SBAR(S(NP*) - - - - * (ARG1* (ARG1*) - "
        "bn/voa/02/voa_0220 0 4 's VBZ (VP* be 01 1 - * * (V*) - "
        "bn/voa/02/voa_0220 0 5 not RB * - - - - * * (ARGM-NEG*) - "
        "bn/voa/02/voa_0220 0 6 clear JJ (ADJP*) - - - - * * (ARG2*) - "
        "bn/voa/02/voa_0220 0 7 who WP (SBAR(WHNP*) - - - - * * * - "
        "bn/voa/02/voa_0220 0 8 was VBD (S(VP* be - 1 - * * * - "
        "bn/voa/02/voa_0220 0 9 responsible JJ (ADJP* - - - - * * * - "
        "bn/voa/02/voa_0220 0 10 for IN (PP* - - - - * * * - "
        "bn/voa/02/voa_0220 0 11 the DT (NP* - - - - * * * (0 "
        "bn/voa/02/voa_0220 0 12 massacre NN *)))))))))) massacre - - - * *) * 0) "
        "bn/voa/02/voa_0220 0 13 . . *)) - - - - * * * - "
        "#end document "
    )
    # The fixture is also kept on the test instance, as before.
    self.real_example = conll_text

    document = CoNLLDocument(self.real_example)
    who_head = nltk.ParentedTree.fromstring("(WHNP (WP who))")
    subtree = mention_property_computer.get_relevant_subtree(
        Span(29, 34), document)

    # The head is expected at position 0 of the mention subtree.
    self.assertEqual(
        0, mention_property_computer.get_head_index(who_head, subtree))
def test_get_head_index(self):
    """Check the head index of ``(WHNP (WP who))`` within its mention subtree.

    The head tree is parsed with ``nltk_util.parse_parented_tree``; the
    mention subtree is the relevant subtree for ``Span(29, 34)`` in a
    ``CoNLLDocument`` built from the inline fixture. The head index returned
    by ``mention_property_computer.get_head_index`` must be ``0``.
    """
    # Fixture assembled from adjacent string literals (one token record per
    # source line); the concatenated value is a single string.
    # NOTE(review): the text holds only one newline (inside the record for
    # token 21), everything else is space-separated. CoNLL data is usually
    # one record per line -- confirm against the original fixture.
    fixture = (
        "#begin document (bn/voa/02/voa_0220); part 000 "
        "bn/voa/02/voa_0220 0 0 Unidentified JJ (TOP(S(NP(NP* - - - - * - "
        "bn/voa/02/voa_0220 0 1 gunmen NNS *) - - - - * - "
        "bn/voa/02/voa_0220 0 2 in IN (PP* - - - - * - "
        "bn/voa/02/voa_0220 0 3 north JJ (NP(ADJP* - - - - * - "
        "bn/voa/02/voa_0220 0 4 western JJ *) - - - - * - "
        "bn/voa/02/voa_0220 0 5 Colombia NNP *))) - - - - (GPE) - "
        "bn/voa/02/voa_0220 0 6 have VBP (VP* have - - - * - "
        "bn/voa/02/voa_0220 0 7 massacred VBN (VP* massacre - - - * - "
        "bn/voa/02/voa_0220 0 8 at IN (NP(QP(ADVP* - - - - (CARDINAL* - "
        "bn/voa/02/voa_0220 0 9 least JJS *) - - - - * - "
        "bn/voa/02/voa_0220 0 10 twelve CD *) - - - - *) - "
        "bn/voa/02/voa_0220 0 11 peasants NNS *) - - - - * - "
        "bn/voa/02/voa_0220 0 12 in IN (PP* - - - - * - "
        "bn/voa/02/voa_0220 0 13 the DT (NP(NP* - - - - * (0 "
        "bn/voa/02/voa_0220 0 14 second JJ * - - - - (ORDINAL) - "
        "bn/voa/02/voa_0220 0 15 such JJ * - - - - * - "
        "bn/voa/02/voa_0220 0 16 incident NN *) incident - 2 - * - "
        "bn/voa/02/voa_0220 0 17 in IN (PP* - - - - * - "
        "bn/voa/02/voa_0220 0 18 as RB (NP(QP* - - - - (DATE* - "
        "bn/voa/02/voa_0220 0 19 many JJ *) - - - - * - "
        "bn/voa/02/voa_0220 0 20 days NNS *)))))) day - 4 - *) 0) "
        "bn/voa/02/voa_0220 0 21 . . \n*)) - - - - * - "
        "bn/voa/02/voa_0220 0 0 Local JJ (TOP(S(NP* - - - - * (ARG0* * - "
        "bn/voa/02/voa_0220 0 1 police NNS *) police - - - * *) * - "
        "bn/voa/02/voa_0220 0 2 say VBP (VP* say 01 1 - * (V*) * - "
        "bn/voa/02/voa_0220 0 3 it PRP (SBAR(S(NP*) - - - - * (ARG1* (ARG1*) - "
        "bn/voa/02/voa_0220 0 4 's VBZ (VP* be 01 1 - * * (V*) - "
        "bn/voa/02/voa_0220 0 5 not RB * - - - - * * (ARGM-NEG*) - "
        "bn/voa/02/voa_0220 0 6 clear JJ (ADJP*) - - - - * * (ARG2*) - "
        "bn/voa/02/voa_0220 0 7 who WP (SBAR(WHNP*) - - - - * * * - "
        "bn/voa/02/voa_0220 0 8 was VBD (S(VP* be - 1 - * * * - "
        "bn/voa/02/voa_0220 0 9 responsible JJ (ADJP* - - - - * * * - "
        "bn/voa/02/voa_0220 0 10 for IN (PP* - - - - * * * - "
        "bn/voa/02/voa_0220 0 11 the DT (NP* - - - - * * * (0 "
        "bn/voa/02/voa_0220 0 12 massacre NN *)))))))))) massacre - - - * *) * 0) "
        "bn/voa/02/voa_0220 0 13 . . *)) - - - - * * * - "
        "#end document "
    )
    # Stored on the instance exactly as the test did before.
    self.real_example = fixture

    parsed_document = CoNLLDocument(self.real_example)
    head_tree = nltk_util.parse_parented_tree("(WHNP (WP who))")
    relevant_subtree = mention_property_computer.get_relevant_subtree(
        Span(29, 34), parsed_document)
    actual_index = mention_property_computer.get_head_index(
        head_tree, relevant_subtree)

    # The head must sit at index 0 of the mention subtree.
    self.assertEqual(0, actual_index)
def from_document(span, document, first_in_gold_entity=False):
    """Build a mention for ``span`` from the annotations in ``document``.

    Every attribute of the resulting mention is derived from the document's
    per-token columns (tokens, POS, NER, speakers), its coreference
    annotation, its dependency trees, and the helpers in
    ``mention_property_computer``. See the class documentation for the
    meaning of the individual attributes.

    Args:
        span (Span): The span of the mention in the document.
        document (CoNLLDocument): The document the mention belongs to.
        first_in_gold_entity (bool): Stored unchanged as the
            ``first_in_gold_entity`` attribute. Defaults to ``False``.

    Returns:
        Mention: A mention extracted from the input span in the input
        document.
    """
    props = mention_property_computer
    sentence_id, sentence_span = document.get_sentence_id_and_span(span)

    # Span ends are inclusive, hence the +1 for slicing.
    covered = slice(span.begin, span.end + 1)

    attributes = {
        "tokens": document.tokens[covered],
        "pos": document.pos[covered],
        "ner": document.ner[covered],
        "sentence_id": sentence_id,
        "parse_tree": props.get_relevant_subtree(span, document),
        "speaker": document.speakers[span.begin],
        "antecedent": None,
        "set_id": None,
        "first_in_gold_entity": first_in_gold_entity,
        # Spans without coreference annotation get no gold set id.
        "annotated_set_id": (
            document.coref[span] if span in document.coref else None
        ),
    }

    # The helpers below receive the attribute dict built so far, so the
    # order of these assignments is kept as is.
    attributes["is_apposition"] = props.is_apposition(attributes)
    attributes["grammatical_function"] = props.get_grammatical_function(
        attributes)

    head, in_mention_span, head_index = props.compute_head_information(
        attributes)
    attributes["head"] = head
    # The head span is reported relative to the mention; shift it by the
    # mention start to get document coordinates.
    attributes["head_span"] = spans.Span(
        span.begin + in_mention_span.begin,
        span.begin + in_mention_span.end,
    )
    attributes["head_index"] = head_index

    attributes["type"] = props.get_type(attributes)
    attributes["fine_type"] = props.get_fine_type(attributes)

    # A citation form is only computed for mentions typed "PRO".
    if attributes["type"] == "PRO":
        attributes["citation_form"] = props.get_citation_form(attributes)

    attributes["number"] = props.compute_number(attributes)
    attributes["gender"] = props.compute_gender(attributes)
    attributes["semantic_class"] = props.compute_semantic_class(attributes)

    attributes["head_as_lowercase_string"] = " ".join(
        attributes["head"]).lower()
    attributes["tokens_as_lowercase_string"] = " ".join(
        attributes["tokens"]).lower()

    dep_tree = document.dep[sentence_id]
    # Position of the head token inside its sentence's dependency tree.
    index = span.begin + head_index - sentence_span.begin

    # ``head`` is shifted by one to index into dep_tree; a result of -1
    # means there is no governor token.
    governor_id = dep_tree[index].head - 1
    attributes["governor"] = (
        "NONE" if governor_id == -1 else dep_tree[governor_id].form.lower()
    )

    attributes["ancestry"] = Mention._get_ancestry(dep_tree, index)
    attributes["deprel"] = dep_tree[index].deprel

    return Mention(document, span, attributes)