def test_get_head_index(self): self.real_example = """#begin document (bn/voa/02/voa_0220); part 000 bn/voa/02/voa_0220 0 0 Unidentified JJ (TOP(S(NP(NP* - - - - * - bn/voa/02/voa_0220 0 1 gunmen NNS *) - - - - * - bn/voa/02/voa_0220 0 2 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 3 north JJ (NP(ADJP* - - - - * - bn/voa/02/voa_0220 0 4 western JJ *) - - - - * - bn/voa/02/voa_0220 0 5 Colombia NNP *))) - - - - (GPE) - bn/voa/02/voa_0220 0 6 have VBP (VP* have - - - * - bn/voa/02/voa_0220 0 7 massacred VBN (VP* massacre - - - * - bn/voa/02/voa_0220 0 8 at IN (NP(QP(ADVP* - - - - (CARDINAL* - bn/voa/02/voa_0220 0 9 least JJS *) - - - - * - bn/voa/02/voa_0220 0 10 twelve CD *) - - - - *) - bn/voa/02/voa_0220 0 11 peasants NNS *) - - - - * - bn/voa/02/voa_0220 0 12 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 13 the DT (NP(NP* - - - - * (0 bn/voa/02/voa_0220 0 14 second JJ * - - - - (ORDINAL) - bn/voa/02/voa_0220 0 15 such JJ * - - - - * - bn/voa/02/voa_0220 0 16 incident NN *) incident - 2 - * - bn/voa/02/voa_0220 0 17 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 18 as RB (NP(QP* - - - - (DATE* - bn/voa/02/voa_0220 0 19 many JJ *) - - - - * - bn/voa/02/voa_0220 0 20 days NNS *)))))) day - 4 - *) 0) bn/voa/02/voa_0220 0 21 . . *)) - - - - * - bn/voa/02/voa_0220 0 0 Local JJ (TOP(S(NP* - - - - * (ARG0* * - bn/voa/02/voa_0220 0 1 police NNS *) police - - - * *) * - bn/voa/02/voa_0220 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/voa/02/voa_0220 0 3 it PRP (SBAR(S(NP*) - - - - * (ARG1* (ARG1*) - bn/voa/02/voa_0220 0 4 's VBZ (VP* be 01 1 - * * (V*) - bn/voa/02/voa_0220 0 5 not RB * - - - - * * (ARGM-NEG*) - bn/voa/02/voa_0220 0 6 clear JJ (ADJP*) - - - - * * (ARG2*) - bn/voa/02/voa_0220 0 7 who WP (SBAR(WHNP*) - - - - * * * - bn/voa/02/voa_0220 0 8 was VBD (S(VP* be - 1 - * * * - bn/voa/02/voa_0220 0 9 responsible JJ (ADJP* - - - - * * * - bn/voa/02/voa_0220 0 10 for IN (PP* - - - - * * * - bn/voa/02/voa_0220 0 11 the DT (NP* - - - - * * * (0 bn/voa/02/voa_0220 0 12 massacre NN *)))))))))) massacre - - - * *) * 0) bn/voa/02/voa_0220 0 13 . . *)) - - - - * * * - #end document """ real_document = CoNLLDocument(self.real_example) expected = 0 head = nltk.ParentedTree.fromstring("(WHNP (WP who))") mention_subtree = mention_property_computer.get_relevant_subtree( Span(29, 34), real_document) self.assertEqual( expected, mention_property_computer.get_head_index(head, mention_subtree))
def setUp(self): self.real_example = """#begin document (bn/voa/02/voa_0220); part 000 bn/voa/02/voa_0220 0 0 Unidentified JJ (TOP(S(NP(NP* - - - - * - bn/voa/02/voa_0220 0 1 gunmen NNS *) - - - - * - bn/voa/02/voa_0220 0 2 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 3 north JJ (NP(ADJP* - - - - * - bn/voa/02/voa_0220 0 4 western JJ *) - - - - * - bn/voa/02/voa_0220 0 5 Colombia NNP *))) - - - - (GPE) - bn/voa/02/voa_0220 0 6 have VBP (VP* have - - - * - bn/voa/02/voa_0220 0 7 massacred VBN (VP* massacre - - - * - bn/voa/02/voa_0220 0 8 at IN (NP(QP(ADVP* - - - - (CARDINAL* - bn/voa/02/voa_0220 0 9 least JJS *) - - - - * - bn/voa/02/voa_0220 0 10 twelve CD *) - - - - *) - bn/voa/02/voa_0220 0 11 peasants NNS *) - - - - * - bn/voa/02/voa_0220 0 12 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 13 the DT (NP(NP* - - - - * (0 bn/voa/02/voa_0220 0 14 second JJ * - - - - (ORDINAL) - bn/voa/02/voa_0220 0 15 such JJ * - - - - * - bn/voa/02/voa_0220 0 16 incident NN *) incident - 2 - * - bn/voa/02/voa_0220 0 17 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 18 as RB (NP(QP* - - - - (DATE* - bn/voa/02/voa_0220 0 19 many JJ *) - - - - * - bn/voa/02/voa_0220 0 20 days NNS *)))))) day - 4 - *) 0) bn/voa/02/voa_0220 0 21 . . *)) - - - - * - bn/voa/02/voa_0220 0 0 Local JJ (TOP(S(NP* - - - - * (ARG0* * - bn/voa/02/voa_0220 0 1 police NNS *) police - - - * *) * - bn/voa/02/voa_0220 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/voa/02/voa_0220 0 3 it PRP (SBAR(S(NP*) - - - - * (ARG1* (ARG1*) - bn/voa/02/voa_0220 0 4 's VBZ (VP* be 01 1 - * * (V*) - bn/voa/02/voa_0220 0 5 not RB * - - - - * * (ARGM-NEG*) - bn/voa/02/voa_0220 0 6 clear JJ (ADJP*) - - - - * * (ARG2*) - bn/voa/02/voa_0220 0 7 who WP (SBAR(WHNP*) - - - - * * * - bn/voa/02/voa_0220 0 8 was VBD (S(VP* be - 1 - * * * - bn/voa/02/voa_0220 0 9 responsible JJ (ADJP* - - - - * * * - bn/voa/02/voa_0220 0 10 for IN (PP* - - - - * * * - bn/voa/02/voa_0220 0 11 the DT (NP* - - - - * * * (0 bn/voa/02/voa_0220 0 12 massacre NN *)))))))))) massacre - - - * *) * 0) bn/voa/02/voa_0220 0 13 . . *)) - - - - * * * - #end document """ self.complicated_mention_example = """#begin document (/test2); part 000 test2 0 0 This NN (NP* - - - - - (0) test2 0 1 is NN * - - - - - - test2 0 2 just NN * - - - - - - test2 0 3 a NN * - - - - - (0|(1) test2 0 4 test NN * - - - - - 0) test2 0 5 . NN *) - - - - - - test2 0 0 It NN (NP* - - - - - (1)|(4 test2 0 1 shows NN * - - - - - - test2 0 2 that NN * - - - - - (2) test2 0 3 the NN * - - - - - (2|(0 test2 0 4 scorer NN * - - - - - 2)|4) test2 0 5 works NN * - - - - - 0) test2 0 6 . NN *) - - - - - - #end document """ self.another_real_example = """#begin document (mz/sinorama/10/ectb_1050); part 006 mz/sinorama/10/ectb_1050 6 0 What WP (TOP(SBARQ(WHNP*) - - - - * (R-ARG1*) - mz/sinorama/10/ectb_1050 6 1 does VBZ (SQ* do - 7 - * * - mz/sinorama/10/ectb_1050 6 2 this DT (NP*) - - - - * (ARG0*) - mz/sinorama/10/ectb_1050 6 3 tell VB (VP* tell 01 1 - * (V*) - mz/sinorama/10/ectb_1050 6 4 us PRP (NP*) - - - - * (ARG2*) - mz/sinorama/10/ectb_1050 6 5 about IN (PP* - - - - * (ARG1* - mz/sinorama/10/ectb_1050 6 6 the DT (NP(NP* - - - - * * - mz/sinorama/10/ectb_1050 6 7 transformation NN *) transformation - 1 - * * - mz/sinorama/10/ectb_1050 6 8 of IN (PP* - - - - * * - mz/sinorama/10/ectb_1050 6 9 Taiwan NNP (NP(NP* - - - - (GPE) * - mz/sinorama/10/ectb_1050 6 10 's POS *) - - - - * * - mz/sinorama/10/ectb_1050 6 11 townships NNS *)))))) township - 1 - * *) - mz/sinorama/10/ectb_1050 6 12 ? . *)) - - - - * * - #end document """ self.yemen_example = """#begin document (bn/abc/00/abc_0030); part 000 bn/abc/00/abc_0030 0 0 Intelligence NN (TOP(S(NP* - - - - * (ARG0* * - bn/abc/00/abc_0030 0 1 sources NNS *) source - 3 - * *) * - bn/abc/00/abc_0030 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/abc/00/abc_0030 0 3 the DT (SBAR(S(NP* - - - - * (ARG1* (ARG1* - bn/abc/00/abc_0030 0 4 target NN *) target - 2 - * * *) - bn/abc/00/abc_0030 0 5 was VBD (VP* be - 1 - * * * - bn/abc/00/abc_0030 0 6 to TO (S(VP* - - - - * * * - bn/abc/00/abc_0030 0 7 be VB (VP* be 01 1 - * * (V*) - bn/abc/00/abc_0030 0 8 a DT (NP(NP* - - - - * * * - bn/abc/00/abc_0030 0 9 destroyer NN *) - - - - * * * - bn/abc/00/abc_0030 0 10 , , * - - - - * * * - bn/abc/00/abc_0030 0 11 the DT (NP* - - - - * * * - bn/abc/00/abc_0030 0 12 `` `` * - - - - * * * - bn/abc/00/abc_0030 0 13 USS NNP (NP* - - - - (PRODUCT* * * - bn/abc/00/abc_0030 0 14 The NNP * - - - - * * * - bn/abc/00/abc_0030 0 15 Sullivans NNP *) - - - - *) * * - bn/abc/00/abc_0030 0 16 , , * - - - - * * * - bn/abc/00/abc_0030 0 17 '' '' * - - - - * * * - bn/abc/00/abc_0030 0 18 which WDT (SBAR(WHNP*) - - - - * * * - bn/abc/00/abc_0030 0 19 refueled VBD (S(VP* refuel - - - * * * - bn/abc/00/abc_0030 0 20 in IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 21 Yemen NNP (NP(NP(NP* - - - - (GPE) * * - bn/abc/00/abc_0030 0 22 's POS *) - - - - * * * - bn/abc/00/abc_0030 0 23 port NN *) port - 1 - (LOC* * * - bn/abc/00/abc_0030 0 24 of IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 25 Aden NNP (NP*)))) - - - - *) * * - bn/abc/00/abc_0030 0 26 in IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 27 January NNP (NP*)))))))))))))) - - - - (DATE) *) * - bn/abc/00/abc_0030 0 28 . . *)) - - - - * * * - #end document """ self.real_document = CoNLLDocument(self.real_example) self.complicated_mention_document = CoNLLDocument( self.complicated_mention_example) self.another_real_document = CoNLLDocument(self.another_real_example) self.yemen_document = CoNLLDocument(self.yemen_example) self.maxDiff = None
class TestDocuments(unittest.TestCase): def setUp(self): self.real_example = """#begin document (bn/voa/02/voa_0220); part 000 bn/voa/02/voa_0220 0 0 Unidentified JJ (TOP(S(NP(NP* - - - - * - bn/voa/02/voa_0220 0 1 gunmen NNS *) - - - - * - bn/voa/02/voa_0220 0 2 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 3 north JJ (NP(ADJP* - - - - * - bn/voa/02/voa_0220 0 4 western JJ *) - - - - * - bn/voa/02/voa_0220 0 5 Colombia NNP *))) - - - - (GPE) - bn/voa/02/voa_0220 0 6 have VBP (VP* have - - - * - bn/voa/02/voa_0220 0 7 massacred VBN (VP* massacre - - - * - bn/voa/02/voa_0220 0 8 at IN (NP(QP(ADVP* - - - - (CARDINAL* - bn/voa/02/voa_0220 0 9 least JJS *) - - - - * - bn/voa/02/voa_0220 0 10 twelve CD *) - - - - *) - bn/voa/02/voa_0220 0 11 peasants NNS *) - - - - * - bn/voa/02/voa_0220 0 12 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 13 the DT (NP(NP* - - - - * (0 bn/voa/02/voa_0220 0 14 second JJ * - - - - (ORDINAL) - bn/voa/02/voa_0220 0 15 such JJ * - - - - * - bn/voa/02/voa_0220 0 16 incident NN *) incident - 2 - * - bn/voa/02/voa_0220 0 17 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 18 as RB (NP(QP* - - - - (DATE* - bn/voa/02/voa_0220 0 19 many JJ *) - - - - * - bn/voa/02/voa_0220 0 20 days NNS *)))))) day - 4 - *) 0) bn/voa/02/voa_0220 0 21 . . *)) - - - - * - bn/voa/02/voa_0220 0 0 Local JJ (TOP(S(NP* - - - - * (ARG0* * - bn/voa/02/voa_0220 0 1 police NNS *) police - - - * *) * - bn/voa/02/voa_0220 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/voa/02/voa_0220 0 3 it PRP (SBAR(S(NP*) - - - - * (ARG1* (ARG1*) - bn/voa/02/voa_0220 0 4 's VBZ (VP* be 01 1 - * * (V*) - bn/voa/02/voa_0220 0 5 not RB * - - - - * * (ARGM-NEG*) - bn/voa/02/voa_0220 0 6 clear JJ (ADJP*) - - - - * * (ARG2*) - bn/voa/02/voa_0220 0 7 who WP (SBAR(WHNP*) - - - - * * * - bn/voa/02/voa_0220 0 8 was VBD (S(VP* be - 1 - * * * - bn/voa/02/voa_0220 0 9 responsible JJ (ADJP* - - - - * * * - bn/voa/02/voa_0220 0 10 for IN (PP* - - - - * * * - bn/voa/02/voa_0220 0 11 the DT (NP* - - - - * * * (0 bn/voa/02/voa_0220 0 12 massacre NN *)))))))))) massacre - - - * *) * 0) bn/voa/02/voa_0220 0 13 . . *)) - - - - * * * - #end document """ self.complicated_mention_example = """#begin document (/test2); part 000 test2 0 0 This NN (NP* - - - - - (0) test2 0 1 is NN * - - - - - - test2 0 2 just NN * - - - - - - test2 0 3 a NN * - - - - - (0|(1) test2 0 4 test NN * - - - - - 0) test2 0 5 . NN *) - - - - - - test2 0 0 It NN (NP* - - - - - (1)|(4 test2 0 1 shows NN * - - - - - - test2 0 2 that NN * - - - - - (2) test2 0 3 the NN * - - - - - (2|(0 test2 0 4 scorer NN * - - - - - 2)|4) test2 0 5 works NN * - - - - - 0) test2 0 6 . NN *) - - - - - - #end document """ self.another_real_example = """#begin document (mz/sinorama/10/ectb_1050); part 006 mz/sinorama/10/ectb_1050 6 0 What WP (TOP(SBARQ(WHNP*) - - - - * (R-ARG1*) - mz/sinorama/10/ectb_1050 6 1 does VBZ (SQ* do - 7 - * * - mz/sinorama/10/ectb_1050 6 2 this DT (NP*) - - - - * (ARG0*) - mz/sinorama/10/ectb_1050 6 3 tell VB (VP* tell 01 1 - * (V*) - mz/sinorama/10/ectb_1050 6 4 us PRP (NP*) - - - - * (ARG2*) - mz/sinorama/10/ectb_1050 6 5 about IN (PP* - - - - * (ARG1* - mz/sinorama/10/ectb_1050 6 6 the DT (NP(NP* - - - - * * - mz/sinorama/10/ectb_1050 6 7 transformation NN *) transformation - 1 - * * - mz/sinorama/10/ectb_1050 6 8 of IN (PP* - - - - * * - mz/sinorama/10/ectb_1050 6 9 Taiwan NNP (NP(NP* - - - - (GPE) * - mz/sinorama/10/ectb_1050 6 10 's POS *) - - - - * * - mz/sinorama/10/ectb_1050 6 11 townships NNS *)))))) township - 1 - * *) - mz/sinorama/10/ectb_1050 6 12 ? . *)) - - - - * * - #end document """ self.yemen_example = """#begin document (bn/abc/00/abc_0030); part 000 bn/abc/00/abc_0030 0 0 Intelligence NN (TOP(S(NP* - - - - * (ARG0* * - bn/abc/00/abc_0030 0 1 sources NNS *) source - 3 - * *) * - bn/abc/00/abc_0030 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/abc/00/abc_0030 0 3 the DT (SBAR(S(NP* - - - - * (ARG1* (ARG1* - bn/abc/00/abc_0030 0 4 target NN *) target - 2 - * * *) - bn/abc/00/abc_0030 0 5 was VBD (VP* be - 1 - * * * - bn/abc/00/abc_0030 0 6 to TO (S(VP* - - - - * * * - bn/abc/00/abc_0030 0 7 be VB (VP* be 01 1 - * * (V*) - bn/abc/00/abc_0030 0 8 a DT (NP(NP* - - - - * * * - bn/abc/00/abc_0030 0 9 destroyer NN *) - - - - * * * - bn/abc/00/abc_0030 0 10 , , * - - - - * * * - bn/abc/00/abc_0030 0 11 the DT (NP* - - - - * * * - bn/abc/00/abc_0030 0 12 `` `` * - - - - * * * - bn/abc/00/abc_0030 0 13 USS NNP (NP* - - - - (PRODUCT* * * - bn/abc/00/abc_0030 0 14 The NNP * - - - - * * * - bn/abc/00/abc_0030 0 15 Sullivans NNP *) - - - - *) * * - bn/abc/00/abc_0030 0 16 , , * - - - - * * * - bn/abc/00/abc_0030 0 17 '' '' * - - - - * * * - bn/abc/00/abc_0030 0 18 which WDT (SBAR(WHNP*) - - - - * * * - bn/abc/00/abc_0030 0 19 refueled VBD (S(VP* refuel - - - * * * - bn/abc/00/abc_0030 0 20 in IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 21 Yemen NNP (NP(NP(NP* - - - - (GPE) * * - bn/abc/00/abc_0030 0 22 's POS *) - - - - * * * - bn/abc/00/abc_0030 0 23 port NN *) port - 1 - (LOC* * * - bn/abc/00/abc_0030 0 24 of IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 25 Aden NNP (NP*)))) - - - - *) * * - bn/abc/00/abc_0030 0 26 in IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 27 January NNP (NP*)))))))))))))) - - - - (DATE) *) * - bn/abc/00/abc_0030 0 28 . . *)) - - - - * * * - #end document """ self.real_document = CoNLLDocument(self.real_example) self.complicated_mention_document = CoNLLDocument( self.complicated_mention_example) self.another_real_document = CoNLLDocument(self.another_real_example) self.yemen_document = CoNLLDocument(self.yemen_example) self.maxDiff = None def test_get_identifier(self): self.assertEqual("(bn/voa/02/voa_0220); part 000", self.real_document.identifier) def test_get_tokens(self): tokens = ["Unidentified", "gunmen", "in", "north", "western", "Colombia", "have", "massacred", "at", "least", "twelve", "peasants", "in", "the", "second", "such", "incident", "in", "as", "many", "days", ".", "Local", "police", "say", "it", "'s", "not", "clear", "who", "was", "responsible", "for", "the", "massacre", "."] self.assertEqual(tokens, self.real_document.tokens) def test_get_ner(self): ner = ["NONE"] * 36 ner[5:6] = ["GPE"] ner[8:11] = ["CARDINAL"] * 3 ner[14:15] = ["ORDINAL"] ner[18:21] = ["DATE"] * 3 self.assertEqual(ner, self.real_document.ner) def test_get_coref(self): simple = { Span(13, 20): 0, Span(33, 34): 0 } complicated = { Span(0, 0): 0, Span(3, 3): 1, Span(3, 4): 0, Span(6, 6): 1, Span(6, 10): 4, Span(8, 8): 2, Span(9, 10): 2, Span(9, 11): 0 } self.assertEqual(simple, self.real_document.coref) self.assertEqual(complicated, self.complicated_mention_document.coref) def test_extract_sentence_spans(self): sentence_spans = [Span(0, 21), Span(22, 35)] self.assertEqual(sentence_spans, self.real_document.sentence_spans) def test_get_sentence_id_and_span(self): expected = 1, Span(22, 35) self.assertEqual(expected, self.real_document.get_sentence_id_and_span( Span(23, 24))) def test_parse(self): expected = nltk.ParentedTree.fromstring( "(TOP (S (NP (JJ Local) (NNS police)) (VP (VBP say) " "(SBAR (S (NP (PRP it)) (VP (VBZ 's) (RB not) " "(ADJP (JJ clear)) (SBAR (WHNP (WP who)) (S (VP (VBD was) " "(ADJP (JJ responsible) (PP (IN for) (NP (DT the) (NN " "massacre))))))))))) (. .)))") self.assertEqual(expected, self.real_document.parse[1]) def test_get_string_representation(self): expected = """#begin document (/test2); part 000 test2 0 0 This NN (NP* - - - - - (0|(1) test2 0 1 is NN * - - - - - 0) test2 0 2 just NN * - - - - - - test2 0 3 a NN * - - - - - - test2 0 4 test NN * - - - - - (1 test2 0 5 . NN *) - - - - - (2|1) test2 0 0 It NN (NP* - - - - - 2) test2 0 1 shows NN * - - - - - (3) test2 0 2 that NN * - - - - - (3) test2 0 3 the NN * - - - - - - test2 0 4 scorer NN * - - - - - - test2 0 5 works NN * - - - - - - test2 0 6 . NN *) - - - - - - #end document """ self.complicated_mention_document.system_mentions = [ Mention(self.complicated_mention_document, Span(0, 0), {"set_id": 1}), Mention(self.complicated_mention_document, Span(0, 1), {"set_id": 0}), Mention(self.complicated_mention_document, Span(4, 5), {"set_id": 1}), Mention(self.complicated_mention_document, Span(5, 6), {"set_id": 2}), Mention(self.complicated_mention_document, Span(7, 7), {"set_id": 3}), Mention(self.complicated_mention_document, Span(8, 8), {"set_id": 3}), ] self.assertEqual( expected, self.complicated_mention_document.get_string_representation())
class TestDocuments(unittest.TestCase): def setUp(self): self.real_example = """#begin document (bn/voa/02/voa_0220); part 000 bn/voa/02/voa_0220 0 0 Unidentified JJ (TOP(S(NP(NP* - - - - * - bn/voa/02/voa_0220 0 1 gunmen NNS *) - - - - * - bn/voa/02/voa_0220 0 2 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 3 north JJ (NP(ADJP* - - - - * - bn/voa/02/voa_0220 0 4 western JJ *) - - - - * - bn/voa/02/voa_0220 0 5 Colombia NNP *))) - - - - (GPE) - bn/voa/02/voa_0220 0 6 have VBP (VP* have - - - * - bn/voa/02/voa_0220 0 7 massacred VBN (VP* massacre - - - * - bn/voa/02/voa_0220 0 8 at IN (NP(QP(ADVP* - - - - (CARDINAL* - bn/voa/02/voa_0220 0 9 least JJS *) - - - - * - bn/voa/02/voa_0220 0 10 twelve CD *) - - - - *) - bn/voa/02/voa_0220 0 11 peasants NNS *) - - - - * - bn/voa/02/voa_0220 0 12 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 13 the DT (NP(NP* - - - - * (0 bn/voa/02/voa_0220 0 14 second JJ * - - - - (ORDINAL) - bn/voa/02/voa_0220 0 15 such JJ * - - - - * - bn/voa/02/voa_0220 0 16 incident NN *) incident - 2 - * - bn/voa/02/voa_0220 0 17 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 18 as RB (NP(QP* - - - - (DATE* - bn/voa/02/voa_0220 0 19 many JJ *) - - - - * - bn/voa/02/voa_0220 0 20 days NNS *)))))) day - 4 - *) 0) bn/voa/02/voa_0220 0 21 . . *)) - - - - * - bn/voa/02/voa_0220 0 0 Local JJ (TOP(S(NP* - - - - * (ARG0* * - bn/voa/02/voa_0220 0 1 police NNS *) police - - - * *) * - bn/voa/02/voa_0220 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/voa/02/voa_0220 0 3 it PRP (SBAR(S(NP*) - - - - * (ARG1* (ARG1*) - bn/voa/02/voa_0220 0 4 's VBZ (VP* be 01 1 - * * (V*) - bn/voa/02/voa_0220 0 5 not RB * - - - - * * (ARGM-NEG*) - bn/voa/02/voa_0220 0 6 clear JJ (ADJP*) - - - - * * (ARG2*) - bn/voa/02/voa_0220 0 7 who WP (SBAR(WHNP*) - - - - * * * - bn/voa/02/voa_0220 0 8 was VBD (S(VP* be - 1 - * * * - bn/voa/02/voa_0220 0 9 responsible JJ (ADJP* - - - - * * * - bn/voa/02/voa_0220 0 10 for IN (PP* - - - - * * * - bn/voa/02/voa_0220 0 11 the DT (NP* - - - - * * * (0 bn/voa/02/voa_0220 0 12 massacre NN *)))))))))) massacre - - - * *) * 0) bn/voa/02/voa_0220 0 13 . . *)) - - - - * * * - #end document """ self.complicated_mention_example = """#begin document (/test2); part 000 test2 0 0 This NN (NP* - - - - - (0) test2 0 1 is NN * - - - - - - test2 0 2 just NN * - - - - - - test2 0 3 a NN * - - - - - (0|(1) test2 0 4 test NN * - - - - - 0) test2 0 5 . NN *) - - - - - - test2 0 0 It NN (NP* - - - - - (1)|(4 test2 0 1 shows NN * - - - - - - test2 0 2 that NN * - - - - - (2) test2 0 3 the NN * - - - - - (2|(0 test2 0 4 scorer NN * - - - - - 2)|4) test2 0 5 works NN * - - - - - 0) test2 0 6 . NN *) - - - - - - #end document """ self.another_real_example = """#begin document (mz/sinorama/10/ectb_1050); part 006 mz/sinorama/10/ectb_1050 6 0 What WP (TOP(SBARQ(WHNP*) - - - - * (R-ARG1*) - mz/sinorama/10/ectb_1050 6 1 does VBZ (SQ* do - 7 - * * - mz/sinorama/10/ectb_1050 6 2 this DT (NP*) - - - - * (ARG0*) - mz/sinorama/10/ectb_1050 6 3 tell VB (VP* tell 01 1 - * (V*) - mz/sinorama/10/ectb_1050 6 4 us PRP (NP*) - - - - * (ARG2*) - mz/sinorama/10/ectb_1050 6 5 about IN (PP* - - - - * (ARG1* - mz/sinorama/10/ectb_1050 6 6 the DT (NP(NP* - - - - * * - mz/sinorama/10/ectb_1050 6 7 transformation NN *) transformation - 1 - * * - mz/sinorama/10/ectb_1050 6 8 of IN (PP* - - - - * * - mz/sinorama/10/ectb_1050 6 9 Taiwan NNP (NP(NP* - - - - (GPE) * - mz/sinorama/10/ectb_1050 6 10 's POS *) - - - - * * - mz/sinorama/10/ectb_1050 6 11 townships NNS *)))))) township - 1 - * *) - mz/sinorama/10/ectb_1050 6 12 ? . *)) - - - - * * - #end document """ self.yemen_example = """#begin document (bn/abc/00/abc_0030); part 000 bn/abc/00/abc_0030 0 0 Intelligence NN (TOP(S(NP* - - - - * (ARG0* * - bn/abc/00/abc_0030 0 1 sources NNS *) source - 3 - * *) * - bn/abc/00/abc_0030 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/abc/00/abc_0030 0 3 the DT (SBAR(S(NP* - - - - * (ARG1* (ARG1* - bn/abc/00/abc_0030 0 4 target NN *) target - 2 - * * *) - bn/abc/00/abc_0030 0 5 was VBD (VP* be - 1 - * * * - bn/abc/00/abc_0030 0 6 to TO (S(VP* - - - - * * * - bn/abc/00/abc_0030 0 7 be VB (VP* be 01 1 - * * (V*) - bn/abc/00/abc_0030 0 8 a DT (NP(NP* - - - - * * * - bn/abc/00/abc_0030 0 9 destroyer NN *) - - - - * * * - bn/abc/00/abc_0030 0 10 , , * - - - - * * * - bn/abc/00/abc_0030 0 11 the DT (NP* - - - - * * * - bn/abc/00/abc_0030 0 12 `` `` * - - - - * * * - bn/abc/00/abc_0030 0 13 USS NNP (NP* - - - - (PRODUCT* * * - bn/abc/00/abc_0030 0 14 The NNP * - - - - * * * - bn/abc/00/abc_0030 0 15 Sullivans NNP *) - - - - *) * * - bn/abc/00/abc_0030 0 16 , , * - - - - * * * - bn/abc/00/abc_0030 0 17 '' '' * - - - - * * * - bn/abc/00/abc_0030 0 18 which WDT (SBAR(WHNP*) - - - - * * * - bn/abc/00/abc_0030 0 19 refueled VBD (S(VP* refuel - - - * * * - bn/abc/00/abc_0030 0 20 in IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 21 Yemen NNP (NP(NP(NP* - - - - (GPE) * * - bn/abc/00/abc_0030 0 22 's POS *) - - - - * * * - bn/abc/00/abc_0030 0 23 port NN *) port - 1 - (LOC* * * - bn/abc/00/abc_0030 0 24 of IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 25 Aden NNP (NP*)))) - - - - *) * * - bn/abc/00/abc_0030 0 26 in IN (PP* - - - - * * * - bn/abc/00/abc_0030 0 27 January NNP (NP*)))))))))))))) - - - - (DATE) *) * - bn/abc/00/abc_0030 0 28 . . *)) - - - - * * * - #end document """ self.real_document = CoNLLDocument(self.real_example) self.complicated_mention_document = CoNLLDocument( self.complicated_mention_example) self.another_real_document = CoNLLDocument(self.another_real_example) self.yemen_document = CoNLLDocument(self.yemen_example) self.maxDiff = None def test_get_identifier(self): self.assertEqual("(bn/voa/02/voa_0220); part 000", self.real_document.identifier) def test_get_tokens(self): tokens = [ "Unidentified", "gunmen", "in", "north", "western", "Colombia", "have", "massacred", "at", "least", "twelve", "peasants", "in", "the", "second", "such", "incident", "in", "as", "many", "days", ".", "Local", "police", "say", "it", "'s", "not", "clear", "who", "was", "responsible", "for", "the", "massacre", "." ] self.assertEqual(tokens, self.real_document.tokens) def test_get_ner(self): ner = ["NONE"] * 36 ner[5:6] = ["GPE"] ner[8:11] = ["CARDINAL"] * 3 ner[14:15] = ["ORDINAL"] ner[18:21] = ["DATE"] * 3 self.assertEqual(ner, self.real_document.ner) def test_get_coref(self): simple = {Span(13, 20): 0, Span(33, 34): 0} complicated = { Span(0, 0): 0, Span(3, 3): 1, Span(3, 4): 0, Span(6, 6): 1, Span(6, 10): 4, Span(8, 8): 2, Span(9, 10): 2, Span(9, 11): 0 } self.assertEqual(simple, self.real_document.coref) self.assertEqual(complicated, self.complicated_mention_document.coref) def test_extract_sentence_spans(self): sentence_spans = [Span(0, 21), Span(22, 35)] self.assertEqual(sentence_spans, self.real_document.sentence_spans) def test_get_sentence_id_and_span(self): expected = 1, Span(22, 35) self.assertEqual( expected, self.real_document.get_sentence_id_and_span(Span(23, 24))) def test_parse(self): expected = nltk.ParentedTree.fromstring( "(TOP (S (NP (JJ Local) (NNS police)) (VP (VBP say) " "(SBAR (S (NP (PRP it)) (VP (VBZ 's) (RB not) " "(ADJP (JJ clear)) (SBAR (WHNP (WP who)) (S (VP (VBD was) " "(ADJP (JJ responsible) (PP (IN for) (NP (DT the) (NN " "massacre))))))))))) (. .)))") self.assertEqual(expected, self.real_document.parse[1]) def test_get_string_representation(self): expected = """#begin document (/test2); part 000 test2 0 0 This NN (NP* - - - - - (0|(1) test2 0 1 is NN * - - - - - 0) test2 0 2 just NN * - - - - - - test2 0 3 a NN * - - - - - - test2 0 4 test NN * - - - - - (1 test2 0 5 . NN *) - - - - - (2|1) test2 0 0 It NN (NP* - - - - - 2) test2 0 1 shows NN * - - - - - (3) test2 0 2 that NN * - - - - - (3) test2 0 3 the NN * - - - - - - test2 0 4 scorer NN * - - - - - - test2 0 5 works NN * - - - - - - test2 0 6 . NN *) - - - - - - #end document """ self.complicated_mention_document.system_mentions = [ Mention(self.complicated_mention_document, Span(0, 0), {"set_id": 1}), Mention(self.complicated_mention_document, Span(0, 1), {"set_id": 0}), Mention(self.complicated_mention_document, Span(4, 5), {"set_id": 1}), Mention(self.complicated_mention_document, Span(5, 6), {"set_id": 2}), Mention(self.complicated_mention_document, Span(7, 7), {"set_id": 3}), Mention(self.complicated_mention_document, Span(8, 8), {"set_id": 3}), ] self.assertEqual( expected, self.complicated_mention_document.get_string_representation())
def setUp(self): self.real_example = """#begin document (bn/voa/02/voa_0220); part 000 bn/voa/02/voa_0220 0 0 Unidentified JJ (TOP(S(NP(NP* - - - - * - bn/voa/02/voa_0220 0 1 gunmen NNS *) - - - - * - bn/voa/02/voa_0220 0 2 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 3 north JJ (NP(ADJP* - - - - * - bn/voa/02/voa_0220 0 4 western JJ *) - - - - * - bn/voa/02/voa_0220 0 5 Colombia NNP *))) - - - - (GPE) - bn/voa/02/voa_0220 0 6 have VBP (VP* have - - - * - bn/voa/02/voa_0220 0 7 massacred VBN (VP* massacre - - - * - bn/voa/02/voa_0220 0 8 at IN (NP(QP(ADVP* - - - - (CARDINAL* - bn/voa/02/voa_0220 0 9 least JJS *) - - - - * - bn/voa/02/voa_0220 0 10 twelve CD *) - - - - *) - bn/voa/02/voa_0220 0 11 peasants NNS *) - - - - * - bn/voa/02/voa_0220 0 12 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 13 the DT (NP(NP* - - - - * (0 bn/voa/02/voa_0220 0 14 second JJ * - - - - (ORDINAL) - bn/voa/02/voa_0220 0 15 such JJ * - - - - * - bn/voa/02/voa_0220 0 16 incident NN *) incident - 2 - * - bn/voa/02/voa_0220 0 17 in IN (PP* - - - - * - bn/voa/02/voa_0220 0 18 as RB (NP(QP* - - - - (DATE* - bn/voa/02/voa_0220 0 19 many JJ *) - - - - * - bn/voa/02/voa_0220 0 20 days NNS *)))))) day - 4 - *) 0) bn/voa/02/voa_0220 0 21 . . *)) - - - - * - bn/voa/02/voa_0220 0 0 Local JJ (TOP(S(NP* - - - - * (ARG0* * - bn/voa/02/voa_0220 0 1 police NNS *) police - - - * *) * - bn/voa/02/voa_0220 0 2 say VBP (VP* say 01 1 - * (V*) * - bn/voa/02/voa_0220 0 3 it PRP (SBAR(S(NP*) - - - - * (ARG1* (ARG1*) - bn/voa/02/voa_0220 0 4 's VBZ (VP* be 01 1 - * * (V*) - bn/voa/02/voa_0220 0 5 not RB * - - - - * * (ARGM-NEG*) - bn/voa/02/voa_0220 0 6 clear JJ (ADJP*) - - - - * * (ARG2*) - bn/voa/02/voa_0220 0 7 who WP (SBAR(WHNP*) - - - - * * * - bn/voa/02/voa_0220 0 8 was VBD (S(VP* be - 1 - * * * - bn/voa/02/voa_0220 0 9 responsible JJ (ADJP* - - - - * * * - bn/voa/02/voa_0220 0 10 for IN (PP* - - - - * * * - bn/voa/02/voa_0220 0 11 the DT (NP* - - - - * * * (0 bn/voa/02/voa_0220 0 12 massacre NN *)))))))))) massacre - - - * *) * 0) bn/voa/02/voa_0220 0 13 . . *)) - - - - * * * - #end document """ self.real_document = CoNLLDocument(self.real_example) self.complicated_mention_example = """#begin document (test2); part 000 test2 0 0 This NN (NP* - - - - - (0) test2 0 1 is NN * - - - - - - test2 0 2 just NN * - - - - - - test2 0 3 a NN * - - - - - (0|(1) test2 0 4 test NN * - - - - - 0) test2 0 5 . NN *) - - - - - - test2 0 0 It NN (NP* - - - - - (1)|(4 test2 0 1 shows NN * - - - - - - test2 0 2 that NN * - - - - - (2) test2 0 3 the NN * - - - - - (2|(3 test2 0 4 scorer NN * - - - - - 2)|4) test2 0 5 works NN * - - - - - 3) test2 0 6 . NN *) - - - - - - #end document""" self.complicated_mention_document = CoNLLDocument( self.complicated_mention_example) self.for_head_example = """#begin document (wb/a2e/00/a2e_0000); part 000 wb/a2e/00/a2e_0000 0 0 Celebration NN (TOP(S(NP* - - - - * (ARG0* - wb/a2e/00/a2e_0000 0 1 Shooting NN *) shoot - - - * *) - wb/a2e/00/a2e_0000 0 2 Turns VBZ (VP* turn 02 2 - * (V*) - wb/a2e/00/a2e_0000 0 3 Wedding NN (NP*) wed - - - * (ARG1*) (3) wb/a2e/00/a2e_0000 0 4 Into IN (PP* - - - - * (ARG2* - wb/a2e/00/a2e_0000 0 5 a DT (NP* - - - - * * - wb/a2e/00/a2e_0000 0 6 Funeral NN *) - - - - * * - wb/a2e/00/a2e_0000 0 7 in IN (PP* - - - - * * - wb/a2e/00/a2e_0000 0 8 Southern JJ (NP* - - - - * * (14 wb/a2e/00/a2e_0000 0 9 Gaza NNP * - - - - (GPE* * - wb/a2e/00/a2e_0000 0 10 Strip NNP *)))))) - - - - *) *) 14) #end document""" self.for_head_document = CoNLLDocument(self.for_head_example) self.date_mention_example = """#begin document (nw/wsj/24/wsj_2444); part 000 nw/wsj/24/wsj_2444 0 0 Employment NN (TOP(S(NP*) employment 01 1 - * (V*) (ARG1*) - nw/wsj/24/wsj_2444 0 1 is VBZ (VP* be 01 1 - * * (V*) - nw/wsj/24/wsj_2444 0 2 now RB (ADVP*) - - - - * * (ARGM-TMP*) - nw/wsj/24/wsj_2444 0 3 4 CD (ADJP(ADJP(QP* - - - - (PERCENT* * (ARG2* - nw/wsj/24/wsj_2444 0 4 % NN *) - - - - *) * * - nw/wsj/24/wsj_2444 0 5 higher JJR *) - - - - * * * - nw/wsj/24/wsj_2444 0 6 than IN (PP* - - - - * * * - nw/wsj/24/wsj_2444 0 7 in IN (PP* - - - - * * * - nw/wsj/24/wsj_2444 0 8 1983 CD (NP*))))) - - - - (DATE) * *) (16) nw/wsj/24/wsj_2444 0 9 . . *)) - - - - * * * - bc/cctv/00/cctv_0000 0 0 For IN (TOP(S(PP* - - - Speaker#1 * * (ARGM-TMP* - bc/cctv/00/cctv_0000 0 1 two CD (NP* - - - Speaker#1 (DATE* * * (1 bc/cctv/00/cctv_0000 0 2 years NNS *)) - - - Speaker#1 *) * *) 1) bc/cctv/00/cctv_0000 0 3 , , * - - - Speaker#1 * * * - bc/cctv/00/cctv_0000 0 4 Disney NNP (NP*) - - - Speaker#1 (ORG) * (ARG0*) (12) bc/cctv/00/cctv_0000 0 5 has VBZ (VP* have 01 - Speaker#1 * (V*) * - bc/cctv/00/cctv_0000 0 6 constantly RB (ADVP*) - - - Speaker#1 * * (ARGM-MNR*) - bc/cctv/00/cctv_0000 0 7 maintained VBN (VP* maintain 01 1 Speaker#1 * * (V*) - bc/cctv/00/cctv_0000 0 8 its PRP$ (NP* - - - Speaker#1 * * (ARG1* (12) bc/cctv/00/cctv_0000 0 9 mystery NN *))) - - - Speaker#1 * * *) - bc/cctv/00/cctv_0000 0 10 . . *)) - - - Speaker#1 * * * - nw/wsj/24/wsj_2413 0 0 Government NNP (TOP(S(NP(NP* - - - - * (ARG0* * (ARG0* (ARG0* * (16 nw/wsj/24/wsj_2413 0 1 officials NNS *) official - 1 - * * * * * * - nw/wsj/24/wsj_2413 0 2 here RB (UCP(ADVP*) - - - - * * * * * * - nw/wsj/24/wsj_2413 0 3 and CC * - - - - * * * * * * - nw/wsj/24/wsj_2413 0 4 in IN (PP* - - - - * * * * * * - nw/wsj/24/wsj_2413 0 5 other JJ (NP* - - - - * * * * * * - nw/wsj/24/wsj_2413 0 6 countries NNS *)))) country - 3 - * *) * *) *) * 16) nw/wsj/24/wsj_2413 0 7 laid VBD (VP(VP* lay 01 2 - * (V*) * * * * - nw/wsj/24/wsj_2413 0 8 plans NNS (NP(NP*) plan - 2 - * (ARG1* * * * * - nw/wsj/24/wsj_2413 0 9 through IN (PP* - - - - * * * * * * - nw/wsj/24/wsj_2413 0 10 the DT (NP* - - - - (DATE* * * * * * (6 nw/wsj/24/wsj_2413 0 11 weekend NN *)) weekend - - - *) * * * * * 6) nw/wsj/24/wsj_2413 0 12 to TO (S(VP* - - - - * (C-ARG1* * * * * - nw/wsj/24/wsj_2413 0 13 head VB (VP* head 03 6 - * * (V*) * * * - nw/wsj/24/wsj_2413 0 14 off RP (PRT*) - - - - * * * * * * - nw/wsj/24/wsj_2413 0 15 a DT (NP* - - - - * * (ARG1* * * * - nw/wsj/24/wsj_2413 0 16 Monday NNP * - - - - (DATE) * * * * * (8) nw/wsj/24/wsj_2413 0 17 market NN * market - 4 - * * * * * * - nw/wsj/24/wsj_2413 0 18 meltdown NN *)))))) - - - - * *)) *) * * * - nw/wsj/24/wsj_2413 0 19 -- : * - - - - * * * * * * - nw/wsj/24/wsj_2413 0 20 but CC * - - - - * * * * * * - nw/wsj/24/wsj_2413 0 21 went VBD (VP* go 02 1 - * * * (V*) * * - nw/wsj/24/wsj_2413 0 22 out IN (PP* - - - - * * * (ARGM-DIR* * * - nw/wsj/24/wsj_2413 0 23 of IN (PP* - - - - * * * * * * - nw/wsj/24/wsj_2413 0 24 their PRP$ (NP* - - - - * * * * * * (16) nw/wsj/24/wsj_2413 0 25 way NN *))) way - 5 - * * * *) * * - nw/wsj/24/wsj_2413 0 26 to TO (S(VP* - - - - * * * (ARGM-PNC* * * - nw/wsj/24/wsj_2413 0 27 keep VB (VP* keep 02 1 - * * * * (V*) * - nw/wsj/24/wsj_2413 0 28 their PRP$ (NP* - - - - * * * * * (ARG0*) (16) nw/wsj/24/wsj_2413 0 29 moves NNS *) move 02 2 - * * * * * (V*) - nw/wsj/24/wsj_2413 0 30 quiet JJ (S(ADJP*))))))) - - - - * * * *) (ARG1*) * - nw/wsj/24/wsj_2413 0 31 . . *)) - - - - * * * * * * - #end document""" self.date_mention_document = CoNLLDocument(self.date_mention_example)