Esempio n. 1
0
    def test_get_gold_spans_correctly_extracts_spans(self):
        # A small, fully bracketed parse: only the phrase-level constituents
        # (NP, VP, S) are expected in the collected spans for this reader version.
        reader = PennTreeBankConstituencySpanDatasetReader()
        parse = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
        expected = [((0, 1), 'NP'), ((3, 4), 'NP'), ((2, 4), 'VP'), ((0, 4), 'S')]

        # The helper records its results in the dictionary handed to it.
        gold_spans = {}
        reader._get_gold_spans(parse, 0, gold_spans)  # pylint: disable=protected-access

        # Comparing the items as a list also pins the order in which they were inserted.
        assert list(gold_spans.items()) == expected
 def test_strip_functional_tags(self):
     # Node labels carrying "=PRP", "-0" and "|FUN-TAGS" decorations must be
     # reduced to their bare categories; the tree is modified in place.
     reader = PennTreeBankConstituencySpanDatasetReader()
     decorated = Tree.fromstring(
         "(S (NP=PRP (D the) (N dog)) (VP-0 (V chased) (NP|FUN-TAGS (D the) (N cat))))"
     )
     bare = Tree.fromstring(
         "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")

     reader._strip_functional_tags(decorated)  # pylint: disable=protected-access

     assert decorated == bare
    def test_get_gold_spans_correctly_extracts_spans(self):
        # Checks the (span, label) pairs collected from a simple parse; in this
        # reader version only NP, VP and S constituents are expected.
        reader = PennTreeBankConstituencySpanDatasetReader()
        parse = Tree.fromstring(
            "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
        expected = [
            ((0, 1), "NP"),
            ((3, 4), "NP"),
            ((2, 4), "VP"),
            ((0, 4), "S"),
        ]

        # The helper writes its findings into the dictionary passed as last argument.
        collected = {}
        reader._get_gold_spans(parse, 0, collected)  # pylint: disable=protected-access

        # List comparison: both the content and the insertion order must match.
        assert list(collected.items()) == expected
    def test_get_gold_spans_correctly_extracts_spans_with_nested_labels(self):
        # In this fragment S and VP cover exactly the same tokens, so the test
        # expects a single joined "S-VP" label for that span. Single-token spans
        # are expected to carry "<tag>-POS" labels.
        reader = PennTreeBankConstituencySpanDatasetReader()
        fragment = Tree.fromstring("(S (VP (V chased) (NP (D the) (N cat))))")
        expected = [
            ((0, 0), 'V-POS'),
            ((1, 1), 'D-POS'),
            ((2, 2), 'N-POS'),
            ((1, 2), 'NP'),
            ((0, 2), 'S-VP'),
        ]

        collected = {}
        reader._get_gold_spans(fragment, 0, collected)  # pylint: disable=protected-access

        # Order matters here: the items are compared as a list.
        assert list(collected.items()) == expected
    def test_get_gold_spans_correctly_extracts_spans(self):
        # Plain parse without decorated labels: every pre-terminal is expected
        # as a "<tag>-POS" span alongside the NP, VP and S constituents.
        reader = PennTreeBankConstituencySpanDatasetReader()
        parse = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
        expected = [
            ((0, 0), 'D-POS'),
            ((1, 1), 'N-POS'),
            ((0, 1), 'NP'),
            ((2, 2), 'V-POS'),
            ((3, 3), 'D-POS'),
            ((4, 4), 'N-POS'),
            ((3, 4), 'NP'),
            ((2, 4), 'VP'),
            ((0, 4), 'S'),
        ]

        # The helper records its results in the dictionary handed to it.
        collected = {}
        reader._get_gold_spans(parse, 0, collected)  # pylint: disable=protected-access

        # Compared as a list, so the insertion order is part of the expectation.
        assert list(collected.items()) == expected
Esempio n. 6
0
 def test_get_gold_spans_correctly_extracts_spans_with_nested_labels(self):
     # A realistic parse containing several unary chains, most notably the
     # (WHNP (WHNP (WP what))) fragment. The expectation below has such chains
     # joined into one label per span (e.g. 'WHNP-WHNP', 'S-VP').
     reader = PennTreeBankConstituencySpanDatasetReader()
     parse = Tree.fromstring("""
         (S
     (`` ``)
     (S-TPC
     (NP-SBJ (PRP We))
     (VP
         (VBP have)
         (S
         (VP
             (TO to)
             (VP
             (VP
                 (VB clear)
                 (PRT (RP up))
                 (NP (DT these) (NNS issues)))
             (CC and)
             (VP
                 (VB find)
                 (PRT (RP out))
                 (SBAR-NOM
                 (WHNP (WHNP (WP what)))
                 (S
                     (VP
                     (VBZ is)
                     (ADJP-PRD (JJ present))
                     (SBAR
                         (WHNP (WDT that))
                         (S
                         (VP
                             (VBZ is)
                             (VP
                             (VBG creating)
                             (NP (JJ artificial) (NN volatility)))))))))))))))
     (, ,)
     ('' '')
     (NP-SBJ (NNP Mr.) (NNP Fisher))
     (VP (VBD said))
     (. .))
     """)
     expected = {
         (1, 1): 'NP', (5, 5): 'PRT', (6, 7): 'NP', (4, 7): 'VP',
         (10, 10): 'PRT', (11, 11): 'WHNP-WHNP', (13, 13): 'ADJP',
         (14, 14): 'WHNP', (17, 18): 'NP', (16, 18): 'VP', (15, 18): 'S-VP',
         (14, 18): 'SBAR', (12, 18): 'S-VP', (11, 18): 'SBAR', (9, 18): 'VP',
         (4, 18): 'VP', (3, 18): 'S-VP', (2, 18): 'VP', (1, 18): 'S',
         (21, 22): 'NP', (23, 23): 'VP', (0, 24): 'S',
     }

     collected = {}
     # Functional suffixes such as -TPC / -SBJ / -NOM / -PRD are removed first,
     # so the expected labels above are the bare categories.
     reader._strip_functional_tags(parse)  # pylint: disable=protected-access
     reader._get_gold_spans(parse, 0, collected)  # pylint: disable=protected-access

     # Dict comparison: order of insertion is irrelevant for this check.
     assert collected == expected
    def test_read_from_file(self):
        # Reads the example PTB fixture and, for each of its two instances, checks
        # tokens, POS tags, candidate spans, metadata, and that every span with a
        # real label agrees with the spans extracted from the gold tree.
        reader = PennTreeBankConstituencySpanDatasetReader()
        instances = reader.read('tests/fixtures/data/example_ptb.trees')

        assert len(instances) == 2

        # ---------------- first instance ----------------
        first = instances[0].fields
        tokens = [token.text for token in first["tokens"].tokens]
        pos_tags = first["pos_tags"].labels
        spans = [(item.span_start, item.span_end) for item in first["spans"].field_list]
        span_labels = first["span_labels"].labels

        expected_tokens = [
            'Also', ',', 'because', 'UAL', 'Chairman', 'Stephen', 'Wolf', 'and', 'other',
            'UAL', 'executives', 'have', 'joined', 'the', 'pilots', "'", 'bid', ',', 'the',
            'board', 'might', 'be', 'forced', 'to', 'exclude', 'him', 'from', 'its',
            'deliberations', 'in', 'order', 'to', 'be', 'fair', 'to', 'other', 'bidders', '.',
        ]
        expected_pos_tags = [
            'RB', ',', 'IN', 'NNP', 'NNP', 'NNP', 'NNP', 'CC', 'JJ', 'NNP', 'NNS', 'VBP',
            'VBN', 'DT', 'NNS', 'POS', 'NN', ',', 'DT', 'NN', 'MD', 'VB', 'VBN', 'TO', 'VB',
            'PRP', 'IN', 'PRP$', 'NNS', 'IN', 'NN', 'TO', 'VB', 'JJ', 'TO', 'JJ', 'NNS', '.',
        ]
        assert tokens == expected_tokens
        assert pos_tags == expected_pos_tags

        # Every span enumerated over the tokens must be present as a candidate.
        assert spans == enumerate_spans(tokens)

        expected_tree = Tree.fromstring(
            "(VROOT(S(ADVP(RB Also))(, ,)(SBAR(IN because)"
            "(S(NP(NP(NNP UAL)(NNP Chairman)(NNP Stephen)(NNP Wolf))"
            "(CC and)(NP(JJ other)(NNP UAL)(NNS executives)))(VP(VBP have)"
            "(VP(VBN joined)(NP(NP(DT the)(NNS pilots)(POS '))(NN bid))))))"
            "(, ,)(NP(DT the)(NN board))(VP(MD might)(VP(VB be)(VP(VBN "
            "forced)(S(VP(TO to)(VP(VB exclude)(NP(PRP him))(PP(IN from)"
            "(NP(PRP$ its)(NNS deliberations)))(SBAR(IN in)(NN order)(S("
            "VP(TO to)(VP(VB be)(ADJP(JJ fair)(PP(TO to)(NP(JJ other)(NNS "
            "bidders))))))))))))))(. .)))")

        metadata = first["metadata"].metadata
        assert metadata["gold_tree"] == expected_tree
        assert metadata["tokens"] == tokens

        # Spans labelled "NO-LABEL" are skipped; all others must match the gold spans.
        gold_labels = {}
        reader._get_gold_spans(expected_tree, 0, gold_labels)  # pylint: disable=protected-access
        for span, label in zip(spans, span_labels):
            if label == "NO-LABEL":
                continue
            assert gold_labels[span] == label

        # ---------------- second instance ----------------
        second = instances[1].fields
        tokens = [token.text for token in second["tokens"].tokens]
        pos_tags = second["pos_tags"].labels
        spans = [(item.span_start, item.span_end) for item in second["spans"].field_list]
        span_labels = second["span_labels"].labels

        expected_tokens = [
            'That', 'could', 'cost', 'him', 'the', 'chance', 'to', 'influence', 'the',
            'outcome', 'and', 'perhaps', 'join', 'the', 'winning', 'bidder', '.',
        ]
        expected_pos_tags = [
            'DT', 'MD', 'VB', 'PRP', 'DT', 'NN', 'TO', 'VB', 'DT', 'NN', 'CC', 'RB', 'VB',
            'DT', 'VBG', 'NN', '.',
        ]
        assert tokens == expected_tokens
        assert pos_tags == expected_pos_tags

        assert spans == enumerate_spans(tokens)

        expected_tree = Tree.fromstring(
            "(VROOT(S(NP(DT That))(VP(MD could)(VP(VB cost)(NP(PRP him))"
            "(NP(DT the)(NN chance)(S(VP(TO to)(VP(VP(VB influence)(NP(DT the)"
            "(NN outcome)))(CC and)(VP(ADVP(RB perhaps))(VB join)(NP(DT the)"
            "(VBG winning)(NN bidder)))))))))(. .)))")

        metadata = second["metadata"].metadata
        assert metadata["gold_tree"] == expected_tree
        assert metadata["tokens"] == tokens

        gold_labels = {}
        reader._get_gold_spans(expected_tree, 0, gold_labels)  # pylint: disable=protected-access
        for span, label in zip(spans, span_labels):
            if label == "NO-LABEL":
                continue
            assert gold_labels[span] == label
    def test_read_from_file(self):
        # Reads the example PTB fixture and, for each of its two instances, checks
        # tokens, POS tags, candidate spans, metadata, and that every span with a
        # real label agrees with the spans extracted from the gold tree.
        reader = PennTreeBankConstituencySpanDatasetReader()
        fixture_path = self.FIXTURES_ROOT / "data" / "example_ptb.trees"
        instances = reader.read(str(fixture_path))

        assert len(instances) == 2

        # ---------------- first instance ----------------
        first = instances[0].fields
        tokens = [token.text for token in first["tokens"].tokens]
        pos_tags = first["pos_tags"].labels
        spans = [(item.span_start, item.span_end) for item in first["spans"].field_list]
        span_labels = first["span_labels"].labels

        expected_tokens = [
            "Also", ",", "because", "UAL", "Chairman", "Stephen", "Wolf", "and", "other",
            "UAL", "executives", "have", "joined", "the", "pilots", "'", "bid", ",", "the",
            "board", "might", "be", "forced", "to", "exclude", "him", "from", "its",
            "deliberations", "in", "order", "to", "be", "fair", "to", "other", "bidders", ".",
        ]
        expected_pos_tags = [
            "RB", ",", "IN", "NNP", "NNP", "NNP", "NNP", "CC", "JJ", "NNP", "NNS", "VBP",
            "VBN", "DT", "NNS", "POS", "NN", ",", "DT", "NN", "MD", "VB", "VBN", "TO", "VB",
            "PRP", "IN", "PRP$", "NNS", "IN", "NN", "TO", "VB", "JJ", "TO", "JJ", "NNS", ".",
        ]
        assert tokens == expected_tokens
        assert pos_tags == expected_pos_tags

        # Every span enumerated over the tokens must be present as a candidate.
        assert spans == enumerate_spans(tokens)

        expected_tree = Tree.fromstring(
            "(S(ADVP(RB Also))(, ,)(SBAR(IN because)"
            "(S(NP(NP(NNP UAL)(NNP Chairman)(NNP Stephen)(NNP Wolf))"
            "(CC and)(NP(JJ other)(NNP UAL)(NNS executives)))(VP(VBP have)"
            "(VP(VBN joined)(NP(NP(DT the)(NNS pilots)(POS '))(NN bid))))))"
            "(, ,)(NP(DT the)(NN board))(VP(MD might)(VP(VB be)(VP(VBN "
            "forced)(S(VP(TO to)(VP(VB exclude)(NP(PRP him))(PP(IN from)"
            "(NP(PRP$ its)(NNS deliberations)))(SBAR(IN in)(NN order)(S("
            "VP(TO to)(VP(VB be)(ADJP(JJ fair)(PP(TO to)(NP(JJ other)(NNS "
            "bidders))))))))))))))(. .))"
        )

        metadata = first["metadata"].metadata
        assert metadata["gold_tree"] == expected_tree
        assert metadata["tokens"] == tokens

        # Spans labelled "NO-LABEL" are skipped; all others must match the gold spans.
        gold_labels = {}
        reader._get_gold_spans(expected_tree, 0, gold_labels)  # pylint: disable=protected-access
        for span, label in zip(spans, span_labels):
            if label == "NO-LABEL":
                continue
            assert gold_labels[span] == label

        # ---------------- second instance ----------------
        second = instances[1].fields
        tokens = [token.text for token in second["tokens"].tokens]
        pos_tags = second["pos_tags"].labels
        spans = [(item.span_start, item.span_end) for item in second["spans"].field_list]
        span_labels = second["span_labels"].labels

        expected_tokens = [
            "That", "could", "cost", "him", "the", "chance", "to", "influence", "the",
            "outcome", "and", "perhaps", "join", "the", "winning", "bidder", ".",
        ]
        expected_pos_tags = [
            "DT", "MD", "VB", "PRP", "DT", "NN", "TO", "VB", "DT", "NN", "CC", "RB", "VB",
            "DT", "VBG", "NN", ".",
        ]
        assert tokens == expected_tokens
        assert pos_tags == expected_pos_tags

        assert spans == enumerate_spans(tokens)

        expected_tree = Tree.fromstring(
            "(S(NP(DT That))(VP(MD could)(VP(VB cost)(NP(PRP him))"
            "(NP(DT the)(NN chance)(S(VP(TO to)(VP(VP(VB influence)(NP(DT the)"
            "(NN outcome)))(CC and)(VP(ADVP(RB perhaps))(VB join)(NP(DT the)"
            "(VBG winning)(NN bidder)))))))))(. .))"
        )

        metadata = second["metadata"].metadata
        assert metadata["gold_tree"] == expected_tree
        assert metadata["tokens"] == tokens

        gold_labels = {}
        reader._get_gold_spans(expected_tree, 0, gold_labels)  # pylint: disable=protected-access
        for span, label in zip(spans, span_labels):
            if label == "NO-LABEL":
                continue
            assert gold_labels[span] == label
 def test_strip_functional_tags(self):
     # Labels decorated with "=PRP", "-0" and "|FUN-TAGS" must come back as
     # bare categories; the tree is modified in place by the helper.
     reader = PennTreeBankConstituencySpanDatasetReader()
     decorated = Tree.fromstring(
         "(S (NP=PRP (D the) (N dog)) (VP-0 (V chased) (NP|FUN-TAGS (D the) (N cat))))")
     bare = Tree.fromstring(
         "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")

     reader._strip_functional_tags(decorated)  # pylint: disable=protected-access

     assert decorated == bare
    def test_read_from_file(self):
        # Reads the example PTB fixture and, for each of its two instances, checks
        # tokens, POS tags, candidate spans, metadata, and that every span with a
        # real label agrees with the spans extracted from the gold tree.
        reader = PennTreeBankConstituencySpanDatasetReader()
        fixture_path = self.FIXTURES_ROOT / 'data' / 'example_ptb.trees'
        instances = reader.read(str(fixture_path))

        assert len(instances) == 2

        # ---------------- first instance ----------------
        first = instances[0].fields
        tokens = [token.text for token in first["tokens"].tokens]
        pos_tags = first["pos_tags"].labels
        spans = [(item.span_start, item.span_end) for item in first["spans"].field_list]
        span_labels = first["span_labels"].labels

        expected_tokens = [
            'Also', ',', 'because', 'UAL', 'Chairman', 'Stephen', 'Wolf', 'and', 'other',
            'UAL', 'executives', 'have', 'joined', 'the', 'pilots', "'", 'bid', ',', 'the',
            'board', 'might', 'be', 'forced', 'to', 'exclude', 'him', 'from', 'its',
            'deliberations', 'in', 'order', 'to', 'be', 'fair', 'to', 'other', 'bidders', '.',
        ]
        expected_pos_tags = [
            'RB', ',', 'IN', 'NNP', 'NNP', 'NNP', 'NNP', 'CC', 'JJ', 'NNP', 'NNS', 'VBP',
            'VBN', 'DT', 'NNS', 'POS', 'NN', ',', 'DT', 'NN', 'MD', 'VB', 'VBN', 'TO', 'VB',
            'PRP', 'IN', 'PRP$', 'NNS', 'IN', 'NN', 'TO', 'VB', 'JJ', 'TO', 'JJ', 'NNS', '.',
        ]
        assert tokens == expected_tokens
        assert pos_tags == expected_pos_tags

        # Every span enumerated over the tokens must be present as a candidate.
        assert spans == enumerate_spans(tokens)

        expected_tree = Tree.fromstring(
            "(S(ADVP(RB Also))(, ,)(SBAR(IN because)"
            "(S(NP(NP(NNP UAL)(NNP Chairman)(NNP Stephen)(NNP Wolf))"
            "(CC and)(NP(JJ other)(NNP UAL)(NNS executives)))(VP(VBP have)"
            "(VP(VBN joined)(NP(NP(DT the)(NNS pilots)(POS '))(NN bid))))))"
            "(, ,)(NP(DT the)(NN board))(VP(MD might)(VP(VB be)(VP(VBN "
            "forced)(S(VP(TO to)(VP(VB exclude)(NP(PRP him))(PP(IN from)"
            "(NP(PRP$ its)(NNS deliberations)))(SBAR(IN in)(NN order)(S("
            "VP(TO to)(VP(VB be)(ADJP(JJ fair)(PP(TO to)(NP(JJ other)(NNS "
            "bidders))))))))))))))(. .))")

        metadata = first["metadata"].metadata
        assert metadata["gold_tree"] == expected_tree
        assert metadata["tokens"] == tokens

        # Spans labelled "NO-LABEL" are skipped; all others must match the gold spans.
        gold_labels = {}
        reader._get_gold_spans(expected_tree, 0, gold_labels)  # pylint: disable=protected-access
        for span, label in zip(spans, span_labels):
            if label == "NO-LABEL":
                continue
            assert gold_labels[span] == label

        # ---------------- second instance ----------------
        second = instances[1].fields
        tokens = [token.text for token in second["tokens"].tokens]
        pos_tags = second["pos_tags"].labels
        spans = [(item.span_start, item.span_end) for item in second["spans"].field_list]
        span_labels = second["span_labels"].labels

        expected_tokens = [
            'That', 'could', 'cost', 'him', 'the', 'chance', 'to', 'influence', 'the',
            'outcome', 'and', 'perhaps', 'join', 'the', 'winning', 'bidder', '.',
        ]
        expected_pos_tags = [
            'DT', 'MD', 'VB', 'PRP', 'DT', 'NN', 'TO', 'VB', 'DT', 'NN', 'CC', 'RB', 'VB',
            'DT', 'VBG', 'NN', '.',
        ]
        assert tokens == expected_tokens
        assert pos_tags == expected_pos_tags

        assert spans == enumerate_spans(tokens)

        expected_tree = Tree.fromstring(
            "(S(NP(DT That))(VP(MD could)(VP(VB cost)(NP(PRP him))"
            "(NP(DT the)(NN chance)(S(VP(TO to)(VP(VP(VB influence)(NP(DT the)"
            "(NN outcome)))(CC and)(VP(ADVP(RB perhaps))(VB join)(NP(DT the)"
            "(VBG winning)(NN bidder)))))))))(. .))")

        metadata = second["metadata"].metadata
        assert metadata["gold_tree"] == expected_tree
        assert metadata["tokens"] == tokens

        gold_labels = {}
        reader._get_gold_spans(expected_tree, 0, gold_labels)  # pylint: disable=protected-access
        for span, label in zip(spans, span_labels):
            if label == "NO-LABEL":
                continue
            assert gold_labels[span] == label