Ejemplo n.º 1
0
def main(args):
    """Convert OntoNotes splits into per-task JSON files.

    Parses ``args`` as a command line. For every requested split and task it
    reads ``<ontonotes>/data/<split>``, converts the records with
    ``process_task_split`` and writes them to
    ``<output_dir>/<task>/<split>.json``, logging statistics for each file.

    Args:
        args: Sequence of command-line argument strings (without the program
            name), handed to ``argparse``.

    Raises:
        SystemExit: Raised by ``argparse`` when ``--ontonotes`` or ``--tasks``
            is missing, or when any option is malformed.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ontonotes",
        type=str,
        required=True,
        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0",
    )
    # --tasks has no default; without ``required=True`` an omitted flag leaves
    # ``args.tasks`` as None and the task loop below dies with a TypeError.
    parser.add_argument("--tasks",
                        type=str,
                        nargs="+",
                        required=True,
                        help="Tasks, one or more of {const, coref, ner, srl}.")
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=["train", "development", "test", "conll-2012-test"],
        help=
        "Splits, one or more of {train, development, test, conll-2012-test}.",
    )
    parser.add_argument("-o",
                        dest="output_dir",
                        type=str,
                        default=".",
                        help="Output directory for JSON files.")
    args = parser.parse_args(args)

    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail if the directory appears between a check and the creation.
    os.makedirs(args.output_dir, exist_ok=True)

    import pandas as pd

    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        # The source directory depends on the split only, not on the task.
        source_path = os.path.join(args.ontonotes, "data", split)
        for task in args.tasks:
            print('########### Reading ontonotes split from', source_path)
            # A new iterator is requested for every task (presumably because
            # a previous pass exhausts it -- not verifiable from this file).
            ontonotes_reader = ontonotes.dataset_iterator(
                file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            os.makedirs(task_dir, exist_ok=True)
            target_fname = os.path.join(task_dir, f"{split}.json")
            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader),
                                                   task, ontonotes_stats)

            # Route the records through the stats collector on their way to
            # the writer, then report what was gathered.
            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)
            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
Ejemplo n.º 2
0
    def read(self, file_path: str):
        """Read all SRL instances found under ``file_path`` into a ``Dataset``.

        Each sentence contributes one instance per SRL frame, with a 0/1
        indicator marking the tags that end in "-V". A sentence without any
        frame contributes a single instance whose tags are all "O" and whose
        indicator is all zeros.

        Raises:
            ConfigurationError: If no instance at all was produced.
        """
        # if `file_path` is a URL, redirect to the cache
        resolved_path = cached_path(file_path)
        collected = []
        corpus = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    resolved_path)

        for annotated in corpus.dataset_iterator(resolved_path):
            tokens = [Token(word) for word in annotated.words]

            if annotated.srl_frames:
                # One instance per predicate frame of this sentence.
                collected.extend(
                    self.text_to_instance(
                        tokens,
                        [int(tag.endswith("-V")) for tag in frame_tags],
                        frame_tags)
                    for _, frame_tags in annotated.srl_frames)
                continue

            # Sentence contains no predicates.
            outside_tags = ["O"] * len(tokens)
            no_verb = [0] * len(tokens)
            collected.append(
                self.text_to_instance(tokens, no_verb, outside_tags))

        if collected:
            return Dataset(collected)
        raise ConfigurationError(
            "No instances were read from the given filepath {}. "
            "Is the path correct?".format(resolved_path))
Ejemplo n.º 3
0
    def _read(self, file_path: str):
        """Lazily yield SRL instances for the sentences found under ``file_path``.

        A sentence with SRL frames yields one instance per frame, paired with
        a 0/1 indicator that is 1 where the tag ends in "-V". A sentence with
        no frames yields exactly one instance with all-"O" tags and an
        all-zero indicator.
        """
        # if `file_path` is a URL, redirect to the cache
        local_path = cached_path(file_path)
        corpus = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s", local_path)

        for annotated in corpus.dataset_iterator(local_path):
            tokens = [Token(word) for word in annotated.words]

            if not annotated.srl_frames:
                # Sentence contains no predicates.
                outside_tags = ["O"] * len(tokens)
                no_verb = [0] * len(tokens)
                yield self.text_to_instance(tokens, no_verb, outside_tags)
                continue

            for _, frame_tags in annotated.srl_frames:
                is_verb = [int(tag.endswith("-V")) for tag in frame_tags]
                yield self.text_to_instance(tokens, is_verb, frame_tags)
Ejemplo n.º 4
0
    def test_dataset_iterator(self):
        """Check every annotation layer of the first four sentences read from the subdomain fixture."""
        corpus = Ontonotes()
        sentences = list(corpus.dataset_iterator('tests/fixtures/conll_2012/subdomain/'))

        # Sentence 0: two different predicates, "say" and "was".
        first = sentences[0]
        assert first.document_id == "test/test/01/test_001"
        assert first.sentence_id == 0
        assert first.words == ["Mali", "government", "officials", "say", "the", "woman", "'s",
                               "confession", "was", "forced", "."]
        assert first.pos_tags == ["NNP", "NN", "NNS", "VBP", "DT", "NN", "POS", "NN", "VBD", "JJ", "."]
        assert first.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
        assert first.predicate_framenet_ids == [None] * 3 + ["01"] + [None] * 4 + ["01", None, None]
        say_tags = ["B-ARG0", "I-ARG0", "I-ARG0", "B-V", "B-ARG1"] + ["I-ARG1"] * 5 + ["O"]
        was_tags = ["O"] * 4 + ["B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "B-V", "B-ARG2", "O"]
        assert first.srl_frames == [("say", say_tags), ("was", was_tags)]
        assert first.named_entities == ["B-GPE"] + ["O"] * 10
        assert first.predicate_lemmas == [None, None, "official", "say", None, "man",
                                          None, None, "be", None, None]
        assert first.speakers == [None] * 11
        first_tree = Tree.fromstring("(TOP(S(NP(NML (NNP Mali)  (NN government) )"
                                     " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
                                     " (DT the)  (NN woman)  (POS 's) ) (NN "
                                     "confession) )(VP (VBD was) (ADJP (JJ "
                                     "forced) ))))) (. .) ))")
        assert first.parse_tree == first_tree
        assert first.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        # Sentence 1: a verbal predicate ("rested") and a nominal one ("hearings").
        second = sentences[1]
        assert second.document_id == "test/test/02/test_002"
        assert second.sentence_id == 0
        assert second.words == ["The", "prosecution", "rested", "its", "case", "last", "month",
                                "after", "four", "months", "of", "hearings", "."]
        assert second.pos_tags == ["DT", "NN", "VBD", "PRP$", "NN", "JJ", "NN",
                                   "IN", "CD", "NNS", "IN", "NNS", "."]
        assert second.word_senses == [None, 2, 5, None, 2] + [None] * 4 + [1, None, 1, None]
        assert second.predicate_framenet_ids == [None, None, "01"] + [None] * 8 + ["01", None]
        rested_tags = (["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1", "B-ARGM-TMP", "I-ARGM-TMP",
                        "B-ARGM-TMP"] + ["I-ARGM-TMP"] * 4 + ["O"])
        hearings_tags = ["O"] * 11 + ["B-V", "O"]
        assert second.srl_frames == [("rested", rested_tags), ("hearings", hearings_tags)]
        assert second.named_entities == (["O"] * 5 + ["B-DATE", "I-DATE", "O", "B-DATE", "I-DATE"]
                                         + ["O"] * 3)
        assert second.predicate_lemmas == ([None, "prosecution", "rest", None, "case"] + [None] * 4
                                           + ["month", None, "hearing", None])
        assert second.speakers == [None] * 13
        second_tree = Tree.fromstring("(TOP(S(NP (DT The)  (NN prosecution) )(VP "
                                      "(VBD rested) (NP (PRP$ its)  (NN case) )"
                                      "(NP (JJ last)  (NN month) )(PP (IN after) "
                                      "(NP(NP (CD four)  (NNS months) )(PP (IN"
                                      " of) (NP (NNS hearings) ))))) (. .) ))")
        assert second.parse_tree == second_tree
        assert second.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # Sentence 2: check we can handle sentences without verbs.
        third = sentences[2]
        assert third.document_id == "test/test/03/test_003"
        assert third.sentence_id == 0
        assert third.words == ["Denise", "Dillon", "Headline", "News", "."]
        assert third.pos_tags == ["NNP"] * 4 + ["."]
        assert third.word_senses == [None] * 5
        assert third.predicate_framenet_ids == [None] * 5
        assert third.srl_frames == []
        assert third.named_entities == ["B-PERSON", "I-PERSON", "B-WORK_OF_ART", "I-WORK_OF_ART", "O"]
        assert third.predicate_lemmas == [None] * 5
        assert third.speakers == [None] * 5
        third_tree = Tree.fromstring("(TOP(FRAG(NP (NNP Denise) "
                                     " (NNP Dillon) )(NP (NNP Headline)  "
                                     "(NNP News) ) (. .) ))")
        assert third.parse_tree == third_tree
        assert third.coref_spans == {(2, (0, 1))}

        # Sentence 3: check we can handle sentences with 2 identical verbs.
        fourth = sentences[3]
        assert fourth.document_id == "test/test/04/test_004"
        assert fourth.sentence_id == 0
        assert fourth.words == ["and", "that", "wildness", "is", "still", "in", "him", ",",
                                "as", "it", "is", "with", "all", "children", "."]
        assert fourth.pos_tags == ["CC", "DT", "NN", "VBZ", "RB", "IN", "PRP", ",",
                                   "IN", "PRP", "VBZ", "IN", "DT", "NNS", "."]
        assert fourth.word_senses == [None] * 3 + [4.0] + [None] * 6 + [5.0] + [None] * 4
        assert fourth.predicate_framenet_ids == [None] * 3 + ["01"] + [None] * 6 + ["01"] + [None] * 4
        outer_is_tags = (["B-ARGM-DIS", "B-ARG1", "I-ARG1", "B-V", "B-ARGM-TMP", "B-ARG2", "I-ARG2",
                          "O", "B-ARGM-ADV"] + ["I-ARGM-ADV"] * 5 + ["O"])
        inner_is_tags = ["O"] * 9 + ["B-ARG1", "B-V", "B-ARG2", "I-ARG2", "I-ARG2", "O"]
        assert fourth.srl_frames == [("is", outer_is_tags), ("is", inner_is_tags)]
        assert fourth.named_entities == ["O"] * 15
        assert fourth.predicate_lemmas == [None] * 3 + ["be"] + [None] * 6 + ["be"] + [None] * 4
        assert fourth.speakers == ["_Avalon_"] * 15
        fourth_tree = Tree.fromstring("(TOP (S (CC and) (NP (DT that) (NN wildness)) "
                                      "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
                                      "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
                                      "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
                                      "children))))))) (. .)))")
        assert fourth.parse_tree == fourth_tree
        assert fourth.coref_spans == {(14, (6, 6))}
Ejemplo n.º 5
0
    def test_dataset_iterator(self):
        """Compare the first four fixture sentences against a table of expected field values."""
        sentences = list(Ontonotes().dataset_iterator('tests/fixtures/conll_2012/subdomain/'))

        # One entry per sentence, in reading order; within an entry the fields
        # are listed in the order in which they are checked.
        expectations = [
            # Two different predicates, "say" and "was".
            {
                "document_id": "test/test/01/test_001",
                "sentence_id": 0,
                "words": ["Mali", "government", "officials", "say", "the", "woman",
                          "'s", "confession", "was", "forced", "."],
                "pos_tags": ["NNP", "NN", "NNS", "VBP", "DT", "NN", "POS", "NN", "VBD", "JJ", "."],
                "word_senses": [None, None, 1, 1, None, 2, None, None, 1, None, None],
                "predicate_framenet_ids": [None] * 3 + ["01"] + [None] * 4 + ["01", None, None],
                "srl_frames": [
                    ("say", ["B-ARG0", "I-ARG0", "I-ARG0", "B-V", "B-ARG1"] + ["I-ARG1"] * 5 + ["O"]),
                    ("was", ["O"] * 4 + ["B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "B-V", "B-ARG2", "O"]),
                ],
                "named_entities": ["B-GPE"] + ["O"] * 10,
                "predicate_lemmas": [None, None, "official", "say", None, "man",
                                     None, None, "be", None, None],
                "speakers": [None] * 11,
                "parse_tree": Tree.fromstring(
                    "(TOP(S(NP(NML (NNP Mali)  (NN government) )"
                    " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
                    " (DT the)  (NN woman)  (POS 's) ) (NN "
                    "confession) )(VP (VBD was) (ADJP (JJ "
                    "forced) ))))) (. .) ))"),
                "coref_spans": {(1, (4, 6)), (3, (4, 7))},
            },
            # A verbal predicate ("rested") and a nominal one ("hearings").
            {
                "document_id": "test/test/02/test_002",
                "sentence_id": 0,
                "words": ["The", "prosecution", "rested", "its", "case", "last", "month",
                          "after", "four", "months", "of", "hearings", "."],
                "pos_tags": ["DT", "NN", "VBD", "PRP$", "NN", "JJ", "NN",
                             "IN", "CD", "NNS", "IN", "NNS", "."],
                "word_senses": [None, 2, 5, None, 2] + [None] * 4 + [1, None, 1, None],
                "predicate_framenet_ids": [None, None, "01"] + [None] * 8 + ["01", None],
                "srl_frames": [
                    ("rested", ["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1",
                                "B-ARGM-TMP", "I-ARGM-TMP", "B-ARGM-TMP"]
                     + ["I-ARGM-TMP"] * 4 + ["O"]),
                    ("hearings", ["O"] * 11 + ["B-V", "O"]),
                ],
                "named_entities": (["O"] * 5 + ["B-DATE", "I-DATE", "O", "B-DATE", "I-DATE"]
                                   + ["O"] * 3),
                "predicate_lemmas": ([None, "prosecution", "rest", None, "case"] + [None] * 4
                                     + ["month", None, "hearing", None]),
                "speakers": [None] * 13,
                "parse_tree": Tree.fromstring(
                    "(TOP(S(NP (DT The)  (NN prosecution) )(VP "
                    "(VBD rested) (NP (PRP$ its)  (NN case) )"
                    "(NP (JJ last)  (NN month) )(PP (IN after) "
                    "(NP(NP (CD four)  (NNS months) )(PP (IN"
                    " of) (NP (NNS hearings) ))))) (. .) ))"),
                "coref_spans": {(2, (0, 1)), (2, (3, 3))},
            },
            # Check we can handle sentences without verbs.
            {
                "document_id": "test/test/03/test_003",
                "sentence_id": 0,
                "words": ["Denise", "Dillon", "Headline", "News", "."],
                "pos_tags": ["NNP"] * 4 + ["."],
                "word_senses": [None] * 5,
                "predicate_framenet_ids": [None] * 5,
                "srl_frames": [],
                "named_entities": ["B-PERSON", "I-PERSON", "B-WORK_OF_ART", "I-WORK_OF_ART", "O"],
                "predicate_lemmas": [None] * 5,
                "speakers": [None] * 5,
                "parse_tree": Tree.fromstring(
                    "(TOP(FRAG(NP (NNP Denise) "
                    " (NNP Dillon) )(NP (NNP Headline)  "
                    "(NNP News) ) (. .) ))"),
                "coref_spans": {(2, (0, 1))},
            },
            # Check we can handle sentences with 2 identical verbs.
            {
                "document_id": "test/test/04/test_004",
                "sentence_id": 0,
                "words": ["and", "that", "wildness", "is", "still", "in", "him", ",",
                          "as", "it", "is", "with", "all", "children", "."],
                "pos_tags": ["CC", "DT", "NN", "VBZ", "RB", "IN", "PRP", ",",
                             "IN", "PRP", "VBZ", "IN", "DT", "NNS", "."],
                "word_senses": [None] * 3 + [4.0] + [None] * 6 + [5.0] + [None] * 4,
                "predicate_framenet_ids": [None] * 3 + ["01"] + [None] * 6 + ["01"] + [None] * 4,
                "srl_frames": [
                    ("is", ["B-ARGM-DIS", "B-ARG1", "I-ARG1", "B-V", "B-ARGM-TMP", "B-ARG2",
                            "I-ARG2", "O", "B-ARGM-ADV"] + ["I-ARGM-ADV"] * 5 + ["O"]),
                    ("is", ["O"] * 9 + ["B-ARG1", "B-V", "B-ARG2", "I-ARG2", "I-ARG2", "O"]),
                ],
                "named_entities": ["O"] * 15,
                "predicate_lemmas": [None] * 3 + ["be"] + [None] * 6 + ["be"] + [None] * 4,
                "speakers": ["_Avalon_"] * 15,
                "parse_tree": Tree.fromstring(
                    "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
                    "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
                    "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
                    "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
                    "children))))))) (. .)))"),
                "coref_spans": {(14, (6, 6))},
            },
        ]

        for position, expected in enumerate(expectations):
            sentence = sentences[position]
            for field, value in expected.items():
                assert getattr(sentence, field) == value
Ejemplo n.º 6
0
    def test_dataset_iterator(self):
        """Check all annotation layers of the first three sentences in the conll_2012 fixture.

        In this variant ``srl_frames`` is compared against a dict that maps
        each predicate word to its tag sequence.
        """
        parsed = list(Ontonotes().dataset_iterator('tests/fixtures/conll_2012/'))

        # --- Sentence 0 ---
        sentence = parsed[0]
        expected_frames = {
            "say": ["B-ARG0", "I-ARG0", "I-ARG0", "B-V", "B-ARG1"] + ["I-ARG1"] * 5 + ["O"],
            "was": ["O"] * 4 + ["B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "B-V", "B-ARG2", "O"],
        }
        expected_tree = Tree.fromstring(
            "(TOP(S(NP(NML (NNP Mali)  (NN government) )"
            " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
            " (DT the)  (NN woman)  (POS 's) ) (NN "
            "confession) )(VP (VBD was) (ADJP (JJ "
            "forced) ))))) (. .) ))")
        assert sentence.document_id == "test/test/01/test_001"
        assert sentence.sentence_id == 0
        assert sentence.words == [
            "Mali", "government", "officials", "say", "the", "woman",
            "'s", "confession", "was", "forced", ".",
        ]
        assert sentence.pos_tags == [
            "NNP", "NN", "NNS", "VBP", "DT", "NN", "POS", "NN", "VBD", "JJ", ".",
        ]
        assert sentence.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
        assert sentence.predicate_framenet_ids == [None] * 3 + ["01"] + [None] * 4 + ["01", None, None]
        assert sentence.srl_frames == expected_frames
        assert sentence.named_entities == ["B-GPE"] + ["O"] * 10
        assert sentence.predicate_lemmas == [
            None, None, "official", "say", None, "man", None, None, "be", None, None,
        ]
        assert sentence.speakers == [None] * 11
        assert sentence.parse_tree == expected_tree
        assert sentence.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        # --- Sentence 1 ---
        sentence = parsed[1]
        expected_frames = {
            "rested": (["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1",
                        "B-ARGM-TMP", "I-ARGM-TMP", "B-ARGM-TMP"]
                       + ["I-ARGM-TMP"] * 4 + ["O"]),
            "hearings": ["O"] * 11 + ["B-V", "O"],
        }
        expected_tree = Tree.fromstring(
            "(TOP(S(NP (DT The)  (NN prosecution) )(VP "
            "(VBD rested) (NP (PRP$ its)  (NN case) )"
            "(NP (JJ last)  (NN month) )(PP (IN after) "
            "(NP(NP (CD four)  (NNS months) )(PP (IN"
            " of) (NP (NNS hearings) ))))) (. .) ))")
        assert sentence.document_id == "test/test/02/test_002"
        assert sentence.sentence_id == 0
        assert sentence.words == [
            "The", "prosecution", "rested", "its", "case", "last", "month",
            "after", "four", "months", "of", "hearings", ".",
        ]
        assert sentence.pos_tags == [
            "DT", "NN", "VBD", "PRP$", "NN", "JJ", "NN", "IN", "CD", "NNS", "IN", "NNS", ".",
        ]
        assert sentence.word_senses == [None, 2, 5, None, 2] + [None] * 4 + [1, None, 1, None]
        assert sentence.predicate_framenet_ids == [None, None, "01"] + [None] * 8 + ["01", None]
        assert sentence.srl_frames == expected_frames
        assert sentence.named_entities == (
            ["O"] * 5 + ["B-DATE", "I-DATE", "O", "B-DATE", "I-DATE"] + ["O"] * 3)
        assert sentence.predicate_lemmas == (
            [None, "prosecution", "rest", None, "case"] + [None] * 4
            + ["month", None, "hearing", None])
        assert sentence.speakers == [None] * 13
        assert sentence.parse_tree == expected_tree
        assert sentence.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # --- Sentence 2: no SRL frames at all ---
        sentence = parsed[2]
        expected_tree = Tree.fromstring(
            "(TOP(FRAG(NP (NNP Denise) "
            " (NNP Dillon) )(NP (NNP Headline)  "
            "(NNP News) ) (. .) ))")
        assert sentence.document_id == "test/test/03/test_003"
        assert sentence.sentence_id == 0
        assert sentence.words == ["Denise", "Dillon", "Headline", "News", "."]
        assert sentence.pos_tags == ["NNP"] * 4 + ["."]
        assert sentence.word_senses == [None] * 5
        assert sentence.predicate_framenet_ids == [None] * 5
        assert sentence.srl_frames == {}
        assert sentence.named_entities == [
            "B-PERSON", "I-PERSON", "B-WORK_OF_ART", "I-WORK_OF_ART", "O",
        ]
        assert sentence.predicate_lemmas == [None] * 5
        assert sentence.speakers == [None] * 5
        assert sentence.parse_tree == expected_tree
        assert sentence.coref_spans == {(2, (0, 1))}
Ejemplo n.º 7
0
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(
            reader.dataset_iterator(self.FIXTURES_ROOT / "conll_2012" / "subdomain")
        )
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "Mali",
            "government",
            "officials",
            "say",
            "the",
            "woman",
            "'s",
            "confession",
            "was",
            "forced",
            ".",
        ]
        assert annotation.pos_tags == [
            "NNP",
            "NN",
            "NNS",
            "VBP",
            "DT",
            "NN",
            "POS",
            "NN",
            "VBD",
            "JJ",
            ".",
        ]
        assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            "01",
            None,
            None,
        ]
        assert annotation.srl_frames == [
            (
                "say",
                [
                    "B-ARG0",
                    "I-ARG0",
                    "I-ARG0",
                    "B-V",
                    "B-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "O",
                ],
            ),
            (
                "was",
                ["O", "O", "O", "O", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "B-V", "B-ARG2", "O"],
            ),
        ]
        assert annotation.named_entities == [
            "B-GPE",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            None,
            "official",
            "say",
            None,
            "man",
            None,
            None,
            "be",
            None,
            None,
        ]
        assert annotation.speakers == [
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        ]

        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP(NML (NNP Mali)  (NN government) )"
            " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
            " (DT the)  (NN woman)  (POS 's) ) (NN "
            "confession) )(VP (VBD was) (ADJP (JJ "
            "forced) ))))) (. .) ))"
        )
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "The",
            "prosecution",
            "rested",
            "its",
            "case",
            "last",
            "month",
            "after",
            "four",
            "months",
            "of",
            "hearings",
            ".",
        ]
        assert annotation.pos_tags == [
            "DT",
            "NN",
            "VBD",
            "PRP$",
            "NN",
            "JJ",
            "NN",
            "IN",
            "CD",
            "NNS",
            "IN",
            "NNS",
            ".",
        ]
        assert annotation.word_senses == [
            None,
            2,
            5,
            None,
            2,
            None,
            None,
            None,
            None,
            1,
            None,
            1,
            None,
        ]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            "01",
            None,
        ]
        assert annotation.srl_frames == [
            (
                "rested",
                [
                    "B-ARG0",
                    "I-ARG0",
                    "B-V",
                    "B-ARG1",
                    "I-ARG1",
                    "B-ARGM-TMP",
                    "I-ARGM-TMP",
                    "B-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "O",
                ],
            ),
            ("hearings", ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-V", "O"]),
        ]
        assert annotation.named_entities == [
            "O",
            "O",
            "O",
            "O",
            "O",
            "B-DATE",
            "I-DATE",
            "O",
            "B-DATE",
            "I-DATE",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            "prosecution",
            "rest",
            None,
            "case",
            None,
            None,
            None,
            None,
            "month",
            None,
            "hearing",
            None,
        ]
        assert annotation.speakers == [
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP (DT The)  (NN prosecution) )(VP "
            "(VBD rested) (NP (PRP$ its)  (NN case) )"
            "(NP (JJ last)  (NN month) )(PP (IN after) "
            "(NP(NP (CD four)  (NNS months) )(PP (IN"
            " of) (NP (NNS hearings) ))))) (. .) ))"
        )
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # Check we can handle sentences without verbs.
        annotation = annotated_sentences[2]
        assert annotation.document_id == "test/test/03/test_003"
        assert annotation.sentence_id == 0
        assert annotation.words == ["Denise", "Dillon", "Headline", "News", "."]
        assert annotation.pos_tags == ["NNP", "NNP", "NNP", "NNP", "."]
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, None, None]
        assert annotation.srl_frames == []
        assert annotation.named_entities == [
            "B-PERSON",
            "I-PERSON",
            "B-WORK_OF_ART",
            "I-WORK_OF_ART",
            "O",
        ]
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(FRAG(NP (NNP Denise) "
            " (NNP Dillon) )(NP (NNP Headline)  "
            "(NNP News) ) (. .) ))"
        )
        assert annotation.coref_spans == {(2, (0, 1))}

        # Check we can handle sentences with 2 identical verbs.
        annotation = annotated_sentences[3]
        assert annotation.document_id == "test/test/04/test_004"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "and",
            "that",
            "wildness",
            "is",
            "still",
            "in",
            "him",
            ",",
            "as",
            "it",
            "is",
            "with",
            "all",
            "children",
            ".",
        ]
        assert annotation.pos_tags == [
            "CC",
            "DT",
            "NN",
            "VBZ",
            "RB",
            "IN",
            "PRP",
            ",",
            "IN",
            "PRP",
            "VBZ",
            "IN",
            "DT",
            "NNS",
            ".",
        ]
        assert annotation.word_senses == [
            None,
            None,
            None,
            4.0,
            None,
            None,
            None,
            None,
            None,
            None,
            5.0,
            None,
            None,
            None,
            None,
        ]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
        ]
        assert annotation.srl_frames == [
            (
                "is",
                [
                    "B-ARGM-DIS",
                    "B-ARG1",
                    "I-ARG1",
                    "B-V",
                    "B-ARGM-TMP",
                    "B-ARG2",
                    "I-ARG2",
                    "O",
                    "B-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "O",
                ],
            ),
            (
                "is",
                [
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "B-ARG1",
                    "B-V",
                    "B-ARG2",
                    "I-ARG2",
                    "I-ARG2",
                    "O",
                ],
            ),
        ]
        assert annotation.named_entities == [
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            None,
            None,
            "be",
            None,
            None,
            None,
            None,
            None,
            None,
            "be",
            None,
            None,
            None,
            None,
        ]
        assert annotation.speakers == [
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
            "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
            "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
            "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
            "children))))))) (. .)))"
        )
        assert annotation.coref_spans == {(14, (6, 6))}