Example #1
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        instances = []
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)

        for sentence in ontonotes_reader.dataset_iterator(file_path):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                instances.append(
                    self.text_to_instance(tokens, verb_label, tags))
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    instances.append(
                        self.text_to_instance(tokens, verb_indicator, tags))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
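All of the SRL readers in this listing share the same predicate-indicator convention: a token is flagged with 1 exactly when its BIO tag ends in "-V". A minimal standalone sketch of that logic (not taken from any reader):

    # Build the binary verb indicator from a BIO tag sequence.
    tags = ["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "O"]
    verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
    assert verb_indicator == [0, 0, 1, 0, 0]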
Example #2
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(reader.dataset_path_iterator(self.FIXTURES_ROOT / 'conll_2012'))
     expected_paths = [str(self.FIXTURES_ROOT / 'conll_2012' / 'subdomain' / 'example.gold_conll'),
                       str(self.FIXTURES_ROOT / 'conll_2012' / 'subdomain2' / 'example.gold_conll')]
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
Example #3
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        i = 0
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)

            percent_user_spans = 0.0
            if self._simulate_user_inputs and i >= self._fully_labelled_threshold:
                percent_user_spans = 1.0

            i += 1

            yield self.text_to_instance([s.words for s in sentences],
                                        sentences[0].document_id,
                                        sentences[0].sentence_id,
                                        canonical_clusters, percent_user_spans)
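The cluster-offset bookkeeping in the coreference readers (Examples #3, #4, #5, and #20) converts sentence-relative spans to document-relative ones by adding the running token count. A self-contained sketch with hypothetical data:

    import collections

    # Each sentence carries (cluster_id, (start, end)) spans relative to itself.
    sentences = [
        (["John", "met", "Mary", "."], [(0, (0, 0)), (1, (2, 2))]),
        (["He", "smiled", "."], [(0, (0, 0))]),
    ]
    clusters = collections.defaultdict(list)
    total_tokens = 0
    for words, coref_spans in sentences:
        for span_id, (start, end) in coref_spans:
            clusters[span_id].append((start + total_tokens, end + total_tokens))
        total_tokens += len(words)
    assert clusters[0] == [(0, 0), (4, 4)]  # "John" and "He", document-relative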
Example #4
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        ontonotes_reader = Ontonotes()
        for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            instance = self.text_to_instance([s.words for s in sentences], canonical_clusters)
            instances.append(instance)

        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
Example #5
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            speakers = []
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

                speakers.append(sentence.speakers)

            doc_key = sentences[0].document_id
            genre = self.genres[doc_key[:2]]

            speakers = self.flatten(speakers)
            assert total_tokens == len(speakers)

            speaker_dict = {s: i for i, s in enumerate(set(speakers))}
            speaker_ids = np.array([speaker_dict[s] for s in speakers])

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters, speaker_ids, genre)
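The speaker handling above maps each distinct speaker string to an integer id. A small sketch of that encoding (note that iterating a set makes the ids nondeterministic across runs; sorting the speakers first would make them stable):

    speakers = ["spk1", "spk1", "spk2", "spk1"]
    speaker_dict = {s: i for i, s in enumerate(set(speakers))}
    speaker_ids = [speaker_dict[s] for s in speakers]
    assert len(set(speaker_ids)) == 2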
Example #6
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            new_sentences = [s.words for s in sentences]
            flattened_sentences = [
                self._normalize_word(word) for sentence in new_sentences
                for word in sentence
            ]

            def tokenizer(s: str):
                return self.token_indexer.wordpiece_tokenizer(s)

            flattened_sentences = tokenizer(" ".join(flattened_sentences))
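            # NOTE: the wordpiece tokenization above is computed but never
            # passed to text_to_instance below; in this example it appears
            # to be dead code.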
            yield self.text_to_instance([s.words for s in sentences],
                                        canonical_clusters)
Example #7
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    # for i in range(len(tags)):
                    #     if tags[i] != 'O':
                    #         tags[i] = 'I-ARG1'
                    yield self.text_to_instance(tokens, verb_indicator, tags)
Example #8
def main(args):
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ontonotes",
        type=str,
        required=True,
        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0",
    )
    parser.add_argument("--tasks",
                        type=str,
                        nargs="+",
                        help="Tasks, one or more of {const, coref, ner, srl}.")
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=["train", "development", "test", "conll-2012-test"],
        help="Splits, one or more of {train, development, test, conll-2012-test}.",
    )
    parser.add_argument("-o",
                        dest="output_dir",
                        type=str,
                        default=".",
                        help="Output directory for JSON files.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    import pandas as pd

    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        for task in args.tasks:
            source_path = os.path.join(args.ontonotes, "data", split)
            print('########### Reading ontonotes split from', source_path)
            ontonotes_reader = ontonotes.dataset_iterator(
                file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            if not os.path.isdir(task_dir):
                os.mkdir(task_dir)
            target_fname = os.path.join(task_dir, f"{split}.json")
            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader),
                                                   task, ontonotes_stats)

            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)
            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
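A standalone sketch of the argument handling in Example #8 (no OntoNotes data needed), showing how nargs="+" collects one or more values while the splits fall back to their default:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tasks", type=str, nargs="+")
    parser.add_argument("--splits", type=str, nargs="+",
                        default=["train", "development", "test", "conll-2012-test"])
    args = parser.parse_args(["--tasks", "srl", "coref"])
    assert args.tasks == ["srl", "coref"]
    assert "train" in args.splits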
Example #9
    def _read(self, file_path: str):

        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):

            pos_tags = [t for t in sentence.pos_tags]

            tokens = [
                Token(t, None, None, pos_tags[i])
                for i, t in enumerate(sentence.words)
            ]

            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    verb_indices = np.where(np.array(verb_indicator) == 1)[0]

                    if len(verb_indices) > 0:
                        verb_index = int(verb_indices[0])
                        verb = tokens[verb_index]
                    else:
                        verb_index = -1
                        # Use an empty Token (not a bare string) so that
                        # [verb] + tokens below remains a list of Tokens.
                        verb = Token('')

                    for i, tag in enumerate(tags):
                        if tag[0] == 'B':
                            tags[i] = tags[i].replace('B', 'I', 1)
                        if (self.used_tags is not None
                                and tags[i] not in self.used_tags):
                            tags[i] = 'O'

                    instance = self.text_to_instance([verb] + tokens,
                                                     [0] + verb_indicator,
                                                     ['O'] + tags)

                    if self.dependency_parse:
                        doc = self.nlp(' '.join(sentence.words))
                        instance.add_field('dependency', MetadataField(doc))

                    instance.add_field(
                        'verb_index', IndexField(verb_index,
                                                 instance['tokens']))
                    yield instance
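A short sketch of the predicate lookup used in Example #9: numpy's where finds the positions flagged 1, and the first one (if any) is taken as the verb index.

    import numpy as np

    verb_indicator = [0, 0, 1, 0]
    verb_indices = np.where(np.array(verb_indicator) == 1)[0]
    verb_index = int(verb_indices[0]) if len(verb_indices) > 0 else -1
    assert verb_index == 2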
Example #10
    def _read(self, file_path: str):
        """OntoNotes custom reader to load spans from dependency pares tree as well"""
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):

            # skip samples without dep' parse tree
            if not sentence.parse_tree:
                continue

            # extract dep' parse tree spans
            spans = set()
            for subtree in sentence.parse_tree.subtrees():
                if subtree.height() > 0:
                    # TODO: check how to output indices instead of words
                    #  (for extreme cases where different tuples could match)
                    spans.add(tuple(subtree.leaves()))

            tokens = [Token(t) for t in sentence.words]
            if sentence.srl_frames:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    yield self.text_to_instance_with_spans(
                        tokens, verb_indicator, tags, spans)
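The span extraction in Example #10 walks every subtree of the constituency parse and records its leaves. A self-contained sketch with a toy tree (note that nltk's Tree.height() is always at least 1, so the height() > 0 guard above never filters anything):

    from nltk import Tree

    tree = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
    spans = {tuple(subtree.leaves()) for subtree in tree.subtrees()}
    assert ("the", "cat") in spans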
Example #11
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(
         reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     assert files == [
         'tests/fixtures/conll_2012/subdomain/example.gold_conll',
         'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
     ]
Example #12
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(reader.dataset_path_iterator(self.FIXTURES_ROOT / "conll_2012"))
     expected_paths = [
         str(self.FIXTURES_ROOT / "conll_2012" / "subdomain" / "example.gold_conll"),
         str(self.FIXTURES_ROOT / "conll_2012" / "subdomain2" / "example.gold_conll"),
     ]
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
Example #13
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(
         reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     expected_paths = [
         'tests/fixtures/conll_2012/subdomain/example.gold_conll',
         'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
     ]
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
Example #14
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        #Pdb().set_trace()
        data_split = os.path.basename(os.path.normpath(file_path))

        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        # Set random seed if percent is not 100
        if (self.percent_data < 100):
            random.seed(self.random_data_seed)

        # Write sentence, parse tree, span matrix to file
        # fout = open(f"srl_spans_{data_split}.pkl", "wb")

        print(f"return_labels: {self.return_labels}")

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            if (self.percent_data < 100 and data_split == "train"):
                # random.randint is inclusive on both ends, so sample from
                # (1, 100) to get a uniform percentage.
                select_data = random.randint(1, 100)
                if (select_data > self.percent_data):
                    continue
            tokens = [Token(t) for t in sentence.words]
            parseTree = sentence.parse_tree

            # Convert tree to span list

            if not sentence.srl_frames:
                # Sentence contains no predicates.
                verb_label = [0 for _ in tokens]
                if self.return_labels:
                    tags = ["O" for _ in tokens]
                    yield self.text_to_instance(tokens, verb_label, parseTree,
                                                tags)
                else:
                    yield self.text_to_instance(tokens, verb_label, parseTree,
                                                None)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    if self.return_labels:
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    parseTree, tags)
                    else:
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    parseTree, None)
Example #15
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]

            ##########################
            result = self.dependency_tree_predictor.predict(
                sentence=" ".join(sentence.words))
            # print(result['words'])
            root_dict = result['hierplane_tree']['root']
            adj = {}
            self.traverse_tree(adj, root_dict['word'], root_dict)
            predicte_adj = {}
            #########################
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, adj, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    #############################################
                    verb_index = verb_indicator.index(1)
                    predicte = sentence.words[verb_index]

                    if predicte in adj:
                        predicte_adj[predicte] = adj[predicte]
                        # NOTE: this may loop forever, since the list is
                        # extended while it is being iterated over.
                        for i in predicte_adj[predicte]:
                            if i in adj:
                                for j in adj[i]:
                                    predicte_adj[predicte].append(j)
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    predicte_adj, tags)
                    else:
                        # print(" ".join(sentence.words))
                        # print(adj)
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    adj, tags)
Example #16
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info(
            "Reading Fine-Grained NER instances from dataset files at: %s",
            file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(_normalize_word(t)) for t in sentence.words]
            yield self.text_to_instance(tokens, sentence.named_entities)
Example #17
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s", file_path)

        for sentence in ontonotes_reader.dataset_iterator(file_path):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                    yield self.text_to_instance(tokens, verb_indicator, tags)
Example #18
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]

            ##########################
            result = self.dependency_tree_predictor.predict(
                sentence=" ".join(sentence.words))
            predicted_heads = result["predicted_heads"]
            #########################
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                # `adj` is only built in the predicate branch below, so pass
                # an empty adjacency dict here to avoid a NameError.
                yield self.text_to_instance(tokens, verb_label, {}, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    verb_index = verb_indicator.index(1)
                    # #############################################
                    adj = {}
                    self.traverse_predicted_heads(adj, predicted_heads,
                                                  verb_index + 1)
                    # Some verbs have no relations; add a self-loop so the
                    # ListField built later does not fail.
                    adj[verb_index + 1].append(verb_index + 1)
                    ##############################################
                    # verb_index = verb_indicator.index(1)
                    # for i in range(len(tags)):
                    #     if '0' in tags[i]:
                    #         tags[i] = 'B-ARG0'
                    #     elif tags[i] != 'O' and i != verb_index:
                    #         tags[i] = 'B-ARG1'
                    yield self.text_to_instance(tokens, verb_indicator, adj,
                                                tags)
Example #19
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        logger.info(
            "Reading SRL instances along with constituent parse from data files at: %s",
            file_path)

        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]

            parse = sentence.parse_tree
            if parse:
                pos_tags = [x[1] for x in parse.pos()]
                # yield self.text_to_instance(parse.leaves(), [x[1] for x in parse.pos()], parse)
            else:
                # parse information is missing for this sentence
                parse = None
                pos_tags = None

            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags, pos_tags,
                                            parse)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    yield self.text_to_instance(tokens, verb_indicator, tags,
                                                pos_tags, parse)
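Example #19 reads POS tags off the parse tree itself. In nltk, Tree.pos() yields (word, tag) pairs, so x[1] is the tag; a quick sketch:

    from nltk import Tree

    parse = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
    pos_tags = [x[1] for x in parse.pos()]
    assert pos_tags == ["DT", "NN", "VBD"]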
Example #20
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences],
                                        canonical_clusters)
Example #21
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading NER instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.named_entities:
                tags = ["O" for _ in tokens]
            else:
                tags = sentence.named_entities

            if self._coding_scheme == "BIOUL":
                tags = iob1_to_bioul(tags)

            yield self.text_to_instance(tokens, tags)
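Example #21 optionally converts the NER tags from IOB1 to BIOUL. The toy converter below illustrates what that conversion produces; it is an illustration only and handles just the plain IOB1 case (the reader itself uses AllenNLP's iob1_to_bioul):

    def simple_iob1_to_bioul(tags):
        out = []
        for i, tag in enumerate(tags):
            if tag == "O":
                out.append(tag)
                continue
            label = tag[2:]
            starts = i == 0 or tags[i - 1] == "O" or tags[i - 1][2:] != label
            ends = i == len(tags) - 1 or tags[i + 1] != "I-" + label
            if starts and ends:
                out.append("U-" + label)
            elif starts:
                out.append("B-" + label)
            elif ends:
                out.append("L-" + label)
            else:
                out.append("I-" + label)
        return out

    assert simple_iob1_to_bioul(["I-PER", "I-PER", "O", "I-LOC"]) == \
           ["B-PER", "L-PER", "O", "U-LOC"]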
Example #22
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(reader.dataset_iterator('tests/fixtures/conll_2012/'))
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == ['Mali', 'government', 'officials', 'say', 'the', 'woman',
                                    "'s", 'confession', 'was', 'forced', '.']
        assert annotation.pos_tags == ['NNP', 'NN', 'NNS', 'VBP', 'DT',
                                       'NN', 'POS', 'NN', 'VBD', 'JJ', '.']
        assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, '01', None,
                                                     None, None, None, '01', None, None]
        assert annotation.srl_frames == {"say": ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1',
                                                 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O'],
                                         "was": ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1',
                                                 'I-ARG1', 'B-V', 'B-ARG2', 'O']}
        assert annotation.named_entities == ['B-GPE', 'O', 'O', 'O', 'O', 'O',
                                             'O', 'O', 'O', 'O', 'O']
        assert annotation.predicate_lemmas == [None, None, 'official', 'say', None,
                                               'man', None, None, 'be', None, None]
        assert annotation.speakers == [None, None, None, None, None, None,
                                       None, None, None, None, None]

        assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP(NML (NNP Mali)  (NN government) )"
                                                        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
                                                        " (DT the)  (NN woman)  (POS 's) ) (NN "
                                                        "confession) )(VP (VBD was) (ADJP (JJ "
                                                        "forced) ))))) (. .) ))")
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
                                    'after', 'four', 'months', 'of', 'hearings', '.']
        assert annotation.pos_tags == ['DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN',
                                       'IN', 'CD', 'NNS', 'IN', 'NNS', '.']
        assert annotation.word_senses == [None, 2, 5, None, 2, None, None,
                                          None, None, 1, None, 1, None]
        assert annotation.predicate_framenet_ids == [None, None, '01', None, None, None,
                                                     None, None, None, None, None, '01', None]
        assert annotation.srl_frames == {'rested': ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1',
                                                    'I-ARG1', 'B-ARGM-TMP', 'I-ARGM-TMP',
                                                    'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                                                    'I-ARGM-TMP', 'I-ARGM-TMP', 'O'],
                                         'hearings': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                                      'O', 'O', 'O', 'B-V', 'O']}
        assert annotation.named_entities == ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE',
                                             'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O']
        assert annotation.predicate_lemmas == [None, 'prosecution', 'rest', None, 'case',
                                               None, None, None, None, 'month', None, 'hearing', None]
        assert annotation.speakers == [None, None, None, None, None, None,
                                       None, None, None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP (DT The)  (NN prosecution) )(VP "
                                                        "(VBD rested) (NP (PRP$ its)  (NN case) )"
                                                        "(NP (JJ last)  (NN month) )(PP (IN after) "
                                                        "(NP(NP (CD four)  (NNS months) )(PP (IN"
                                                        " of) (NP (NNS hearings) ))))) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        annotation = annotated_sentences[2]
        assert annotation.document_id == 'test/test/03/test_003'
        assert annotation.sentence_id == 0
        assert annotation.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
        assert annotation.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, None, None]
        assert annotation.srl_frames == {}
        assert annotation.named_entities == ['B-PERSON', 'I-PERSON',
                                             'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring("(TOP(FRAG(NP (NNP Denise) "
                                                        " (NNP Dillon) )(NP (NNP Headline)  "
                                                        "(NNP News) ) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1))}
Example #23
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(
            reader.dataset_iterator(self.FIXTURES_ROOT / "conll_2012" / "subdomain")
        )
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "Mali",
            "government",
            "officials",
            "say",
            "the",
            "woman",
            "'s",
            "confession",
            "was",
            "forced",
            ".",
        ]
        assert annotation.pos_tags == [
            "NNP",
            "NN",
            "NNS",
            "VBP",
            "DT",
            "NN",
            "POS",
            "NN",
            "VBD",
            "JJ",
            ".",
        ]
        assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            "01",
            None,
            None,
        ]
        assert annotation.srl_frames == [
            (
                "say",
                [
                    "B-ARG0",
                    "I-ARG0",
                    "I-ARG0",
                    "B-V",
                    "B-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "O",
                ],
            ),
            (
                "was",
                ["O", "O", "O", "O", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "B-V", "B-ARG2", "O"],
            ),
        ]
        assert annotation.named_entities == [
            "B-GPE",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            None,
            "official",
            "say",
            None,
            "man",
            None,
            None,
            "be",
            None,
            None,
        ]
        assert annotation.speakers == [
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        ]

        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP(NML (NNP Mali)  (NN government) )"
            " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
            " (DT the)  (NN woman)  (POS 's) ) (NN "
            "confession) )(VP (VBD was) (ADJP (JJ "
            "forced) ))))) (. .) ))"
        )
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "The",
            "prosecution",
            "rested",
            "its",
            "case",
            "last",
            "month",
            "after",
            "four",
            "months",
            "of",
            "hearings",
            ".",
        ]
        assert annotation.pos_tags == [
            "DT",
            "NN",
            "VBD",
            "PRP$",
            "NN",
            "JJ",
            "NN",
            "IN",
            "CD",
            "NNS",
            "IN",
            "NNS",
            ".",
        ]
        assert annotation.word_senses == [
            None,
            2,
            5,
            None,
            2,
            None,
            None,
            None,
            None,
            1,
            None,
            1,
            None,
        ]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            "01",
            None,
        ]
        assert annotation.srl_frames == [
            (
                "rested",
                [
                    "B-ARG0",
                    "I-ARG0",
                    "B-V",
                    "B-ARG1",
                    "I-ARG1",
                    "B-ARGM-TMP",
                    "I-ARGM-TMP",
                    "B-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "O",
                ],
            ),
            ("hearings", ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-V", "O"]),
        ]
        assert annotation.named_entities == [
            "O",
            "O",
            "O",
            "O",
            "O",
            "B-DATE",
            "I-DATE",
            "O",
            "B-DATE",
            "I-DATE",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            "prosecution",
            "rest",
            None,
            "case",
            None,
            None,
            None,
            None,
            "month",
            None,
            "hearing",
            None,
        ]
        assert annotation.speakers == [
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP (DT The)  (NN prosecution) )(VP "
            "(VBD rested) (NP (PRP$ its)  (NN case) )"
            "(NP (JJ last)  (NN month) )(PP (IN after) "
            "(NP(NP (CD four)  (NNS months) )(PP (IN"
            " of) (NP (NNS hearings) ))))) (. .) ))"
        )
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # Check we can handle sentences without verbs.
        annotation = annotated_sentences[2]
        assert annotation.document_id == "test/test/03/test_003"
        assert annotation.sentence_id == 0
        assert annotation.words == ["Denise", "Dillon", "Headline", "News", "."]
        assert annotation.pos_tags == ["NNP", "NNP", "NNP", "NNP", "."]
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, None, None]
        assert annotation.srl_frames == []
        assert annotation.named_entities == [
            "B-PERSON",
            "I-PERSON",
            "B-WORK_OF_ART",
            "I-WORK_OF_ART",
            "O",
        ]
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(FRAG(NP (NNP Denise) "
            " (NNP Dillon) )(NP (NNP Headline)  "
            "(NNP News) ) (. .) ))"
        )
        assert annotation.coref_spans == {(2, (0, 1))}

        # Check we can handle sentences with 2 identical verbs.
        annotation = annotated_sentences[3]
        assert annotation.document_id == "test/test/04/test_004"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "and",
            "that",
            "wildness",
            "is",
            "still",
            "in",
            "him",
            ",",
            "as",
            "it",
            "is",
            "with",
            "all",
            "children",
            ".",
        ]
        assert annotation.pos_tags == [
            "CC",
            "DT",
            "NN",
            "VBZ",
            "RB",
            "IN",
            "PRP",
            ",",
            "IN",
            "PRP",
            "VBZ",
            "IN",
            "DT",
            "NNS",
            ".",
        ]
        assert annotation.word_senses == [
            None,
            None,
            None,
            4.0,
            None,
            None,
            None,
            None,
            None,
            None,
            5.0,
            None,
            None,
            None,
            None,
        ]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
        ]
        assert annotation.srl_frames == [
            (
                "is",
                [
                    "B-ARGM-DIS",
                    "B-ARG1",
                    "I-ARG1",
                    "B-V",
                    "B-ARGM-TMP",
                    "B-ARG2",
                    "I-ARG2",
                    "O",
                    "B-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "O",
                ],
            ),
            (
                "is",
                [
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "B-ARG1",
                    "B-V",
                    "B-ARG2",
                    "I-ARG2",
                    "I-ARG2",
                    "O",
                ],
            ),
        ]
        assert annotation.named_entities == [
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            None,
            None,
            "be",
            None,
            None,
            None,
            None,
            None,
            None,
            "be",
            None,
            None,
            None,
            None,
        ]
        assert annotation.speakers == [
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
            "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
            "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
            "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
            "children))))))) (. .)))"
        )
        assert annotation.coref_spans == {(14, (6, 6))}
Example #24
 def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
     reader = Ontonotes()
     file_path = self.FIXTURES_ROOT / 'coref' / 'coref.gold_conll'
     documents = list(reader.dataset_document_iterator(file_path))
     assert len(documents) == 2
Example #25
                brat_span_pairs[(predicate, arg_key)] = arg_label

    return ' '.join(tokens), brat_spans, brat_span_pairs


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        'convert conll 2012 format into brat format')
    parser.add_argument('--inp', type=str, required=True, help='input dir')
    parser.add_argument('--out', type=str, required=True, help='output dir')
    parser.add_argument('--merge',
                        action='store_true',
                        help='merge adjacent same sentences')
    args = parser.parse_args()

    ontonotes_reader = Ontonotes()

    print('reading OpenIE instances from dataset files at: {}. '
          'Identical sentences must be consecutive.'.format(args.inp))

    # Treat every n_sent sentences as one document for OpenIE,
    # to reduce the number of files.
    def doc_iter(n_sent):
        doc: List[OntonotesSentence] = []
        for conll_file in ontonotes_reader.dataset_path_iterator(args.inp):
            for sent in ontonotes_reader.sentence_iterator(conll_file):
                same_as_last = False
                if (args.merge and len(doc) > 0
                        and ' '.join(sent.words) == ' '.join(doc[-1].words)):
                    same_as_last = True
Example #26
 def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
     reader = Ontonotes()
     file_path = 'tests/fixtures/coref/coref.gold_conll'
     documents = list(reader.dataset_document_iterator(file_path))
     assert len(documents) == 2
Example #27
    def _read_dataset(self,
                      file_path: str,
                      count_only: bool = False,
                      keep_idx: Optional[Set[int]] = None):
        """
        Yield instances from the file_path.

        Parameters
        ----------
        file_path: str, required
            The path to the data file.
        count_only: bool, optional (default=``False``)
            If True, no instances are returned and instead a dummy object is
            returned. This is useful for quickly counting the number of instances
            in the data file, since creating instances is relatively expensive.
        keep_idx: Set[int], optional (default=``None``)
            If not None, only yield instances whose index is in this set.
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        # Reseed for reproducibility
        self._reseed(seed=self._seed)

        index = 0
        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)

            text_sentences: List[List[str]] = [s.words for s in sentences]
            flattened_text_sentences: List[str] = [
                self._normalize_word(word) for text_sentence in text_sentences
                for word in text_sentence
            ]
            sentence_arc_indices: List[Tuple[int, int]] = []
            sentence_labels: List[str] = []

            # Filter the clusters to only have single-token entities
            # TODO(nfliu): How do we handle spans here?
            filtered_clusters = filter_clusters(canonical_clusters,
                                                max_span_size=1)

            # Check if there are at least two clusters, each of which has at least 2 different items.
            # If not, then skip creating examples from this passage.
            counter = 0
            all_cluster_words = []
            all_cluster_unique_words = []
            for cluster in filtered_clusters:
                # Get the words that show up in the cluster
                cluster_words = list(
                    tuple(flattened_text_sentences[token_index]
                          for token_index in range(item[0], item[1] + 1))
                    for item in cluster)
                all_cluster_words.append(cluster_words)

                cluster_unique_words = set(cluster_words)
                all_cluster_unique_words.append(cluster_unique_words)
                if len(set(cluster_words)) >= 2:
                    counter += 1
            if counter < 2:
                continue

            if keep_idx is not None and index not in keep_idx:
                index += 1
                continue
            if count_only:
                yield 1
                continue

            # Contextualize the tokens if a Contextualizer was provided.
            # TODO (nfliu): How can we make this batched?
            # Would make contextualizers that use the GPU much faster.
            if self._contextualizer:
                token_representations = self._contextualizer(
                    [flattened_text_sentences])[0]
            else:
                token_representations = None

            # For each cluster with 2+ different items, make positive examples between each of the different items
            # that are different strings and make negative examples between each of the different items and a
            # random token from another cluster.
            assert (len(filtered_clusters) == len(all_cluster_words)
                    and len(all_cluster_words) == len(all_cluster_unique_words))

            for cluster_index, (cluster_spans, cluster_words,
                                cluster_unique_words) in enumerate(
                                    zip(filtered_clusters, all_cluster_words,
                                        all_cluster_unique_words)):
                # Don't make examples from this if there is only 1 unique item.
                if len(cluster_unique_words) < 2:
                    continue
                # Get all combinations of cluster spans (a, b), where a occurs
                # in the text before b.
                all_coreferring_spans = []
                for parent_cluster_span in cluster_spans:
                    for child_cluster_span in cluster_spans:
                        # Skip child_cluster_span if it occurs before the parent_span.
                        # TODO (nfliu): this is single-word specific
                        if child_cluster_span[0] < parent_cluster_span[0]:
                            continue

                        # Skip this (child_cluster_span, parent_cluster_span) pair if the words are identical
                        if (flattened_text_sentences[
                                child_cluster_span[0]:child_cluster_span[1] +
                                1] == flattened_text_sentences[
                                    parent_cluster_span[0]:
                                    parent_cluster_span[1] + 1]):
                            continue
                        # Add to the set of coreference candidates
                        all_coreferring_spans.append(
                            (child_cluster_span, parent_cluster_span))

                # Take the coreference_candidates and generate positive and negative examples
                for (child_span, parent_span) in all_coreferring_spans:
                    # TODO (nfliu): This is single-word specific, will have to change
                    # if we generalize to spans
                    sentence_arc_indices.append(
                        (child_span[0], parent_span[0]))
                    sentence_labels.append("1")

                    # Generate a negative example for the child.
                    other_clusters = [
                        cluster for i, cluster in enumerate(filtered_clusters)
                        if i != cluster_index
                    ]
                    negative_coreferent = self._sample_negative_coreferent(
                        other_clusters, child_span[0])
                    if negative_coreferent:
                        sentence_arc_indices.append(
                            (child_span[0], negative_coreferent[0]))
                        sentence_labels.append("0")
            yield self.text_to_instance(
                tokens=flattened_text_sentences,
                arc_indices=sentence_arc_indices,
                token_representations=token_representations,
                labels=sentence_labels)
            index += 1
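The filter_clusters helper used in Example #27 is project-specific and not shown here; a hypothetical stand-in that matches how it is called (keep only mentions up to max_span_size tokens, and drop clusters left with fewer than two mentions) might look like:

    def filter_clusters(clusters, max_span_size=1):
        filtered = []
        for cluster in clusters:
            kept = [(s, e) for (s, e) in cluster if e - s + 1 <= max_span_size]
            if len(kept) >= 2:
                filtered.append(kept)
        return filtered

    assert filter_clusters([[(0, 0), (4, 4), (7, 9)]]) == [[(0, 0), (4, 4)]]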
Example #28
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(
            reader.dataset_iterator('tests/fixtures/conll_2012/subdomain/'))
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'Mali', 'government', 'officials', 'say', 'the', 'woman', "'s",
            'confession', 'was', 'forced', '.'
        ]
        assert annotation.pos_tags == [
            'NNP', 'NN', 'NNS', 'VBP', 'DT', 'NN', 'POS', 'NN', 'VBD', 'JJ',
            '.'
        ]
        assert annotation.word_senses == [
            None, None, 1, 1, None, 2, None, None, 1, None, None
        ]
        assert annotation.predicate_framenet_ids == [
            None, None, None, '01', None, None, None, None, '01', None, None
        ]
        assert annotation.srl_frames == [
            ("say", [
                'B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1',
                'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O'
            ]),
            ("was", [
                'O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1',
                'B-V', 'B-ARG2', 'O'
            ])
        ]
        assert annotation.named_entities == [
            'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
        ]
        assert annotation.predicate_lemmas == [
            None, None, 'official', 'say', None, 'man', None, None, 'be', None,
            None
        ]
        assert annotation.speakers == [
            None, None, None, None, None, None, None, None, None, None, None
        ]

        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP(NML (NNP Mali)  (NN government) )"
            " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
            " (DT the)  (NN woman)  (POS 's) ) (NN "
            "confession) )(VP (VBD was) (ADJP (JJ "
            "forced) ))))) (. .) ))")
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
            'after', 'four', 'months', 'of', 'hearings', '.'
        ]
        assert annotation.pos_tags == [
            'DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN', 'IN', 'CD', 'NNS',
            'IN', 'NNS', '.'
        ]
        assert annotation.word_senses == [
            None, 2, 5, None, 2, None, None, None, None, 1, None, 1, None
        ]
        assert annotation.predicate_framenet_ids == [
            None, None, '01', None, None, None, None, None, None, None, None,
            '01', None
        ]
        assert annotation.srl_frames == [
            ('rested', [
                'B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'B-ARGM-TMP',
                'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                'I-ARGM-TMP', 'I-ARGM-TMP', 'O'
            ]),
            ('hearings', [
                'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-V',
                'O'
            ])
        ]
        assert annotation.named_entities == [
            'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'B-DATE',
            'I-DATE', 'O', 'O', 'O'
        ]
        assert annotation.predicate_lemmas == [
            None, 'prosecution', 'rest', None, 'case', None, None, None, None,
            'month', None, 'hearing', None
        ]
        assert annotation.speakers == [
            None, None, None, None, None, None, None, None, None, None, None,
            None, None
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP (DT The)  (NN prosecution) )(VP "
            "(VBD rested) (NP (PRP$ its)  (NN case) )"
            "(NP (JJ last)  (NN month) )(PP (IN after) "
            "(NP(NP (CD four)  (NNS months) )(PP (IN"
            " of) (NP (NNS hearings) ))))) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # Check we can handle sentences without verbs.
        annotation = annotated_sentences[2]
        assert annotation.document_id == 'test/test/03/test_003'
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'Denise', 'Dillon', 'Headline', 'News', '.'
        ]
        assert annotation.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [
            None, None, None, None, None
        ]
        assert annotation.srl_frames == []
        assert annotation.named_entities == [
            'B-PERSON', 'I-PERSON', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O'
        ]
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(FRAG(NP (NNP Denise) "
            " (NNP Dillon) )(NP (NNP Headline)  "
            "(NNP News) ) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1))}

        # Check we can handle sentences with 2 identical verbs.
        annotation = annotated_sentences[3]
        assert annotation.document_id == 'test/test/04/test_004'
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'and', 'that', 'wildness', 'is', 'still', 'in', 'him', ',', 'as',
            'it', 'is', 'with', 'all', 'children', '.'
        ]
        assert annotation.pos_tags == [
            'CC', 'DT', 'NN', 'VBZ', 'RB', 'IN', 'PRP', ',', 'IN', 'PRP',
            'VBZ', 'IN', 'DT', 'NNS', '.'
        ]
        assert annotation.word_senses == [
            None, None, None, 4.0, None, None, None, None, None, None, 5.0,
            None, None, None, None
        ]
        assert annotation.predicate_framenet_ids == [
            None, None, None, '01', None, None, None, None, None, None, '01',
            None, None, None, None
        ]
        assert annotation.srl_frames == [
            ('is', [
                'B-ARGM-DIS', 'B-ARG1', 'I-ARG1', 'B-V', 'B-ARGM-TMP',
                'B-ARG2', 'I-ARG2', 'O', 'B-ARGM-ADV', 'I-ARGM-ADV',
                'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'O'
            ]),
            ('is', [
                'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ARG1', 'B-V',
                'B-ARG2', 'I-ARG2', 'I-ARG2', 'O'
            ])
        ]
        assert annotation.named_entities == [
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
            'O', 'O'
        ]
        assert annotation.predicate_lemmas == [
            None, None, None, 'be', None, None, None, None, None, None, 'be',
            None, None, None, None
        ]
        assert annotation.speakers == [
            '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
            '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
            '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_'
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
            "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
            "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
            "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
            "children))))))) (. .)))")
        assert annotation.coref_spans == {(14, (6, 6))}
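
The `{(cluster_id, (start, end)), ...}` sets asserted for `coref_spans` above are exactly what the reader yields per sentence. As a small usage sketch (import path as in older AllenNLP releases; the fixture path is assumed), `sentence_iterator` is the single-file counterpart of the directory-walking `dataset_iterator` exercised in this test:

    from allennlp.data.dataset_readers.dataset_utils import Ontonotes

    reader = Ontonotes()
    conll_file = 'tests/fixtures/conll_2012/subdomain/example.gold_conll'
    for sentence in reader.sentence_iterator(conll_file):
        print(sentence.document_id, sentence.sentence_id)
        print(sentence.words)
        print(sentence.coref_spans)  # e.g. {(1, (4, 6)), (3, (4, 7))}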
Ejemplo n.º 29
0
    def __init__(self):
        self.onto_reader = Ontonotes()