예제 #1
0
 def test_dataset_path_iterator(self):
     """dataset_path_iterator yields every .gold_conll file under the fixture root."""
     reader = Ontonotes()
     found_paths = list(reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     expected_paths = [
         'tests/fixtures/conll_2012/subdomain/example.gold_conll',
         'tests/fixtures/conll_2012/subdomain2/example.gold_conll',
     ]
     # Compare as sets: only membership matters, not traversal order.
     assert len(found_paths) == len(expected_paths)
     assert set(found_paths) == set(expected_paths)
예제 #2
0
 def test_dataset_path_iterator(self):
     """Both fixture subdomains' example.gold_conll files are discovered."""
     reader = Ontonotes()
     fixture_dir = self.FIXTURES_ROOT / 'conll_2012'
     found_paths = list(reader.dataset_path_iterator(fixture_dir))
     expected_paths = [
         str(fixture_dir / subdomain / 'example.gold_conll')
         for subdomain in ('subdomain', 'subdomain2')
     ]
     # Order of directory traversal is irrelevant; compare as sets.
     assert len(found_paths) == len(expected_paths)
     assert set(found_paths) == set(expected_paths)
예제 #3
0
    def read(self, file_path: str):
        """
        Read every CoNLL-2012 formatted document found under ``file_path``
        and build one coreference instance per document.

        Parameters
        ----------
        file_path : str
            A local directory path or a URL; URLs are resolved to a local
            cached copy via ``cached_path``.

        Returns
        -------
        A ``Dataset`` containing one instance per document.

        Raises
        ------
        ConfigurationError
            If no instances could be read from ``file_path``.
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        ontonotes_reader = Ontonotes()
        for document in ontonotes_reader.dataset_path_iterator(file_path):
            # Maps each cluster id to its token spans, expressed in
            # document-level token offsets.
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)
            # `list(...)` instead of a pass-through comprehension.
            sentences = list(ontonotes_reader.sentence_iterator(document))

            total_tokens = 0
            for sentence in sentences:
                # Coref annotations are on a _per sentence_ basis, so we
                # need to adjust them to be relative to the length of the
                # document.
                for span_id, (start, end) in sentence.coref_spans:
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            instances.append(
                self.text_to_instance([s.words for s in sentences],
                                      canonical_clusters))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
예제 #4
0
 def test_dataset_path_iterator(self):
     """dataset_path_iterator finds every .gold_conll file under the root.

     Compare length + set rather than an exact list: directory traversal
     order is filesystem-dependent, so pinning the order makes the test
     flaky. This also matches how the sibling tests assert.
     """
     reader = Ontonotes()
     files = list(
         reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     expected_paths = [
         'tests/fixtures/conll_2012/subdomain/example.gold_conll',
         'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
     ]
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
예제 #5
0
 def test_dataset_path_iterator(self):
     """The iterator locates exactly the two fixture .gold_conll files."""
     reader = Ontonotes()
     fixture_root = self.FIXTURES_ROOT / "conll_2012"
     discovered = list(reader.dataset_path_iterator(fixture_root))
     expected_paths = [
         str(fixture_root / subdomain / "example.gold_conll")
         for subdomain in ("subdomain", "subdomain2")
     ]
     # Membership, not ordering, is what we care about here.
     assert len(discovered) == len(expected_paths)
     assert set(discovered) == set(expected_paths)
예제 #6
0
 def test_dataset_path_iterator(self):
     """Each fixture subdomain contributes exactly one .gold_conll file."""
     reader = Ontonotes()
     discovered = list(
         reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     expected_paths = [
         'tests/fixtures/conll_2012/subdomain/example.gold_conll',
         'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
     ]
     # Set comparison: traversal order is not part of the contract.
     assert len(discovered) == len(expected_paths)
     assert set(discovered) == set(expected_paths)
예제 #7
0
 def _ontonotes_subset(
         ontonotes_reader: Ontonotes, file_path: str,
         domain_identifier: str) -> Iterable[OntonotesSentence]:
     """
     Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.
     If the domain identifier is present, only examples which contain the domain
     identifier in the file path are yielded.
     """
     for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
         # Guard clause: skip files outside the requested domain (if any).
         if domain_identifier is not None and f"/{domain_identifier}/" not in conll_file:
             continue
         yield from ontonotes_reader.sentence_iterator(conll_file)
예제 #8
0
 def _ontonotes_subset(ontonotes_reader: Ontonotes,
                       file_path: str,
                       domain_identifier: str) -> Iterable[OntonotesSentence]:
     """
     Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.
     If the domain identifier is present, only examples which contain the domain
     identifier in the file path are yielded. Files whose path contains "/pt/"
     are always skipped.
     """
     for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
         # Unconditionally exclude the "pt" subdirectory.
         if "/pt/" in conll_file:
             continue
         # Skip files outside the requested domain, when one is given.
         if domain_identifier is not None and f"/{domain_identifier}/" not in conll_file:
             continue
         yield from ontonotes_reader.sentence_iterator(conll_file)
예제 #9
0
 def _ontonotes_subset(
         ontonotes_reader: Ontonotes, file_path: str,
         domain_identifier: str) -> Iterable[OntonotesSentence]:
     """
     Yield every sentence from every CoNLL file found under ``file_path``.

     NOTE(review): ``domain_identifier`` is accepted but never used — no
     domain filtering happens in this variant; confirm that is intentional.
     """
     for conll_path in ontonotes_reader.dataset_path_iterator(file_path):
         for sentence in ontonotes_reader.sentence_iterator(conll_path):
             yield sentence