Example 1
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations use sentence-local token indices, so we
                    # offset them by the number of tokens seen so far to make
                    # them relative to the whole document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            yield self.text_to_instance([s.words for s in sentences], list(clusters.values()))
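The only subtlety in this reader is that Ontonotes coref spans use sentence-local token indices, so each span is shifted by the running token count to become document-level. A minimal, self-contained sketch of that offset logic, using hypothetical toy annotations rather than the real Ontonotes sentence objects:

import collections
from typing import DefaultDict, List, Tuple

# Hypothetical per-sentence annotations: (cluster_id, (start, end)) with
# token indices local to each sentence.
toy_sentences = [
    {"words": ["John", "went", "home", "."], "coref_spans": [(0, (0, 0))]},
    {"words": ["He", "slept", "."], "coref_spans": [(0, (0, 0))]},
]

clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
total_tokens = 0
for sentence in toy_sentences:
    for span_id, (start, end) in sentence["coref_spans"]:
        # Shift sentence-local indices by the tokens seen so far.
        clusters[span_id].append((start + total_tokens, end + total_tokens))
    total_tokens += len(sentence["words"])

print(list(clusters.values()))  # [[(0, 0), (4, 4)]]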
Example 2
    def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
        reader = Ontonotes()
        file_path = FIXTURES_ROOT / "coref" / "coref.gold_conll"
        documents = list(reader.dataset_document_iterator(file_path))
        assert len(documents) == 4
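A small usage sketch built on the same `Ontonotes` reader (the file path here is hypothetical): each item yielded by `dataset_document_iterator` is the list of sentence objects for one document, which is why the test above expects four documents from the fixture file.

reader = Ontonotes()
for document in reader.dataset_document_iterator("coref.gold_conll"):
    # `document` is a list of sentence objects with fields such as
    # `.words` and `.coref_spans` (see the other examples).
    n_tokens = sum(len(sentence.words) for sentence in document)
    print(len(document), "sentences,", n_tokens, "tokens")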
Example 3
    def _read(self, file_paths: str):
        read_from_pickle = False
        # If instances from a previous run were pickled, yield those instead of
        # re-reading the raw files.
        if self._pickle_path is not None and not self._test_run:
            if os.path.exists(self._pickle_path):
                read_from_pickle = True
                with open(self._pickle_path, 'rb') as f:
                    instances = pickle.load(f)
                for instance in instances:
                    yield instance
        if not read_from_pickle:
            # `file_paths` is a comma-separated list; parallel corpora and
            # Ontonotes CoNLL files are handled differently below.
            for file_path in file_paths.split(","):
                if "parallel" in file_path:
                    # Parallel files hold alternating lines: a whitespace-tokenised
                    # source sentence followed by its translation.
                    with open(file_path) as f:
                        lines = f.readlines()
                    for i in range(len(lines) // 2):
                        if self._limit > 0 and i >= self._limit:
                            break
                        sentence1 = [lines[2 * i].strip().split()]
                        if self._parallel_tokenizer is None:
                            # sentence2 = self._parallel_stanza(lines[2*i+1].strip())
                            # sentence2 = [[token["text"] for token in sentence] for sentence in sentence2.to_dict()]
                            # Fall back to jieba; jieba.tokenize yields
                            # (word, start, end) tuples, so keep only the word.
                            assert self._parallel_jieba
                            sentence2 = [[
                                token[0]
                                for token in jieba.tokenize(lines[2 * i + 1].strip())
                            ]]
                        else:
                            sentence2 = [
                                self._parallel_tokenizer.tokenize(lines[2 * i + 1].strip())
                            ]
                        if self._parallel_reverse:
                            # Swap the source and target sides.
                            sentence1, sentence2 = sentence2, sentence1
                        instance = self.text_to_instance(
                            sentences=sentence1,
                            document_id=file_path + "_" + str(i),
                            language="parallel",
                            parallel_sentences=sentence2)
                        yield instance
                else:
                    # if `file_path` is a URL, redirect to the cache
                    file_path = cached_path(file_path)
                    # The language code is the second-to-last dot-separated
                    # piece of the file name.
                    language = file_path.split(".")[-2]

                    ontonotes_reader = Ontonotes(multiple_tags=True)
                    instances = []
                    for sentences in ontonotes_reader.dataset_document_iterator(
                            file_path):
                        if self._limit > 0 and len(instances) >= self._limit:
                            break
                        document_id = (sentences[0].document_id + "_" +
                                       str(sentences[0].sentence_id))
                        if self._individual_sentences:
                            # Build one instance per sentence instead of one
                            # per document.
                            for sentence in sentences:
                                clusters, srl_frames, named_entities, named_entity_spans = \
                                    self.process_sentences([sentence])
                                instance = self.text_to_instance(
                                    sentences=[sentence.words],
                                    document_id=document_id,
                                    gold_clusters=list(clusters.values()),
                                    srl_frames=srl_frames,
                                    named_entities=named_entities,
                                    language=language,
                                    sentence_objects=[sentence],
                                    named_entity_spans=named_entity_spans)
                                if instance is not None and ("srl_labels"
                                                             in instance.fields
                                                             or not self._srl):
                                    instances.append(instance)
                                    yield instance
                        else:
                            # Build a single instance covering the whole document.
                            clusters, srl_frames, named_entities, named_entity_spans = \
                                self.process_sentences(sentences)
                            instance = self.text_to_instance(
                                sentences=[s.words for s in sentences],
                                document_id=document_id,
                                gold_clusters=list(clusters.values()),
                                srl_frames=srl_frames,
                                named_entities=named_entities,
                                language=language,
                                sentence_objects=sentences,
                                named_entity_spans=named_entity_spans)
                            instances.append(instance)
                            yield instance
                        if self._test_run:
                            break
                    if not self._test_run and self._pickle_path is not None:
                        # Cache the built instances so later reads can load
                        # them straight from the pickle.
                        with open(self._pickle_path, 'wb') as f:
                            pickle.dump(instances, f)
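For the "parallel" branch above, the reader assumes an alternating-line format: line 2*i holds a whitespace-tokenised source sentence and line 2*i + 1 its translation. A stripped-down sketch of just that file format (the function name and file path are hypothetical):

def read_parallel_pairs(path, limit=0):
    # Yields (source_tokens, target_text) pairs from an alternating-line file.
    with open(path) as f:
        lines = f.readlines()
    for i in range(len(lines) // 2):
        if limit > 0 and i >= limit:
            break
        source = lines[2 * i].strip().split()
        target = lines[2 * i + 1].strip()  # tokenised later (jieba or a tokenizer)
        yield source, target

for source, target in read_parallel_pairs("zh-en.parallel.txt", limit=2):
    print(source, "->", target)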