def test_dataset_path_iterator(self):
    """Every *.gold_conll file under CONLL_PATH should be discovered."""
    ontonotes = Ontonotes()
    discovered = list(ontonotes.dataset_path_iterator(CONLL_PATH))
    expected = [
        str(CONLL_PATH / subdir / "example.gold_conll")
        for subdir in ("subdomain", "subdomain2")
    ]
    # Compare as sets: directory traversal order is not guaranteed.
    assert len(discovered) == len(expected)
    assert set(discovered) == set(expected)
def _ontonotes_subset(
        ontonotes_reader: Ontonotes, file_path: str,
        domain_identifier: str) -> Iterable[OntonotesSentence]:
    """
    Walk the CoNLL files under ``file_path`` and yield their sentences.

    If ``domain_identifier`` is given, only files whose path contains it
    as a directory component (``/<domain>/``) contribute sentences; when
    it is ``None`` every discovered file is used.
    """
    keep_all = domain_identifier is None
    marker = None if keep_all else f"/{domain_identifier}/"
    for conll_path in ontonotes_reader.dataset_path_iterator(file_path):
        if keep_all or marker in conll_path:
            yield from ontonotes_reader.sentence_iterator(conll_path)
def _read(self, file_path: str):
    """
    Read SRL instances from the Ontonotes files under ``file_path``.

    Yields one instance per (sentence, predicate) pair.  A sentence with
    no predicates yields a single all-"O" instance.  For predicate-bearing
    sentences the predicate token is prepended to the token list (with a
    matching 0 prepended to the verb indicator and 'O' to the tags).
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info(
            "Filtering to only include file paths containing the %s domain",
            self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        # Tokens carry the sentence's POS tags (4th positional Token arg).
        pos_tags = [t for t in sentence.pos_tags]
        tokens = [
            Token(t, None, None, pos_tags[i])
            for i, t in enumerate(sentence.words)
        ]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                # 1 marks the predicate position(s): tags ending in "-V".
                verb_indicator = [
                    1 if label[-2:] == "-V" else 0 for label in tags
                ]
                verb_indices = np.where(np.array(verb_indicator) == 1)[0]
                if len(verb_indices) > 0:
                    verb_index = int(verb_indices[0])
                    verb = tokens[verb_index]
                else:
                    # NOTE(review): when a frame has no "-V" tag, `verb`
                    # is the empty string (not a Token) and verb_index is
                    # -1 — confirm downstream code tolerates both.
                    verb_index = -1
                    verb = ''
                for i, tag in enumerate(tags):
                    # Collapse BIO to IO by rewriting the leading "B" to
                    # "I" (replace count=1 only touches the first char
                    # because of the tag[0] guard).
                    if tag[0] == 'B':
                        tags[i] = tags[i].replace('B', 'I', 1)
                    # Tags outside the configured whitelist become "O".
                    if self.used_tags is not None and tags[i] not in self.used_tags:
                        tags[i] = 'O'
                # Predicate token is prepended, with neutral labels.
                instance = self.text_to_instance([verb] + tokens,
                                                 [0] + verb_indicator,
                                                 ['O'] + tags)
                if self.dependency_parse:
                    # NOTE(review): self.nlp looks like a spaCy pipeline
                    # re-parsing the joined words — confirm; tokenization
                    # may not match the gold tokens.
                    doc = self.nlp(' '.join(sentence.words))
                    instance.add_field('dependency', MetadataField(doc))
                instance.add_field(
                    'verb_index', IndexField(verb_index, instance['tokens']))
                yield instance
def _read(self, file_path: str):
    """
    Read SRL instances from the Ontonotes files under ``file_path``,
    yielding one instance per (sentence, predicate) pair and a single
    all-"O" instance for sentences with no predicates.
    """
    # A URL is transparently downloaded and replaced by its cache path.
    file_path = cached_path(file_path)
    reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info(
            "Filtering to only include file paths containing the %s domain",
            self._domain_identifier,
        )
    sentences = self._ontonotes_subset(reader, file_path,
                                       self._domain_identifier)
    for sentence in sentences:
        tokens = [Token(word) for word in sentence.words]
        if not sentence.srl_frames:
            # No predicates: emit one neutral instance for the sentence.
            empty_tags = ["O"] * len(tokens)
            no_verb = [0] * len(tokens)
            yield self.text_to_instance(tokens, no_verb, empty_tags)
            continue
        for _, frame_tags in sentence.srl_frames:
            # 1 marks the predicate position(s): tags ending in "-V".
            indicator = [int(label[-2:] == "-V") for label in frame_tags]
            yield self.text_to_instance(tokens, indicator, frame_tags)
def _read(self, file_path: str):
    """
    Read coreference instances: one instance per CoNLL document, with
    cluster spans converted from per-sentence to document-level offsets.
    """
    # A URL is transparently downloaded and replaced by its cache path.
    file_path = cached_path(file_path)
    reader = Ontonotes()
    for document in reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        # Coref spans are annotated per sentence, so shift each span by
        # the number of tokens that precede its sentence in the document.
        offset = 0
        for sentence in document:
            for cluster_id, (begin, finish) in sentence.coref_spans:
                clusters[cluster_id].append((begin + offset, finish + offset))
            offset += len(sentence.words)
        words_per_sentence = [s.words for s in document]
        yield self.text_to_instance(words_per_sentence,
                                    list(clusters.values()))
def _read(self, file_path: str):
    """
    Read fine-grained NER instances: one instance per sentence, with
    words normalized before tokenization.
    """
    # A URL is transparently downloaded and replaced by its cache path.
    file_path = cached_path(file_path)
    reader = Ontonotes()
    logger.info(
        "Reading Fine-Grained NER instances from dataset files at: %s",
        file_path)
    if self._domain_identifier is not None:
        logger.info(
            "Filtering to only include file paths containing the %s domain",
            self._domain_identifier,
        )
    for sentence in self._ontonotes_subset(reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(_normalize_word(word)) for word in sentence.words]
        yield self.text_to_instance(tokens, sentence.named_entities)
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    """A single *.gold_conll file may contain several documents."""
    ontonotes = Ontonotes()
    conll_file = FIXTURES_ROOT / "coref" / "coref.gold_conll"
    documents = list(ontonotes.dataset_document_iterator(conll_file))
    assert len(documents) == 4
def test_dataset_iterator(self):
    """Walk the fixture subdomain with ``dataset_iterator`` and assert every
    annotation layer of each sentence: words, POS tags, word senses,
    PropBank frameset ids, SRL frames, named entities, predicate lemmas,
    speakers, the constituency parse, and the coreference spans.
    """
    reader = Ontonotes()
    annotated_sentences = list(
        reader.dataset_iterator(CONLL_PATH / "subdomain"))

    # Sentence 1: two predicates ("say", "was") and one GPE entity.
    annotation = annotated_sentences[0]
    assert annotation.document_id == "test/test/01/test_001"
    assert annotation.sentence_id == 0
    assert annotation.words == [
        "Mali", "government", "officials", "say", "the", "woman", "'s",
        "confession", "was", "forced", ".",
    ]
    assert annotation.pos_tags == [
        "NNP", "NN", "NNS", "VBP", "DT", "NN", "POS", "NN", "VBD", "JJ", ".",
    ]
    assert annotation.word_senses == [
        None, None, 1, 1, None, 2, None, None, 1, None, None
    ]
    assert annotation.predicate_framenet_ids == [
        None, None, None, "01", None, None, None, None, "01", None, None,
    ]
    assert annotation.srl_frames == [
        (
            "say",
            [
                "B-ARG0", "I-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1",
                "I-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "O",
            ],
        ),
        (
            "was",
            [
                "O", "O", "O", "O", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1",
                "B-V", "B-ARG2", "O"
            ],
        ),
    ]
    assert annotation.named_entities == [
        "B-GPE", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
    ]
    assert annotation.predicate_lemmas == [
        None, None, "official", "say", None, "man", None, None, "be", None,
        None,
    ]
    assert annotation.speakers == [
        None, None, None, None, None, None, None, None, None, None, None,
    ]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(S(NP(NML (NNP Mali) (NN government) )"
        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
        " (DT the) (NN woman) (POS 's) ) (NN "
        "confession) )(VP (VBD was) (ADJP (JJ "
        "forced) ))))) (. .) ))")
    assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

    # Sentence 2: verbal predicate "rested" plus nominal "hearings".
    annotation = annotated_sentences[1]
    assert annotation.document_id == "test/test/02/test_002"
    assert annotation.sentence_id == 0
    assert annotation.words == [
        "The", "prosecution", "rested", "its", "case", "last", "month",
        "after", "four", "months", "of", "hearings", ".",
    ]
    assert annotation.pos_tags == [
        "DT", "NN", "VBD", "PRP$", "NN", "JJ", "NN", "IN", "CD", "NNS",
        "IN", "NNS", ".",
    ]
    assert annotation.word_senses == [
        None, 2, 5, None, 2, None, None, None, None, 1, None, 1, None,
    ]
    assert annotation.predicate_framenet_ids == [
        None, None, "01", None, None, None, None, None, None, None, None,
        "01", None,
    ]
    assert annotation.srl_frames == [
        (
            "rested",
            [
                "B-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1", "B-ARGM-TMP",
                "I-ARGM-TMP", "B-ARGM-TMP", "I-ARGM-TMP", "I-ARGM-TMP",
                "I-ARGM-TMP", "I-ARGM-TMP", "O",
            ],
        ),
        ("hearings", [
            "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-V",
            "O"
        ]),
    ]
    assert annotation.named_entities == [
        "O", "O", "O", "O", "O", "B-DATE", "I-DATE", "O", "B-DATE",
        "I-DATE", "O", "O", "O",
    ]
    assert annotation.predicate_lemmas == [
        None, "prosecution", "rest", None, "case", None, None, None, None,
        "month", None, "hearing", None,
    ]
    assert annotation.speakers == [
        None, None, None, None, None, None, None, None, None, None, None,
        None, None,
    ]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(S(NP (DT The) (NN prosecution) )(VP "
        "(VBD rested) (NP (PRP$ its) (NN case) )"
        "(NP (JJ last) (NN month) )(PP (IN after) "
        "(NP(NP (CD four) (NNS months) )(PP (IN"
        " of) (NP (NNS hearings) ))))) (. .) ))")
    assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

    # Check we can handle sentences without verbs.
    annotation = annotated_sentences[2]
    assert annotation.document_id == "test/test/03/test_003"
    assert annotation.sentence_id == 0
    assert annotation.words == [
        "Denise", "Dillon", "Headline", "News", "."
    ]
    assert annotation.pos_tags == ["NNP", "NNP", "NNP", "NNP", "."]
    assert annotation.word_senses == [None, None, None, None, None]
    assert annotation.predicate_framenet_ids == [
        None, None, None, None, None
    ]
    assert annotation.srl_frames == []
    assert annotation.named_entities == [
        "B-PERSON", "I-PERSON", "B-WORK_OF_ART", "I-WORK_OF_ART", "O",
    ]
    assert annotation.predicate_lemmas == [None, None, None, None, None]
    assert annotation.speakers == [None, None, None, None, None]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(FRAG(NP (NNP Denise) "
        " (NNP Dillon) )(NP (NNP Headline) "
        "(NNP News) ) (. .) ))")
    assert annotation.coref_spans == {(2, (0, 1))}

    # Check we can handle sentences with 2 identical verbs.
    annotation = annotated_sentences[3]
    assert annotation.document_id == "test/test/04/test_004"
    assert annotation.sentence_id == 0
    assert annotation.words == [
        "and", "that", "wildness", "is", "still", "in", "him", ",", "as",
        "it", "is", "with", "all", "children", ".",
    ]
    assert annotation.pos_tags == [
        "CC", "DT", "NN", "VBZ", "RB", "IN", "PRP", ",", "IN", "PRP",
        "VBZ", "IN", "DT", "NNS", ".",
    ]
    assert annotation.word_senses == [
        None, None, None, 4.0, None, None, None, None, None, None, 5.0,
        None, None, None, None,
    ]
    assert annotation.predicate_framenet_ids == [
        None, None, None, "01", None, None, None, None, None, None, "01",
        None, None, None, None,
    ]
    assert annotation.srl_frames == [
        (
            "is",
            [
                "B-ARGM-DIS", "B-ARG1", "I-ARG1", "B-V", "B-ARGM-TMP",
                "B-ARG2", "I-ARG2", "O", "B-ARGM-ADV", "I-ARGM-ADV",
                "I-ARGM-ADV", "I-ARGM-ADV", "I-ARGM-ADV", "I-ARGM-ADV",
                "O",
            ],
        ),
        (
            "is",
            [
                "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-ARG1",
                "B-V", "B-ARG2", "I-ARG2", "I-ARG2", "O",
            ],
        ),
    ]
    assert annotation.named_entities == [
        "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
        "O", "O",
    ]
    assert annotation.predicate_lemmas == [
        None, None, None, "be", None, None, None, None, None, None, "be",
        None, None, None, None,
    ]
    assert annotation.speakers == [
        "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_",
        "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_",
        "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_",
    ]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
        "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
        "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
        "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
        "children))))))) (. .)))")
    assert annotation.coref_spans == {(14, (6, 6))}
def _read(self, file_path: str):
    """
    Read SRL instances, optionally capped at ``self._limit`` instances
    and/or drawn as a seeded random sample.

    When ``self._random_sample`` is set (and the limit is positive), all
    instances are first collected and a random sample of at most
    ``self._limit`` of them is yielded at the end; otherwise instances
    are yielded eagerly and reading stops once the limit is reached.
    When ``self._print_violations`` is set, frames whose B-R-* / B-C-*
    tags have no matching B-* referent are logged.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info(
            "Filtering to only include file paths containing the %s domain",
            self._domain_identifier,
        )
    count = 0
    instances = []
    sampling = self._random_sample and self._limit > 0
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        # In eager mode, stop reading as soon as the limit is reached.
        if self._limit > 0 and count >= self._limit and not self._random_sample:
            break
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates: one all-"O" instance.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            count += 1
            instance = self.text_to_instance(tokens, verb_label, tags)
            if sampling:
                instances.append(instance)
            else:
                yield instance
        else:
            for (_, tags) in sentence.srl_frames:
                if self._limit > 0 and count >= self._limit and not self._random_sample:
                    break
                count += 1
                verb_indicator = [
                    1 if label[-2:] == "-V" else 0 for label in tags
                ]
                if self._print_violations:
                    # A frame violates the reference constraint when a
                    # B-R-* or B-C-* tag appears without the plain B-*
                    # tag it refers to.
                    counts = defaultdict(int)
                    for t in tags:
                        counts[t] += 1
                    violation = any(
                        key[:4] in {"B-R-", "B-C-"}
                        and counts["B-" + key[4:]] == 0
                        for key in list(counts.keys()))
                    if violation:
                        logger.info(tokens)
                        logger.info(tags)
                instance = self.text_to_instance(tokens, verb_indicator, tags)
                if sampling:
                    instances.append(instance)
                else:
                    yield instance
    if sampling:
        random.seed(self._random_seed)
        # Clamp the sample size: `random.sample` raises ValueError when
        # asked for more elements than the population contains (i.e. the
        # dataset yielded fewer than `self._limit` instances).
        yield from random.sample(instances, min(self._limit, len(instances)))
def _read(self, file_paths: str):
    """
    Read instances from a comma-separated list of data files.

    Three sources are supported:
      * a pickle cache (``self._pickle_path``) — loaded wholesale when it
        exists and this is not a test run;
      * "parallel" files — alternating source/target sentence lines,
        target side tokenized with a configured tokenizer or jieba;
      * Ontonotes CoNLL files — one instance per document (or per
        sentence when ``self._individual_sentences`` is set).

    Instances read from CoNLL files are re-pickled at the end so later
    runs can use the cache.
    """
    read_from_pickle = False
    if self._pickle_path is not None and not self._test_run:
        if os.path.exists(self._pickle_path):
            # NOTE(review): pickle.load is unsafe on untrusted files —
            # only use caches this process wrote itself.
            read_from_pickle = True
            f = open(self._pickle_path, 'rb')
            instances = pickle.load(f)
            f.close()
            for instance in instances:
                yield instance
    if not read_from_pickle:
        file_paths = file_paths.split(",")
        for file_path in file_paths:
            if "parallel" in file_path:
                # Parallel corpus: even lines are source sentences, odd
                # lines their target-language counterparts.
                f = open(file_path)
                lines = f.readlines()
                f.close()
                for i in range(len(lines) // 2):
                    if self._limit > 0 and i >= self._limit:
                        break
                    sentence1 = [lines[2 * i].strip().split()]
                    if self._parallel_tokenizer is None:
                        # sentence2 = self._parallel_stanza(lines[2*i+1].strip())
                        # sentence2 = [[token["text"] for token in sentence] for sentence in sentence2.to_dict()]
                        assert self._parallel_jieba
                        # jieba.tokenize yields (word, start, end) tuples;
                        # keep only the word text.
                        sentence2 = [[
                            token[0] for token in jieba.tokenize(
                                lines[2 * i + 1].strip())
                        ]]
                    else:
                        sentence2 = [
                            self._parallel_tokenizer.tokenize(
                                lines[2 * i + 1].strip())
                        ]
                    if self._parallel_reverse:
                        # Swap the translation direction.
                        tmp = sentence1
                        sentence1 = sentence2
                        sentence2 = tmp
                    instance = self.text_to_instance(
                        sentences=sentence1,
                        document_id=file_path + "_" + str(i),
                        language="parallel",
                        parallel_sentences=sentence2)
                    yield instance
            else:
                # if `file_path` is a URL, redirect to the cache
                file_path = cached_path(file_path)
                # Language code is taken from the file name, e.g.
                # "train.english.gold_conll" -> "english".
                language = file_path.split(".")[-2]
                ontonotes_reader = Ontonotes(multiple_tags=True)
                instances = []
                for sentences in ontonotes_reader.dataset_document_iterator(
                        file_path):
                    if self._limit > 0 and len(instances) >= self._limit:
                        break
                    document_id = sentences[0].document_id + "_" + str(
                        sentences[0].sentence_id)
                    if self._individual_sentences:
                        # One instance per sentence.
                        for sentence in sentences:
                            clusters, srl_frames, named_entities, named_entity_spans = self.process_sentences(
                                [sentence])
                            instance = self.text_to_instance(
                                sentences=[sentence.words],
                                document_id=document_id,
                                gold_clusters=list(clusters.values()),
                                srl_frames=srl_frames,
                                named_entities=named_entities,
                                language=language,
                                sentence_objects=[sentence],
                                named_entity_spans=named_entity_spans)
                            # Keep only instances with SRL labels when SRL
                            # is required.
                            if instance is not None and (
                                    "srl_labels" in instance.fields
                                    or not self._srl):
                                instances.append(instance)
                                yield instance
                    else:
                        # One instance per document.
                        clusters, srl_frames, named_entities, named_entity_spans = self.process_sentences(
                            sentences)
                        instance = self.text_to_instance(
                            sentences=[s.words for s in sentences],
                            document_id=document_id,
                            gold_clusters=list(clusters.values()),
                            srl_frames=srl_frames,
                            named_entities=named_entities,
                            language=language,
                            sentence_objects=sentences,
                            named_entity_spans=named_entity_spans)
                        instances.append(instance)
                        yield instance
                    if self._test_run:
                        break
        if not self._test_run and self._pickle_path is not None:
            # NOTE(review): `instances` here holds only the instances of
            # the last non-parallel file (it is re-bound per file and
            # never filled by the parallel branch) — confirm that is the
            # intended cache content.
            f = open(self._pickle_path, 'wb')
            pickle.dump(instances, f)
            f.close()