def _read(self, file_path: str):
    """Yield one coreference instance per document found at ``file_path``.

    ``file_path`` is first passed through ``cached_path``. For every document
    returned by ``Ontonotes.dataset_document_iterator`` the per-sentence
    coreference spans are shifted to document-level token offsets, grouped by
    span id, canonicalized with ``canonicalize_clusters`` and handed to
    ``self.text_to_instance`` together with the document's sentences (each a
    list of words).

    No tokenization happens here: the instance is built from the raw words,
    so the document is not normalized or wordpiece-tokenized just to throw
    the result away.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

        total_tokens = 0
        for sentence in sentences:
            for span_id, (start, end) in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
def test_dataset_path_iterator(self):
    """The path iterator must return exactly the two fixture CoNLL files, in any order."""
    fixture_root = 'tests/fixtures/conll_2012/'
    expected_paths = [
        'tests/fixtures/conll_2012/subdomain/example.gold_conll',
        'tests/fixtures/conll_2012/subdomain2/example.gold_conll',
    ]

    found_paths = list(Ontonotes().dataset_path_iterator(fixture_root))

    # Length is checked separately so that duplicates are not hidden by the set comparison.
    assert len(found_paths) == len(expected_paths)
    assert set(found_paths) == set(expected_paths)
def read(self, file_path: str):
    """Read all sentences under ``file_path`` into a ``Dataset`` of SRL instances.

    A sentence with SRL frames contributes one instance per frame; a sentence
    without frames contributes a single instance tagged entirely with ``"O"``
    and an all-zero verb indicator.

    Raises ``ConfigurationError`` when no instance was produced.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)

    for sentence in ontonotes_reader.dataset_iterator(file_path):
        tokens = [Token(word) for word in sentence.words]

        if sentence.srl_frames:
            for _, frame_tags in sentence.srl_frames:
                # The predicate is the token whose tag ends in "-V".
                verb_indicator = [int(label.endswith("-V")) for label in frame_tags]
                instances.append(self.text_to_instance(tokens, verb_indicator, frame_tags))
            continue

        # Sentence contains no predicates.
        outside_tags = ["O"] * len(tokens)
        no_verb = [0] * len(tokens)
        instances.append(self.text_to_instance(tokens, no_verb, outside_tags))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def main(args):
    """Convert OntoNotes splits into one JSON file per (task, split) pair.

    Args:
        args: command-line arguments (without the program name), e.g.
            ``sys.argv[1:]``. ``--ontonotes`` and ``--tasks`` are required;
            ``--splits`` and ``-o`` have defaults.

    For every requested split and task the sentences under
    ``<ontonotes>/data/<split>`` are converted by ``process_task_split`` and
    written to ``<output_dir>/<task>/<split>.json``. Conversion statistics are
    logged after each file is written.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ontonotes",
        type=str,
        required=True,
        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0",
    )
    # Required: without it argparse would leave `args.tasks` as None and the
    # loop below would fail with an unhelpful TypeError.
    parser.add_argument(
        "--tasks",
        type=str,
        nargs="+",
        required=True,
        help="Tasks, one or more of {const, coref, ner, srl}.",
    )
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=["train", "development", "test", "conll-2012-test"],
        help="Splits, one or more of {train, development, test, conll-2012-test}.",
    )
    parser.add_argument(
        "-o",
        dest="output_dir",
        type=str,
        default=".",
        help="Output directory for JSON files.",
    )
    args = parser.parse_args(args)

    # makedirs also creates missing parent directories and does not fail if
    # the directory already exists.
    os.makedirs(args.output_dir, exist_ok=True)

    import pandas as pd

    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        for task in args.tasks:
            source_path = os.path.join(args.ontonotes, "data", split)
            print('########### Reading ontonotes split from', source_path)
            ontonotes_reader = ontonotes.dataset_iterator(file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            os.makedirs(task_dir, exist_ok=True)
            target_fname = os.path.join(task_dir, f"{split}.json")

            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader), task, ontonotes_stats)

            # Route the records through the stats collector on their way to disk.
            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)

            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
def test_dataset_path_iterator(self):
    """The path iterator must return exactly the two fixture CoNLL files, in any order."""
    conll_dir = self.FIXTURES_ROOT / 'conll_2012'
    expected_paths = [
        str(conll_dir / 'subdomain' / 'example.gold_conll'),
        str(conll_dir / 'subdomain2' / 'example.gold_conll'),
    ]

    found_paths = list(Ontonotes().dataset_path_iterator(conll_dir))

    # Length is checked separately so that duplicates are not hidden by the set comparison.
    assert len(found_paths) == len(expected_paths)
    assert set(found_paths) == set(expected_paths)
def read(self, file_path: str):
    """Read every document under ``file_path`` into a ``Dataset`` of coreference instances.

    Each document becomes one instance built from its sentences (as lists of
    words) and its canonicalized coreference clusters, whose spans are
    expressed in document-level token offsets.

    Raises ``ConfigurationError`` when no instance was produced.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    ontonotes_reader = Ontonotes()

    for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

        # Number of tokens in the sentences already visited.
        offset = 0
        for sentence in sentences:
            # Coref annotations are per sentence; shift them so that they
            # index into the whole document.
            for span_id, (start, end) in sentence.coref_spans:
                clusters[span_id].append((start + offset, end + offset))
            offset += len(sentence.words)

        document_words = [s.words for s in sentences]
        instances.append(self.text_to_instance(document_words, canonicalize_clusters(clusters)))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def _read(self, file_path: str):
    """Yield one coreference instance per document, with speaker ids and genre.

    For every document under ``file_path``:

    * coreference spans are shifted to document-level token offsets and
      grouped into canonical clusters;
    * the per-token speakers of all sentences are flattened with
      ``self.flatten`` and mapped to integer ids;
    * the genre is looked up in ``self.genres`` using the first two
      characters of the first sentence's ``document_id``.

    Speaker ids are assigned in order of first appearance in the document.
    They are therefore reproducible across runs, unlike ids derived from
    iterating a ``set`` of strings, whose order depends on hash randomization.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

        total_tokens = 0
        sentence_speakers = []
        for sentence in sentences:
            for span_id, (start, end) in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
            sentence_speakers.append(sentence.speakers)

        doc_key = sentences[0].document_id
        genre = self.genres[doc_key[:2]]

        speakers = self.flatten(sentence_speakers)
        # Internal invariant: exactly one speaker entry per token.
        assert total_tokens == len(speakers)

        # First-appearance numbering: deterministic for a given document.
        speaker_dict = {}
        for speaker in speakers:
            speaker_dict.setdefault(speaker, len(speaker_dict))
        speaker_ids = np.array([speaker_dict[s] for s in speakers])

        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters,
                                    speaker_ids, genre)
def _read(self, file_path: str):
    """Yield one coreference instance per document, optionally flagged for simulated user spans.

    Besides the sentences and canonical clusters, each instance receives the
    ``document_id`` and ``sentence_id`` of the document's first sentence and a
    ``percent_user_spans`` value. That value is ``1.0`` when
    ``self._simulate_user_inputs`` is truthy and the zero-based document index
    has reached ``self._fully_labelled_threshold``; otherwise it is ``0.0``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    ontonotes_reader = Ontonotes()
    documents = ontonotes_reader.dataset_document_iterator(file_path)
    for doc_index, sentences in enumerate(documents):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

        # Number of tokens in the sentences already visited.
        offset = 0
        for sentence in sentences:
            # Coref annotations are per sentence; shift them so that they
            # index into the whole document.
            for span_id, (start, end) in sentence.coref_spans:
                clusters[span_id].append((start + offset, end + offset))
            offset += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)

        past_threshold = (self._simulate_user_inputs
                          and doc_index >= self._fully_labelled_threshold)
        percent_user_spans = 1.0 if past_threshold else 0.0

        first_sentence = sentences[0]
        yield self.text_to_instance([s.words for s in sentences],
                                    first_sentence.document_id,
                                    first_sentence.sentence_id,
                                    canonical_clusters,
                                    percent_user_spans)
def test_dataset_path_iterator(self):
    """The path iterator must return exactly the two fixture CoNLL files.

    The comparison is order-insensitive: the test only cares which files are
    found, not the order in which the directory tree happens to be walked.
    Sorting (rather than converting to sets) still catches duplicates.
    """
    reader = Ontonotes()
    files = list(reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
    expected_paths = [
        'tests/fixtures/conll_2012/subdomain/example.gold_conll',
        'tests/fixtures/conll_2012/subdomain2/example.gold_conll',
    ]
    assert sorted(files) == sorted(expected_paths)
def test_dataset_path_iterator(self):
    """The path iterator must return exactly the two fixture CoNLL files, in any order."""
    conll_dir = self.FIXTURES_ROOT / "conll_2012"
    expected_paths = [
        str(conll_dir / subdomain / "example.gold_conll")
        for subdomain in ("subdomain", "subdomain2")
    ]

    found_paths = list(Ontonotes().dataset_path_iterator(conll_dir))

    # Length is checked separately so that duplicates are not hidden by the set comparison.
    assert len(found_paths) == len(expected_paths)
    assert set(found_paths) == set(expected_paths)
def test_dataset_path_iterator(self):
    """The path iterator must return exactly the two fixture CoNLL files, in any order."""
    expected_paths = [
        'tests/fixtures/conll_2012/subdomain/example.gold_conll',
        'tests/fixtures/conll_2012/subdomain2/example.gold_conll',
    ]

    path_iterator = Ontonotes().dataset_path_iterator('tests/fixtures/conll_2012/')
    found_paths = list(path_iterator)

    # Length is checked separately so that duplicates are not hidden by the set comparison.
    assert len(found_paths) == len(expected_paths)
    assert set(found_paths) == set(expected_paths)
def _ontonotes_subset(ontonotes_reader: Ontonotes,
                      file_path: str,
                      domain_identifier: str) -> Iterable[OntonotesSentence]:
    """
    Yield the sentences of the CoNLL files found under ``file_path``.

    Files whose path contains ``/pt/`` are always skipped. When
    ``domain_identifier`` is not ``None``, only files whose path contains
    ``/<domain_identifier>/`` are read.
    """
    domain_marker = None if domain_identifier is None else f"/{domain_identifier}/"

    for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
        if "/pt/" in conll_file:
            continue
        if domain_marker is not None and domain_marker not in conll_file:
            continue
        yield from ontonotes_reader.sentence_iterator(conll_file)
def _ontonotes_subset(ontonotes_reader: Ontonotes,
                      file_path: str,
                      domain_identifier: str) -> Iterable[OntonotesSentence]:
    """
    Yield the sentences of the CoNLL files found under ``file_path``.

    When ``domain_identifier`` is not ``None``, only files whose path contains
    ``/<domain_identifier>/`` are read; otherwise every file is read.
    """
    conll_files = ontonotes_reader.dataset_path_iterator(file_path)

    if domain_identifier is not None:
        domain_marker = f"/{domain_identifier}/"
        # Lazy filter: paths are still pulled from the reader one at a time.
        conll_files = (path for path in conll_files if domain_marker in path)

    for conll_file in conll_files:
        yield from ontonotes_reader.sentence_iterator(conll_file)
def _read(self, file_path: str):
    """Yield SRL instances together with the word spans of the sentence's parse tree.

    Sentences without a parse tree are skipped, and sentences without SRL
    frames produce no instance. For the remaining sentences one instance per
    SRL frame is built via ``self.text_to_instance_with_spans``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    sentence_stream = self._ontonotes_subset(ontonotes_reader, file_path,
                                             self._domain_identifier)
    for sentence in sentence_stream:
        # Skip samples that carry no parse tree.
        if not sentence.parse_tree:
            continue

        # Each span is the tuple of leaf words covered by one subtree.
        # TODO: check how to output indices instead of words
        # (for extreme cases where different tuples could match)
        spans = {
            tuple(subtree.leaves())
            for subtree in sentence.parse_tree.subtrees()
            if subtree.height() > 0
        }

        tokens = [Token(word) for word in sentence.words]
        if not sentence.srl_frames:
            continue

        for _, frame_tags in sentence.srl_frames:
            # The predicate is the token whose tag ends in "-V".
            verb_indicator = [int(label.endswith("-V")) for label in frame_tags]
            yield self.text_to_instance_with_spans(tokens, verb_indicator, frame_tags, spans)
def _read(self, file_path: str):
    """Yield SRL instances whose token sequence is prefixed with the frame's predicate.

    Sentences without SRL frames yield a single all-``"O"`` instance built
    from the plain tokens (no prefix). For every SRL frame the tags are
    rewritten in place (leading ``B`` becomes ``I``; tags not in
    ``self.used_tags`` become ``"O"`` when ``self.used_tags`` is set), and the
    instance is built from ``[verb] + tokens``. A ``verb_index`` field is
    attached, plus a ``dependency`` field when ``self.dependency_parse`` is set.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info(
            "Filtering to only include file paths containing the %s domain",
            self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        pos_tags = [t for t in sentence.pos_tags]
        # Tokens carry the POS tag as the fourth positional argument.
        tokens = [
            Token(t, None, None, pos_tags[i])
            for i, t in enumerate(sentence.words)
        ]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                # The predicate is the token whose tag ends in "-V".
                verb_indicator = [
                    1 if label[-2:] == "-V" else 0 for label in tags
                ]
                verb_indices = np.where(np.array(verb_indicator) == 1)[0]
                if len(verb_indices) > 0:
                    # Use the first predicate position of the frame.
                    verb_index = int(verb_indices[0])
                    verb = tokens[verb_index]
                else:
                    # NOTE(review): here `verb` is a plain string while the
                    # other entries are Token objects - confirm that
                    # text_to_instance copes with the mix.
                    verb_index = -1
                    verb = ''
                # The frame's tag list is modified in place.
                for i, tag in enumerate(tags):
                    if tag[0] == 'B':
                        tags[i] = tags[i].replace('B', 'I', 1)
                    if self.used_tags is not None and tags[
                            i] not in self.used_tags:
                        tags[i] = 'O'
                # The predicate is prepended, so indicator and tags get a
                # matching leading entry.
                instance = self.text_to_instance([verb] + tokens,
                                                 [0] + verb_indicator,
                                                 ['O'] + tags)
                if self.dependency_parse:
                    doc = self.nlp(' '.join(sentence.words))
                    instance.add_field('dependency', MetadataField(doc))
                # NOTE(review): verb_index refers to the original sentence,
                # not to the prefixed token sequence - confirm this is intended.
                instance.add_field(
                    'verb_index', IndexField(verb_index, instance['tokens']))
                yield instance
def _read(self, file_path: str):
    """Yield SRL instances for the sentences selected by ``self._ontonotes_subset``.

    A sentence with SRL frames yields one instance per frame; a sentence
    without frames yields a single instance tagged entirely with ``"O"`` and
    an all-zero verb indicator.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    sentence_stream = self._ontonotes_subset(ontonotes_reader, file_path,
                                             self._domain_identifier)
    for sentence in sentence_stream:
        tokens = [Token(word) for word in sentence.words]

        if sentence.srl_frames:
            for _, frame_tags in sentence.srl_frames:
                # The predicate is the token whose tag ends in "-V".
                verb_indicator = [int(label.endswith("-V")) for label in frame_tags]
                yield self.text_to_instance(tokens, verb_indicator, frame_tags)
            continue

        # Sentence contains no predicates.
        outside_tags = ["O"] * len(tokens)
        no_verb = [0] * len(tokens)
        yield self.text_to_instance(tokens, no_verb, outside_tags)
def _read(self, file_path: str):
    """Yield SRL instances for every sentence found under ``file_path``.

    A sentence with SRL frames yields one instance per frame; a sentence
    without frames yields a single instance tagged entirely with ``"O"`` and
    an all-zero verb indicator.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)

    for sentence in ontonotes_reader.dataset_iterator(file_path):
        tokens = [Token(word) for word in sentence.words]
        frames = sentence.srl_frames

        if not frames:
            # Sentence contains no predicates.
            yield self.text_to_instance(tokens, [0] * len(tokens), ["O"] * len(tokens))
            continue

        for _, frame_tags in frames:
            # The predicate is the token whose tag ends in "-V".
            verb_indicator = [int(label.endswith("-V")) for label in frame_tags]
            yield self.text_to_instance(tokens, verb_indicator, frame_tags)
def _read(self, file_path: str):
    """Yield SRL instances (with parse trees), optionally subsampling the training split.

    The split name is the last component of ``file_path`` *before* cache
    redirection. When ``self.percent_data`` is below 100 the global ``random``
    module is seeded with ``self.random_data_seed``, and each sentence of the
    ``"train"`` split is kept with probability ``percent_data / 100``.

    Each instance is built from the tokens, a verb indicator, the sentence's
    parse tree and the tag sequence. The tags are replaced by ``None`` when
    ``self.return_labels`` is false.
    """
    data_split = os.path.basename(os.path.normpath(file_path))
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    # Set random seed if percent is not 100
    subsample = self.percent_data < 100
    if subsample:
        random.seed(self.random_data_seed)

    print(f"return_labels: {self.return_labels}")

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        if subsample and data_split == "train":
            # randint includes both end points, so draw from 1..100 to keep
            # exactly `percent_data` percent of the sentences in expectation.
            if random.randint(1, 100) > self.percent_data:
                continue

        tokens = [Token(t) for t in sentence.words]
        parse_tree = sentence.parse_tree

        if not sentence.srl_frames:
            # Sentence contains no predicates.
            verb_label = [0 for _ in tokens]
            tags = ["O" for _ in tokens] if self.return_labels else None
            yield self.text_to_instance(tokens, verb_label, parse_tree, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                # The predicate is the token whose tag ends in "-V".
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                yield self.text_to_instance(tokens, verb_indicator, parse_tree,
                                            tags if self.return_labels else None)
def _read(self, file_path: str):
    """Yield one coreference instance per document found under ``file_path``.

    Each instance is built from the document's sentences (as lists of words)
    and its canonicalized coreference clusters, whose spans are expressed in
    document-level token offsets.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

        # Number of tokens in the sentences already visited.
        offset = 0
        for sentence in sentences:
            # Coref annotations are per sentence; shift them so that they
            # index into the whole document.
            for span_id, (start, end) in sentence.coref_spans:
                clusters[span_id].append((start + offset, end + offset))
            offset += len(sentence.words)

        document_words = [s.words for s in sentences]
        yield self.text_to_instance(document_words, canonicalize_clusters(clusters))
def _read(self, file_path: str):
    """Yield SRL instances accompanied by a word-level dependency adjacency map.

    For every sentence a dependency tree is predicted with
    ``self.dependency_tree_predictor`` and turned into ``adj`` by
    ``self.traverse_tree``. Sentences without SRL frames yield one all-``"O"``
    instance with the full ``adj``. For each SRL frame:

    * if the predicate word is a key of ``adj``, the instance receives
      ``predicate_adj``, which maps the predicate to everything reachable from
      it through ``adj`` (breadth-first order);
    * otherwise the instance receives the full ``adj``.

    ``predicate_adj`` is shared by all frames of a sentence, so it also holds
    the entries of predicates handled earlier in the same sentence.

    The reachable list is built on a copy and every node is expanded at most
    once. This guarantees termination even when ``adj`` contains cycles (it is
    keyed by word, so repeated words can create them) and leaves ``adj``
    itself untouched for later frames.

    Raises ``ValueError`` (from ``list.index``) if a frame has no ``-V`` tag.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]

        result = self.dependency_tree_predictor.predict(sentence=" ".join(sentence.words))
        root_dict = result['hierplane_tree']['root']
        adj = {}
        self.traverse_tree(adj, root_dict['word'], root_dict)
        predicate_adj = {}

        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, adj, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                # The predicate is the token whose tag ends in "-V".
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                verb_index = verb_indicator.index(1)
                predicate = sentence.words[verb_index]

                if predicate in adj:
                    # Breadth-first expansion over a private copy of the
                    # predicate's neighbour list.
                    reachable = list(adj[predicate])
                    expanded = {predicate}
                    cursor = 0
                    while cursor < len(reachable):
                        node = reachable[cursor]
                        cursor += 1
                        if node in expanded or node not in adj:
                            continue
                        expanded.add(node)
                        reachable.extend(adj[node])
                    predicate_adj[predicate] = reachable
                    yield self.text_to_instance(tokens, verb_indicator, predicate_adj, tags)
                else:
                    yield self.text_to_instance(tokens, verb_indicator, adj, tags)
def _read(self, file_path):
    """Yield one coreference instance per document found under ``file_path``.

    Each instance is built from the document's sentences (as lists of words)
    and its canonicalized coreference clusters, whose spans are expressed in
    document-level token offsets.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    documents = Ontonotes().dataset_document_iterator(file_path)
    for sentences in documents:
        clusters = collections.defaultdict(list)

        # Number of tokens in the sentences that precede the current one.
        tokens_seen = 0
        for sentence in sentences:
            # Coref annotations are per sentence; shift them so that they
            # index into the whole document.
            for span_id, (start, end) in sentence.coref_spans:
                document_span = (start + tokens_seen, end + tokens_seen)
                clusters[span_id].append(document_span)
            tokens_seen += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)
        words_per_sentence = [s.words for s in sentences]
        yield self.text_to_instance(words_per_sentence, canonical_clusters)
def _read(self, file_path: str):
    """Yield one NER instance per sentence selected by ``self._ontonotes_subset``.

    Every word is passed through ``_normalize_word`` before being wrapped in a
    ``Token``; the sentence's ``named_entities`` are used as tags.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading Fine-Grained NER instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    sentence_stream = self._ontonotes_subset(ontonotes_reader, file_path,
                                             self._domain_identifier)
    for sentence in sentence_stream:
        normalized_tokens = [Token(_normalize_word(word)) for word in sentence.words]
        yield self.text_to_instance(normalized_tokens, sentence.named_entities)
def _read(self, file_path: str):
    """Yield SRL instances accompanied by an adjacency map built from predicted heads.

    For every sentence the dependency heads are predicted with
    ``self.dependency_tree_predictor``. For each SRL frame a fresh ``adj`` is
    filled by ``self.traverse_predicted_heads`` starting at the predicate's
    1-based position, and a self-loop is added for the predicate.

    Sentences without SRL frames yield one all-``"O"`` instance with an empty
    adjacency map, because there is no predicate to build one from.

    Raises ``ValueError`` (from ``list.index``) if a frame has no ``-V`` tag.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]

        result = self.dependency_tree_predictor.predict(sentence=" ".join(sentence.words))
        predicted_heads = result["predicted_heads"]

        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            # Pass an explicit empty map rather than an unbound name or the
            # adjacency left over from a previous sentence's frame.
            # NOTE(review): confirm text_to_instance accepts an empty adjacency.
            yield self.text_to_instance(tokens, verb_label, {}, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                # The predicate is the token whose tag ends in "-V".
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                verb_index = verb_indicator.index(1)

                adj = {}
                self.traverse_predicted_heads(adj, predicted_heads, verb_index + 1)
                # Some verbs have no relations; the self-loop prevents an
                # error in the ListField built later on. setdefault covers the
                # case where the traversal created no entry for the predicate.
                adj.setdefault(verb_index + 1, []).append(verb_index + 1)

                yield self.text_to_instance(tokens, verb_indicator, adj, tags)
def _read(self, file_path: str):
    """Yield SRL instances together with POS tags and the constituency parse.

    When a sentence has no (or an empty) parse tree, both the parse and the
    POS tags are passed as ``None``. A sentence with SRL frames yields one
    instance per frame; a sentence without frames yields a single all-``"O"``
    instance with an all-zero verb indicator.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances along with constituent parse from data files at: %s",
                file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    sentence_stream = self._ontonotes_subset(ontonotes_reader, file_path,
                                             self._domain_identifier)
    for sentence in sentence_stream:
        tokens = [Token(word) for word in sentence.words]

        parse = sentence.parse_tree
        if not parse:
            # parse information is missing for this sentence
            parse, pos_tags = None, None
        else:
            pos_tags = [word_and_tag[1] for word_and_tag in parse.pos()]

        if sentence.srl_frames:
            for _, frame_tags in sentence.srl_frames:
                # The predicate is the token whose tag ends in "-V".
                verb_indicator = [int(label.endswith("-V")) for label in frame_tags]
                yield self.text_to_instance(tokens, verb_indicator, frame_tags, pos_tags, parse)
            continue

        # Sentence contains no predicates.
        outside_tags = ["O"] * len(tokens)
        no_verb = [0] * len(tokens)
        yield self.text_to_instance(tokens, no_verb, outside_tags, pos_tags, parse)
def _read(self, file_path: str):
    """Yield one NER instance per sentence selected by ``self._ontonotes_subset``.

    The sentence's ``named_entities`` are used as tags; when they are empty an
    all-``"O"`` sequence is used instead. With the ``"BIOUL"`` coding scheme
    the tags are converted by ``iob1_to_bioul`` before the instance is built.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading NER instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    sentence_stream = self._ontonotes_subset(ontonotes_reader, file_path,
                                             self._domain_identifier)
    for sentence in sentence_stream:
        tokens = [Token(word) for word in sentence.words]

        tags = sentence.named_entities if sentence.named_entities else ["O"] * len(tokens)
        if self._coding_scheme == "BIOUL":
            tags = iob1_to_bioul(tags)

        yield self.text_to_instance(tokens, tags)
def test_dataset_iterator(self):
    """Check every annotation layer of the first four sentences read from the fixture subdomain."""
    fixture_dir = 'tests/fixtures/conll_2012/subdomain/'
    annotated_sentences = list(Ontonotes().dataset_iterator(fixture_dir))

    # A sentence with two different predicates.
    first = annotated_sentences[0]
    assert first.document_id == "test/test/01/test_001"
    assert first.sentence_id == 0
    assert first.words == ['Mali', 'government', 'officials', 'say', 'the', 'woman',
                           "'s", 'confession', 'was', 'forced', '.']
    assert first.pos_tags == ['NNP', 'NN', 'NNS', 'VBP', 'DT', 'NN',
                              'POS', 'NN', 'VBD', 'JJ', '.']
    assert first.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
    assert first.predicate_framenet_ids == [None, None, None, '01', None, None,
                                            None, None, '01', None, None]
    assert first.srl_frames == [
        ("say", ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1',
                 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O']),
        ("was", ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1',
                 'I-ARG1', 'I-ARG1', 'B-V', 'B-ARG2', 'O']),
    ]
    assert first.named_entities == ['B-GPE'] + ['O'] * 10
    assert first.predicate_lemmas == [None, None, 'official', 'say', None, 'man',
                                      None, None, 'be', None, None]
    assert first.speakers == [None] * 11
    assert first.parse_tree == Tree.fromstring(
        "(TOP(S(NP(NML (NNP Mali) (NN government) )"
        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
        " (DT the) (NN woman) (POS 's) ) (NN "
        "confession) )(VP (VBD was) (ADJP (JJ "
        "forced) ))))) (. .) ))")
    assert first.coref_spans == {(1, (4, 6)), (3, (4, 7))}

    second = annotated_sentences[1]
    assert second.document_id == "test/test/02/test_002"
    assert second.sentence_id == 0
    assert second.words == ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
                            'after', 'four', 'months', 'of', 'hearings', '.']
    assert second.pos_tags == ['DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN',
                               'IN', 'CD', 'NNS', 'IN', 'NNS', '.']
    assert second.word_senses == [None, 2, 5, None, 2, None, None,
                                  None, None, 1, None, 1, None]
    assert second.predicate_framenet_ids == [None, None, '01', None, None, None, None,
                                             None, None, None, None, '01', None]
    assert second.srl_frames == [
        ('rested', ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'B-ARGM-TMP',
                    'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                    'I-ARGM-TMP', 'I-ARGM-TMP', 'O']),
        ('hearings', ['O'] * 11 + ['B-V', 'O']),
    ]
    assert second.named_entities == ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE',
                                     'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O']
    assert second.predicate_lemmas == [None, 'prosecution', 'rest', None, 'case', None, None,
                                       None, None, 'month', None, 'hearing', None]
    assert second.speakers == [None] * 13
    assert second.parse_tree == Tree.fromstring(
        "(TOP(S(NP (DT The) (NN prosecution) )(VP "
        "(VBD rested) (NP (PRP$ its) (NN case) )"
        "(NP (JJ last) (NN month) )(PP (IN after) "
        "(NP(NP (CD four) (NNS months) )(PP (IN"
        " of) (NP (NNS hearings) ))))) (. .) ))")
    assert second.coref_spans == {(2, (0, 1)), (2, (3, 3))}

    # Check we can handle sentences without verbs.
    verbless = annotated_sentences[2]
    assert verbless.document_id == 'test/test/03/test_003'
    assert verbless.sentence_id == 0
    assert verbless.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
    assert verbless.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
    assert verbless.word_senses == [None] * 5
    assert verbless.predicate_framenet_ids == [None] * 5
    assert verbless.srl_frames == []
    assert verbless.named_entities == ['B-PERSON', 'I-PERSON',
                                       'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']
    assert verbless.predicate_lemmas == [None] * 5
    assert verbless.speakers == [None] * 5
    assert verbless.parse_tree == Tree.fromstring(
        "(TOP(FRAG(NP (NNP Denise) "
        " (NNP Dillon) )(NP (NNP Headline) "
        "(NNP News) ) (. .) ))")
    assert verbless.coref_spans == {(2, (0, 1))}

    # Check we can handle sentences with 2 identical verbs.
    repeated_verb = annotated_sentences[3]
    assert repeated_verb.document_id == 'test/test/04/test_004'
    assert repeated_verb.sentence_id == 0
    assert repeated_verb.words == ['and', 'that', 'wildness', 'is', 'still', 'in', 'him', ',',
                                   'as', 'it', 'is', 'with', 'all', 'children', '.']
    assert repeated_verb.pos_tags == ['CC', 'DT', 'NN', 'VBZ', 'RB', 'IN', 'PRP', ',',
                                      'IN', 'PRP', 'VBZ', 'IN', 'DT', 'NNS', '.']
    assert repeated_verb.word_senses == [None, None, None, 4.0, None, None, None, None,
                                         None, None, 5.0, None, None, None, None]
    assert repeated_verb.predicate_framenet_ids == [None, None, None, '01', None, None, None,
                                                    None, None, None, '01', None, None, None,
                                                    None]
    assert repeated_verb.srl_frames == [
        ('is', ['B-ARGM-DIS', 'B-ARG1', 'I-ARG1', 'B-V', 'B-ARGM-TMP', 'B-ARG2',
                'I-ARG2', 'O', 'B-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV',
                'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'O']),
        ('is', ['O'] * 9 + ['B-ARG1', 'B-V', 'B-ARG2', 'I-ARG2', 'I-ARG2', 'O']),
    ]
    assert repeated_verb.named_entities == ['O'] * 15
    assert repeated_verb.predicate_lemmas == [None, None, None, 'be', None, None, None, None,
                                              None, None, 'be', None, None, None, None]
    assert repeated_verb.speakers == ['_Avalon_'] * 15
    assert repeated_verb.parse_tree == Tree.fromstring(
        "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
        "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
        "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
        "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
        "children))))))) (. .)))")
    assert repeated_verb.coref_spans == {(14, (6, 6))}
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    """The document iterator is expected to yield exactly two documents from the coref fixture."""
    fixture_file = 'tests/fixtures/coref/coref.gold_conll'
    document_count = sum(1 for _ in Ontonotes().dataset_document_iterator(fixture_file))
    assert document_count == 2
def _ontonotes_subset(ontonotes_reader: Ontonotes,
                      file_path: str,
                      domain_identifier: str) -> Iterable[OntonotesSentence]:
    """
    Yield the sentences of every CoNLL file found under ``file_path``.

    ``domain_identifier`` is accepted for interface compatibility but is not
    used: no filtering is applied.
    """
    for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
        for sentence in ontonotes_reader.sentence_iterator(conll_file):
            yield sentence
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    """The document iterator is expected to yield exactly two documents from the coref fixture."""
    document_iterator = Ontonotes().dataset_document_iterator(
        'tests/fixtures/coref/coref.gold_conll')
    documents = [document for document in document_iterator]
    assert len(documents) == 2
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    """The document iterator is expected to yield exactly two documents from the coref fixture."""
    fixture_file = self.FIXTURES_ROOT / 'coref' / 'coref.gold_conll'
    document_count = sum(1 for _ in Ontonotes().dataset_document_iterator(fixture_file))
    assert document_count == 2
def test_dataset_iterator(self):
    """
    Read the ``subdomain`` fixture directory and check every annotation
    layer of its first four sentences.
    """
    ontonotes = Ontonotes()
    sentences = list(
        ontonotes.dataset_iterator('tests/fixtures/conll_2012/subdomain/'))

    # Sentence 0: two different predicates ("say" and "was").
    first = sentences[0]
    assert first.document_id == "test/test/01/test_001"
    assert first.sentence_id == 0
    assert first.words == [
        'Mali', 'government', 'officials', 'say', 'the', 'woman', "'s",
        'confession', 'was', 'forced', '.',
    ]
    assert first.pos_tags == [
        'NNP', 'NN', 'NNS', 'VBP', 'DT', 'NN', 'POS', 'NN', 'VBD', 'JJ', '.',
    ]
    assert first.word_senses == [
        None, None, 1, 1, None, 2, None, None, 1, None, None,
    ]
    assert first.predicate_framenet_ids == [
        None, None, None, '01', None, None, None, None, '01', None, None,
    ]
    say_tags = ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1',
                'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O']
    was_tags = ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1',
                'B-V', 'B-ARG2', 'O']
    assert first.srl_frames == [("say", say_tags), ("was", was_tags)]
    assert first.named_entities == ['B-GPE'] + ['O'] * 10
    assert first.predicate_lemmas == [
        None, None, 'official', 'say', None, 'man', None, None, 'be', None,
        None,
    ]
    assert first.speakers == [None] * 11
    first_tree = Tree.fromstring(
        "(TOP(S(NP(NML (NNP Mali) (NN government) )"
        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
        " (DT the) (NN woman) (POS 's) ) (NN "
        "confession) )(VP (VBD was) (ADJP (JJ "
        "forced) ))))) (. .) ))")
    assert first.parse_tree == first_tree
    assert first.coref_spans == {(1, (4, 6)), (3, (4, 7))}

    # Sentence 1: a verbal and a nominal predicate.
    second = sentences[1]
    assert second.document_id == "test/test/02/test_002"
    assert second.sentence_id == 0
    assert second.words == [
        'The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
        'after', 'four', 'months', 'of', 'hearings', '.',
    ]
    assert second.pos_tags == [
        'DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN', 'IN', 'CD', 'NNS', 'IN',
        'NNS', '.',
    ]
    assert second.word_senses == [
        None, 2, 5, None, 2, None, None, None, None, 1, None, 1, None,
    ]
    assert second.predicate_framenet_ids == [
        None, None, '01', None, None, None, None, None, None, None, None,
        '01', None,
    ]
    rested_tags = ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1',
                   'B-ARGM-TMP', 'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP',
                   'I-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP', 'O']
    hearings_tags = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                     'B-V', 'O']
    assert second.srl_frames == [('rested', rested_tags),
                                 ('hearings', hearings_tags)]
    assert second.named_entities == [
        'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'B-DATE', 'I-DATE',
        'O', 'O', 'O',
    ]
    assert second.predicate_lemmas == [
        None, 'prosecution', 'rest', None, 'case', None, None, None, None,
        'month', None, 'hearing', None,
    ]
    assert second.speakers == [None] * 13
    second_tree = Tree.fromstring(
        "(TOP(S(NP (DT The) (NN prosecution) )(VP "
        "(VBD rested) (NP (PRP$ its) (NN case) )"
        "(NP (JJ last) (NN month) )(PP (IN after) "
        "(NP(NP (CD four) (NNS months) )(PP (IN"
        " of) (NP (NNS hearings) ))))) (. .) ))")
    assert second.parse_tree == second_tree
    assert second.coref_spans == {(2, (0, 1)), (2, (3, 3))}

    # Sentence 2: no verbs at all, so there are no SRL frames.
    third = sentences[2]
    assert third.document_id == 'test/test/03/test_003'
    assert third.sentence_id == 0
    assert third.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
    assert third.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
    assert third.word_senses == [None] * 5
    assert third.predicate_framenet_ids == [None] * 5
    assert third.srl_frames == []
    assert third.named_entities == [
        'B-PERSON', 'I-PERSON', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O',
    ]
    assert third.predicate_lemmas == [None] * 5
    assert third.speakers == [None] * 5
    third_tree = Tree.fromstring(
        "(TOP(FRAG(NP (NNP Denise) "
        " (NNP Dillon) )(NP (NNP Headline) "
        "(NNP News) ) (. .) ))")
    assert third.parse_tree == third_tree
    assert third.coref_spans == {(2, (0, 1))}

    # Sentence 3: the same verb ("is") occurs twice and must give two frames.
    fourth = sentences[3]
    assert fourth.document_id == 'test/test/04/test_004'
    assert fourth.sentence_id == 0
    assert fourth.words == [
        'and', 'that', 'wildness', 'is', 'still', 'in', 'him', ',', 'as',
        'it', 'is', 'with', 'all', 'children', '.',
    ]
    assert fourth.pos_tags == [
        'CC', 'DT', 'NN', 'VBZ', 'RB', 'IN', 'PRP', ',', 'IN', 'PRP', 'VBZ',
        'IN', 'DT', 'NNS', '.',
    ]
    assert fourth.word_senses == [
        None, None, None, 4.0, None, None, None, None, None, None, 5.0,
        None, None, None, None,
    ]
    assert fourth.predicate_framenet_ids == [
        None, None, None, '01', None, None, None, None, None, None, '01',
        None, None, None, None,
    ]
    first_is_tags = ['B-ARGM-DIS', 'B-ARG1', 'I-ARG1', 'B-V', 'B-ARGM-TMP',
                     'B-ARG2', 'I-ARG2', 'O', 'B-ARGM-ADV', 'I-ARGM-ADV',
                     'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV',
                     'O']
    second_is_tags = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ARG1',
                      'B-V', 'B-ARG2', 'I-ARG2', 'I-ARG2', 'O']
    assert fourth.srl_frames == [('is', first_is_tags),
                                 ('is', second_is_tags)]
    assert fourth.named_entities == ['O'] * 15
    assert fourth.predicate_lemmas == [
        None, None, None, 'be', None, None, None, None, None, None, 'be',
        None, None, None, None,
    ]
    assert fourth.speakers == ['_Avalon_'] * 15
    fourth_tree = Tree.fromstring(
        "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
        "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
        "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
        "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
        "children))))))) (. .)))")
    assert fourth.parse_tree == fourth_tree
    assert fourth.coref_spans == {(14, (6, 6))}
j_ind = spans[cluster[j]] span_pairs.add((j_ind, i_ind)) return doc_str, spans, span_pairs if __name__ == '__main__': parser = argparse.ArgumentParser( 'convert conll 2012 format into brat format') parser.add_argument('--inp', type=str, required=True, help='input dir') parser.add_argument('--out', type=str, required=True, help='output dir') args = parser.parse_args() print('reading coref instances from dataset files at: {}'.format(args.inp)) avg_cluster_size = [] ontonotes_reader = Ontonotes() for docid, doc in tqdm( enumerate(ontonotes_reader.dataset_document_iterator(args.inp))): docid += 1 clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list) total_tokens = 0 for sentence in doc: for typed_span in sentence.coref_spans: span_id, (start, end) = typed_span # both start and end are inclusive clusters[span_id].append( (start + total_tokens, end + total_tokens)) total_tokens += len(sentence.words) canonical_clusters = canonicalize_clusters(clusters)
def _read_dataset(self,
                  file_path: str,
                  count_only: bool = False,
                  keep_idx: Optional[Set[int]] = None):
    """
    Generate coreference arc-prediction instances, one per qualifying document.

    A document qualifies when, after restricting clusters to single-token
    mentions, at least two clusters each contain two or more distinct words.

    Parameters
    ----------
    file_path: str, required
        The path to the data file; it is passed through ``cached_path`` first.
    count_only: bool, optional (default=``False``)
        If True, the integer ``1`` is yielded in place of each instance, so
        neither the contextualizer nor ``text_to_instance`` is invoked.
    keep_idx: Set[int], optional (default=``None``)
        If not None, only documents whose running index is in this set are
        turned into instances.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    # Reseed for reproducibility
    self._reseed(seed=self._seed)

    document_index = 0
    ontonotes_reader = Ontonotes()
    for document in ontonotes_reader.dataset_document_iterator(file_path):
        # Coref annotations are on a _per sentence_ basis, so every span is
        # shifted by the number of tokens seen so far in the document.
        clusters: DefaultDict[int, List[Tuple[int, int]]] = \
            collections.defaultdict(list)
        token_offset = 0
        for sentence in document:
            for span_id, (start, end) in sentence.coref_spans:
                clusters[span_id].append(
                    (start + token_offset, end + token_offset))
            token_offset += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)

        words_by_sentence: List[List[str]] = [s.words for s in document]
        tokens: List[str] = [
            self._normalize_word(word)
            for sentence_words in words_by_sentence
            for word in sentence_words
        ]

        # Filter the clusters to only have single-token entities
        # TODO(nfliu): How do we handle spans here?
        filtered_clusters = filter_clusters(canonical_clusters,
                                            max_span_size=1)

        # For every cluster, the word tuple of each mention and the set of
        # distinct word tuples among them.
        words_per_cluster = [
            [tuple(tokens[position]
                   for position in range(span[0], span[1] + 1))
             for span in cluster]
            for cluster in filtered_clusters
        ]
        unique_words_per_cluster = [
            set(mention_words) for mention_words in words_per_cluster
        ]

        # Skip the passage unless at least two clusters have 2+ distinct items.
        usable_clusters = sum(
            1 for unique_words in unique_words_per_cluster
            if len(unique_words) >= 2)
        if usable_clusters < 2:
            continue

        if keep_idx is not None and document_index not in keep_idx:
            document_index += 1
            continue

        if count_only:
            yield 1
            continue

        # Contextualize the tokens if a Contextualizer was provided.
        # TODO (nfliu): How can we make this batched?
        # Would make contextualizers that use the GPU much faster.
        token_representations = (
            self._contextualizer([tokens])[0]
            if self._contextualizer else None)

        assert (len(filtered_clusters) == len(words_per_cluster)
                == len(unique_words_per_cluster))

        # Within each usable cluster, every ordered pair of mentions with
        # different surface strings becomes a positive arc; each positive is
        # followed by one negative arc to a sampled token from another
        # cluster, when the sampler returns one.
        arc_indices: List[Tuple[int, int]] = []
        arc_labels: List[str] = []
        cluster_iter = enumerate(
            zip(filtered_clusters, unique_words_per_cluster))
        for cluster_index, (cluster_spans, unique_words) in cluster_iter:
            # Don't make examples from this if there is only 1 unique item.
            if len(unique_words) < 2:
                continue

            # All (child, parent) pairs where the child does not start before
            # the parent and the two mentions are not the same words.
            # TODO (nfliu): this is single-word specific
            coreferring_pairs = [
                (child, parent)
                for parent in cluster_spans
                for child in cluster_spans
                if child[0] >= parent[0]
                and (tokens[child[0]:child[1] + 1]
                     != tokens[parent[0]:parent[1] + 1])
            ]

            for child, parent in coreferring_pairs:
                # TODO (nfliu): This is single-word specific, will have to
                # change if we generalize to spans
                arc_indices.append((child[0], parent[0]))
                arc_labels.append("1")

                # Generate a negative example for the child.
                rival_clusters = [
                    rival
                    for rival_index, rival in enumerate(filtered_clusters)
                    if rival_index != cluster_index
                ]
                negative_coreferent = self._sample_negative_coreferent(
                    rival_clusters, child[0])
                if negative_coreferent:
                    arc_indices.append((child[0], negative_coreferent[0]))
                    arc_labels.append("0")

        yield self.text_to_instance(
            tokens=tokens,
            arc_indices=arc_indices,
            token_representations=token_representations,
            labels=arc_labels)
        document_index += 1
def test_dataset_iterator(self):
    """
    Read the ``conll_2012`` fixture directory and check every annotation
    layer of its first three sentences (SRL frames keyed by predicate).
    """
    ontonotes = Ontonotes()
    sentences = list(ontonotes.dataset_iterator('tests/fixtures/conll_2012/'))

    # Sentence 0: two different predicates ("say" and "was").
    first = sentences[0]
    assert first.document_id == "test/test/01/test_001"
    assert first.sentence_id == 0
    assert first.words == [
        'Mali', 'government', 'officials', 'say', 'the', 'woman', "'s",
        'confession', 'was', 'forced', '.',
    ]
    assert first.pos_tags == [
        'NNP', 'NN', 'NNS', 'VBP', 'DT', 'NN', 'POS', 'NN', 'VBD', 'JJ', '.',
    ]
    assert first.word_senses == [
        None, None, 1, 1, None, 2, None, None, 1, None, None,
    ]
    assert first.predicate_framenet_ids == [
        None, None, None, '01', None, None, None, None, '01', None, None,
    ]
    first_frames = {
        "say": ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1',
                'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O'],
        "was": ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1',
                'B-V', 'B-ARG2', 'O'],
    }
    assert first.srl_frames == first_frames
    assert first.named_entities == ['B-GPE'] + ['O'] * 10
    assert first.predicate_lemmas == [
        None, None, 'official', 'say', None, 'man', None, None, 'be', None,
        None,
    ]
    assert first.speakers == [None] * 11
    first_tree = Tree.fromstring(
        "(TOP(S(NP(NML (NNP Mali) (NN government) )"
        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
        " (DT the) (NN woman) (POS 's) ) (NN "
        "confession) )(VP (VBD was) (ADJP (JJ "
        "forced) ))))) (. .) ))")
    assert first.parse_tree == first_tree
    assert first.coref_spans == {(1, (4, 6)), (3, (4, 7))}

    # Sentence 1: a verbal and a nominal predicate.
    second = sentences[1]
    assert second.document_id == "test/test/02/test_002"
    assert second.sentence_id == 0
    assert second.words == [
        'The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
        'after', 'four', 'months', 'of', 'hearings', '.',
    ]
    assert second.pos_tags == [
        'DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN', 'IN', 'CD', 'NNS', 'IN',
        'NNS', '.',
    ]
    assert second.word_senses == [
        None, 2, 5, None, 2, None, None, None, None, 1, None, 1, None,
    ]
    assert second.predicate_framenet_ids == [
        None, None, '01', None, None, None, None, None, None, None, None,
        '01', None,
    ]
    second_frames = {
        'rested': ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1',
                   'B-ARGM-TMP', 'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP',
                   'I-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP', 'O'],
        'hearings': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                     'B-V', 'O'],
    }
    assert second.srl_frames == second_frames
    assert second.named_entities == [
        'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'B-DATE', 'I-DATE',
        'O', 'O', 'O',
    ]
    assert second.predicate_lemmas == [
        None, 'prosecution', 'rest', None, 'case', None, None, None, None,
        'month', None, 'hearing', None,
    ]
    assert second.speakers == [None] * 13
    second_tree = Tree.fromstring(
        "(TOP(S(NP (DT The) (NN prosecution) )(VP "
        "(VBD rested) (NP (PRP$ its) (NN case) )"
        "(NP (JJ last) (NN month) )(PP (IN after) "
        "(NP(NP (CD four) (NNS months) )(PP (IN"
        " of) (NP (NNS hearings) ))))) (. .) ))")
    assert second.parse_tree == second_tree
    assert second.coref_spans == {(2, (0, 1)), (2, (3, 3))}

    # Sentence 2: no predicates, so the frame mapping is empty.
    third = sentences[2]
    assert third.document_id == 'test/test/03/test_003'
    assert third.sentence_id == 0
    assert third.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
    assert third.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
    assert third.word_senses == [None] * 5
    assert third.predicate_framenet_ids == [None] * 5
    assert third.srl_frames == {}
    assert third.named_entities == [
        'B-PERSON', 'I-PERSON', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O',
    ]
    assert third.predicate_lemmas == [None] * 5
    assert third.speakers == [None] * 5
    third_tree = Tree.fromstring(
        "(TOP(FRAG(NP (NNP Denise) "
        " (NNP Dillon) )(NP (NNP Headline) "
        "(NNP News) ) (. .) ))")
    assert third.parse_tree == third_tree
    assert third.coref_spans == {(2, (0, 1))}
brat_span_pairs[(predicate, arg_key)] = arg_label return ' '.join(tokens), brat_spans, brat_span_pairs if __name__ == '__main__': parser = argparse.ArgumentParser( 'convert conll 2012 format into brat format') parser.add_argument('--inp', type=str, required=True, help='input dir') parser.add_argument('--out', type=str, required=True, help='output dir') parser.add_argument('--merge', action='store_true', help='merge adjacent same sentences') args = parser.parse_args() ontonotes_reader = Ontonotes() print( 'reading OpenIE instances from dataset files at: {}. The same sentences must be successive' .format(args.inp)) def doc_iter( n_sent ): # treat every n_sent sentence as a document for OpenIE to reduce the number of files doc: List[OntonotesSentence] = [] for conll_file in ontonotes_reader.dataset_path_iterator(args.inp): for sent in ontonotes_reader.sentence_iterator(conll_file): same_as_last = False if args.merge and len(doc) > 0 and ' '.join( sent.words) == ' '.join(doc[-1].words): same_as_last = True