def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    instances = []
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    for sentence in ontonotes_reader.dataset_iterator(file_path):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            instances.append(self.text_to_instance(tokens, verb_label, tags))
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                instances.append(self.text_to_instance(tokens, verb_indicator, tags))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def test_dataset_path_iterator(self):
    reader = Ontonotes()
    files = list(reader.dataset_path_iterator(self.FIXTURES_ROOT / 'conll_2012'))
    expected_paths = [
        str(self.FIXTURES_ROOT / 'conll_2012' / 'subdomain' / 'example.gold_conll'),
        str(self.FIXTURES_ROOT / 'conll_2012' / 'subdomain2' / 'example.gold_conll')
    ]
    assert len(files) == len(expected_paths)
    assert set(files) == set(expected_paths)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    i = 0
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        percent_user_spans = 0.0
        if self._simulate_user_inputs and i >= self._fully_labelled_threshold:
            percent_user_spans = 1.0
        i += 1
        yield self.text_to_instance([s.words for s in sentences],
                                    sentences[0].document_id,
                                    sentences[0].sentence_id,
                                    canonical_clusters,
                                    percent_user_spans)
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    instances = []
    ontonotes_reader = Ontonotes()
    for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        instance = self.text_to_instance([s.words for s in sentences], canonical_clusters)
        instances.append(instance)
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        speakers = []
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
            speakers.append(sentence.speakers)
        doc_key = sentences[0].document_id
        genre = self.genres[doc_key[:2]]
        speakers = self.flatten(speakers)
        assert total_tokens == len(speakers)
        speaker_dict = {s: i for i, s in enumerate(set(speakers))}
        speaker_ids = np.array([speaker_dict[s] for s in speakers])
        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters,
                                    speaker_ids, genre)
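The reader above collapses per-sentence speaker annotations into integer ids. A minimal, self-contained sketch of that mapping, using a made-up speaker list (the names are hypothetical, not taken from the fixture data):

import numpy as np

# Hypothetical flattened per-token speaker list for a two-speaker document.
speakers = ["_Avalon_", "_Avalon_", "speaker#2", "speaker#2", "_Avalon_"]

# Give each distinct speaker an arbitrary integer id, then map every token to its id,
# as the reader does after flattening `sentence.speakers`.
speaker_dict = {s: i for i, s in enumerate(set(speakers))}
speaker_ids = np.array([speaker_dict[s] for s in speakers])

assert len(speaker_ids) == len(speakers)
assert len(set(speaker_ids.tolist())) == 2  # two distinct speakers -> two ids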
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        new_sentences = [s.words for s in sentences]
        flattened_sentences = [self._normalize_word(word)
                               for sentence in new_sentences
                               for word in sentence]

        def tokenizer(s: str):
            return self.token_indexer.wordpiece_tokenizer(s)

        flattened_sentences = tokenizer(" ".join(flattened_sentences))
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                # for i in range(len(tags)):
                #     if tags[i] != 'O':
                #         tags[i] = 'I-ARG1'
                yield self.text_to_instance(tokens, verb_indicator, tags)
def main(args):
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--ontonotes", type=str, required=True,
                        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0")
    parser.add_argument("--tasks", type=str, nargs="+",
                        help="Tasks, one or more of {const, coref, ner, srl}.")
    parser.add_argument("--splits", type=str, nargs="+",
                        default=["train", "development", "test", "conll-2012-test"],
                        help="Splits, one or more of {train, development, test, conll-2012-test}.")
    parser.add_argument("-o", dest="output_dir", type=str, default=".",
                        help="Output directory for JSON files.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    import pandas as pd
    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        for task in args.tasks:
            source_path = os.path.join(args.ontonotes, "data", split)
            print('########### Reading ontonotes split from', source_path)
            ontonotes_reader = ontonotes.dataset_iterator(file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            if not os.path.isdir(task_dir):
                os.mkdir(task_dir)
            target_fname = os.path.join(task_dir, f"{split}.json")
            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader), task, ontonotes_stats)

            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)
            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        pos_tags = [t for t in sentence.pos_tags]
        tokens = [Token(t, None, None, pos_tags[i]) for i, t in enumerate(sentence.words)]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                verb_indices = np.where(np.array(verb_indicator) == 1)[0]
                if len(verb_indices) > 0:
                    verb_index = int(verb_indices[0])
                    verb = tokens[verb_index]
                else:
                    verb_index = -1
                    verb = ''
                for i, tag in enumerate(tags):
                    if tag[0] == 'B':
                        tags[i] = tags[i].replace('B', 'I', 1)
                    if self.used_tags is not None and tags[i] not in self.used_tags:
                        tags[i] = 'O'
                instance = self.text_to_instance([verb] + tokens,
                                                 [0] + verb_indicator,
                                                 ['O'] + tags)
                if self.dependency_parse:
                    doc = self.nlp(' '.join(sentence.words))
                    instance.add_field('dependency', MetadataField(doc))
                instance.add_field('verb_index', IndexField(verb_index, instance['tokens']))
                yield instance
def _read(self, file_path: str): """OntoNotes custom reader to load spans from dependency pares tree as well""" # if `file_path` is a URL, redirect to the cache file_path = cached_path(file_path) ontonotes_reader = Ontonotes() logger.info("Reading SRL instances from dataset files at: %s", file_path) if self._domain_identifier is not None: logger.info( "Filtering to only include file paths containing the %s domain", self._domain_identifier) for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier): # skip samples without dep' parse tree if not sentence.parse_tree: continue # extract dep' parse tree spans spans = set() for subtree in sentence.parse_tree.subtrees(): if subtree.height() > 0: # TODO: check how to output indices instead of words # (for extreme cases where different tuples could match) spans.add(tuple(subtree.leaves())) tokens = [Token(t) for t in sentence.words] if sentence.srl_frames: for (_, tags) in sentence.srl_frames: verb_indicator = [ 1 if label[-2:] == "-V" else 0 for label in tags ] yield self.text_to_instance_with_spans( tokens, verb_indicator, tags, spans)
def test_dataset_path_iterator(self):
    reader = Ontonotes()
    files = list(reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
    assert files == [
        'tests/fixtures/conll_2012/subdomain/example.gold_conll',
        'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
    ]
def test_dataset_path_iterator(self):
    reader = Ontonotes()
    files = list(reader.dataset_path_iterator(self.FIXTURES_ROOT / "conll_2012"))
    expected_paths = [
        str(self.FIXTURES_ROOT / "conll_2012" / "subdomain" / "example.gold_conll"),
        str(self.FIXTURES_ROOT / "conll_2012" / "subdomain2" / "example.gold_conll"),
    ]
    assert len(files) == len(expected_paths)
    assert set(files) == set(expected_paths)
def test_dataset_path_iterator(self):
    reader = Ontonotes()
    files = list(reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
    expected_paths = [
        'tests/fixtures/conll_2012/subdomain/example.gold_conll',
        'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
    ]
    assert len(files) == len(expected_paths)
    assert set(files) == set(expected_paths)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    # Pdb().set_trace()
    data_split = os.path.basename(os.path.normpath(file_path))
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    # Set random seed if percent is not 100
    if self.percent_data < 100:
        random.seed(self.random_data_seed)
    # Write sentence, parse tree, span matrix to file
    # fout = open(f"srl_spans_{data_split}.pkl", "wb")
    print(f"return_labels: {self.return_labels}")
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        if self.percent_data < 100 and data_split == "train":
            select_data = random.randint(1, 101)
            if select_data > self.percent_data:
                continue
        tokens = [Token(t) for t in sentence.words]
        parseTree = sentence.parse_tree  # Convert tree to span list
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            verb_label = [0 for _ in tokens]
            if self.return_labels:
                tags = ["O" for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, parseTree, tags)
            else:
                yield self.text_to_instance(tokens, verb_label, parseTree, None)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                if self.return_labels:
                    yield self.text_to_instance(tokens, verb_indicator, parseTree, tags)
                else:
                    yield self.text_to_instance(tokens, verb_indicator, parseTree, None)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        ##########################
        result = self.dependency_tree_predictor.predict(sentence=" ".join(sentence.words))
        # print(result['words'])
        root_dict = result['hierplane_tree']['root']
        adj = {}
        self.traverse_tree(adj, root_dict['word'], root_dict)
        predicte_adj = {}
        #########################
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, adj, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                #############################################
                verb_index = verb_indicator.index(1)
                predicte = sentence.words[verb_index]
                if predicte in adj:
                    predicte_adj[predicte] = adj[predicte]
                    # This could cause an infinite loop (appending to the list being iterated).
                    for i in predicte_adj[predicte]:
                        if i in adj:
                            for j in adj[i]:
                                predicte_adj[predicte].append(j)
                    yield self.text_to_instance(tokens, verb_indicator, predicte_adj, tags)
                else:
                    # print(" ".join(sentence.words))
                    # print(adj)
                    yield self.text_to_instance(tokens, verb_indicator, adj, tags)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading Fine-Grained NER instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(_normalize_word(t)) for t in sentence.words]
        yield self.text_to_instance(tokens, sentence.named_entities)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    for sentence in ontonotes_reader.dataset_iterator(file_path):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                yield self.text_to_instance(tokens, verb_indicator, tags)
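All of the SRL readers in this section derive a binary verb indicator from the BIO tags of each frame. A minimal, self-contained sketch of that step, using a made-up sentence and tag sequence (not taken from any fixture):

# Hypothetical example sentence and SRL tags for one frame.
words = ["officials", "say", "the", "confession", "was", "forced"]
tags = ["B-ARG0", "B-V", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1"]

# A token is marked 1 only where its label ends in "-V", i.e. the predicate itself.
verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
assert verb_indicator == [0, 1, 0, 0, 0, 0]

# The indicator picks out the predicate word for this frame.
predicate_words = [w for w, v in zip(words, verb_indicator) if v]
assert predicate_words == ["say"]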
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        ##########################
        result = self.dependency_tree_predictor.predict(sentence=" ".join(sentence.words))
        predicted_heads = result["predicted_heads"]
        #########################
        if not sentence.srl_frames:
            # Sentence contains no predicates, so there is no predicate-specific
            # adjacency for this sentence; use an empty dict.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            adj = {}
            yield self.text_to_instance(tokens, verb_label, adj, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                verb_index = verb_indicator.index(1)
                # #############################################
                adj = {}
                self.traverse_predicted_heads(adj, predicted_heads, verb_index + 1)
                # Some verbs have no relations; add a self-loop so the ListField
                # built later does not fail on an empty entry.
                adj[verb_index + 1].append(verb_index + 1)
                ##############################################
                # verb_index = verb_indicator.index(1)
                # for i in range(len(tags)):
                #     if '0' in tags[i]:
                #         tags[i] = 'B-ARG0'
                #     elif tags[i] != 'O' and i != verb_index:
                #         tags[i] = 'B-ARG1'
                yield self.text_to_instance(tokens, verb_indicator, adj, tags)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances along with constituent parse from data files at: %s",
                file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        parse = sentence.parse_tree
        if parse:
            pos_tags = [x[1] for x in parse.pos()]
            # yield self.text_to_instance(parse.leaves(), [x[1] for x in parse.pos()], parse)
        else:
            # Parse information is missing for this sentence.
            parse = None
            pos_tags = None
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags, pos_tags, parse)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                yield self.text_to_instance(tokens, verb_indicator, tags, pos_tags, parse)
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
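The coreference readers above all shift sentence-relative span offsets into document-relative offsets before canonicalizing clusters. A standalone sketch of that adjustment on a hypothetical two-sentence document:

import collections

# Hypothetical document: coref spans are (cluster_id, (start, end)) tuples with
# indices relative to each sentence, mirroring `sentence.coref_spans` above.
sentence_words = [["The", "president", "spoke", "."], ["He", "left", "."]]
sentence_coref_spans = [{(0, (0, 1))}, {(0, (0, 0))}]

clusters = collections.defaultdict(list)
total_tokens = 0
for words, coref_spans in zip(sentence_words, sentence_coref_spans):
    for span_id, (start, end) in coref_spans:
        # Shift sentence-relative offsets so they index into the whole document.
        clusters[span_id].append((start + total_tokens, end + total_tokens))
    total_tokens += len(words)

# "The president" stays at (0, 1); "He" moves from (0, 0) to (4, 4).
assert clusters[0] == [(0, 1), (4, 4)]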
def _read(self, file_path: str):
    file_path = cached_path(file_path)  # if `file_path` is a URL, redirect to the cache
    ontonotes_reader = Ontonotes()
    logger.info("Reading NER instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)
    for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                           self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.named_entities:
            tags = ["O" for _ in tokens]
        else:
            tags = sentence.named_entities
        if self._coding_scheme == "BIOUL":
            tags = iob1_to_bioul(tags)
        yield self.text_to_instance(tokens, tags)
def test_dataset_iterator(self):
    reader = Ontonotes()
    annotated_sentences = list(reader.dataset_iterator('tests/fixtures/conll_2012/'))

    annotation = annotated_sentences[0]
    assert annotation.document_id == "test/test/01/test_001"
    assert annotation.sentence_id == 0
    assert annotation.words == ['Mali', 'government', 'officials', 'say', 'the', 'woman',
                                "'s", 'confession', 'was', 'forced', '.']
    assert annotation.pos_tags == ['NNP', 'NN', 'NNS', 'VBP', 'DT', 'NN', 'POS', 'NN',
                                   'VBD', 'JJ', '.']
    assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, '01', None, None, None,
                                                 None, '01', None, None]
    assert annotation.srl_frames == {
        "say": ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1',
                'I-ARG1', 'I-ARG1', 'I-ARG1', 'O'],
        "was": ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'B-V',
                'B-ARG2', 'O']
    }
    assert annotation.named_entities == ['B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                         'O', 'O']
    assert annotation.predicate_lemmas == [None, None, 'official', 'say', None, 'man',
                                           None, None, 'be', None, None]
    assert annotation.speakers == [None, None, None, None, None, None, None, None, None,
                                   None, None]
    assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP(NML (NNP Mali) (NN government) )"
                                                    " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
                                                    " (DT the) (NN woman) (POS 's) ) (NN "
                                                    "confession) )(VP (VBD was) (ADJP (JJ "
                                                    "forced) ))))) (. .) ))")
    assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

    annotation = annotated_sentences[1]
    assert annotation.document_id == "test/test/02/test_002"
    assert annotation.sentence_id == 0
    assert annotation.words == ['The', 'prosecution', 'rested', 'its', 'case', 'last',
                                'month', 'after', 'four', 'months', 'of', 'hearings', '.']
    assert annotation.pos_tags == ['DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN', 'IN', 'CD',
                                   'NNS', 'IN', 'NNS', '.']
    assert annotation.word_senses == [None, 2, 5, None, 2, None, None, None, None, 1,
                                      None, 1, None]
    assert annotation.predicate_framenet_ids == [None, None, '01', None, None, None, None,
                                                 None, None, None, None, '01', None]
    assert annotation.srl_frames == {
        'rested': ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'B-ARGM-TMP',
                   'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                   'I-ARGM-TMP', 'O'],
        'hearings': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-V', 'O']
    }
    assert annotation.named_entities == ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O',
                                         'B-DATE', 'I-DATE', 'O', 'O', 'O']
    assert annotation.predicate_lemmas == [None, 'prosecution', 'rest', None, 'case', None,
                                           None, None, None, 'month', None, 'hearing', None]
    assert annotation.speakers == [None, None, None, None, None, None, None, None, None,
                                   None, None, None, None]
    assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP (DT The) (NN prosecution) )(VP "
                                                    "(VBD rested) (NP (PRP$ its) (NN case) )"
                                                    "(NP (JJ last) (NN month) )(PP (IN after) "
                                                    "(NP(NP (CD four) (NNS months) )(PP (IN"
                                                    " of) (NP (NNS hearings) ))))) (. .) ))")
    assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

    annotation = annotated_sentences[2]
    assert annotation.document_id == 'test/test/03/test_003'
    assert annotation.sentence_id == 0
    assert annotation.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
    assert annotation.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
    assert annotation.word_senses == [None, None, None, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, None, None]
    assert annotation.srl_frames == {}
    assert annotation.named_entities == ['B-PERSON', 'I-PERSON', 'B-WORK_OF_ART',
                                         'I-WORK_OF_ART', 'O']
    assert annotation.predicate_lemmas == [None, None, None, None, None]
    assert annotation.speakers == [None, None, None, None, None]
    assert annotation.parse_tree == Tree.fromstring("(TOP(FRAG(NP (NNP Denise) "
                                                    " (NNP Dillon) )(NP (NNP Headline) "
                                                    "(NNP News) ) (. .) ))")
    assert annotation.coref_spans == {(2, (0, 1))}
def test_dataset_iterator(self):
    reader = Ontonotes()
    annotated_sentences = list(
        reader.dataset_iterator(self.FIXTURES_ROOT / "conll_2012" / "subdomain")
    )

    annotation = annotated_sentences[0]
    assert annotation.document_id == "test/test/01/test_001"
    assert annotation.sentence_id == 0
    assert annotation.words == ["Mali", "government", "officials", "say", "the", "woman",
                                "'s", "confession", "was", "forced", "."]
    assert annotation.pos_tags == ["NNP", "NN", "NNS", "VBP", "DT", "NN", "POS", "NN",
                                   "VBD", "JJ", "."]
    assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, "01", None, None, None,
                                                 None, "01", None, None]
    assert annotation.srl_frames == [
        ("say", ["B-ARG0", "I-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1", "I-ARG1",
                 "I-ARG1", "I-ARG1", "I-ARG1", "O"]),
        ("was", ["O", "O", "O", "O", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "B-V",
                 "B-ARG2", "O"]),
    ]
    assert annotation.named_entities == ["B-GPE", "O", "O", "O", "O", "O", "O", "O", "O",
                                         "O", "O"]
    assert annotation.predicate_lemmas == [None, None, "official", "say", None, "man",
                                           None, None, "be", None, None]
    assert annotation.speakers == [None, None, None, None, None, None, None, None, None,
                                   None, None]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(S(NP(NML (NNP Mali) (NN government) )"
        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
        " (DT the) (NN woman) (POS 's) ) (NN "
        "confession) )(VP (VBD was) (ADJP (JJ "
        "forced) ))))) (. .) ))"
    )
    assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

    annotation = annotated_sentences[1]
    assert annotation.document_id == "test/test/02/test_002"
    assert annotation.sentence_id == 0
    assert annotation.words == ["The", "prosecution", "rested", "its", "case", "last",
                                "month", "after", "four", "months", "of", "hearings", "."]
    assert annotation.pos_tags == ["DT", "NN", "VBD", "PRP$", "NN", "JJ", "NN", "IN", "CD",
                                   "NNS", "IN", "NNS", "."]
    assert annotation.word_senses == [None, 2, 5, None, 2, None, None, None, None, 1,
                                      None, 1, None]
    assert annotation.predicate_framenet_ids == [None, None, "01", None, None, None, None,
                                                 None, None, None, None, "01", None]
    assert annotation.srl_frames == [
        ("rested", ["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1", "B-ARGM-TMP",
                    "I-ARGM-TMP", "B-ARGM-TMP", "I-ARGM-TMP", "I-ARGM-TMP", "I-ARGM-TMP",
                    "I-ARGM-TMP", "O"]),
        ("hearings", ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-V", "O"]),
    ]
    assert annotation.named_entities == ["O", "O", "O", "O", "O", "B-DATE", "I-DATE", "O",
                                         "B-DATE", "I-DATE", "O", "O", "O"]
    assert annotation.predicate_lemmas == [None, "prosecution", "rest", None, "case", None,
                                           None, None, None, "month", None, "hearing", None]
    assert annotation.speakers == [None, None, None, None, None, None, None, None, None,
                                   None, None, None, None]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(S(NP (DT The) (NN prosecution) )(VP "
        "(VBD rested) (NP (PRP$ its) (NN case) )"
        "(NP (JJ last) (NN month) )(PP (IN after) "
        "(NP(NP (CD four) (NNS months) )(PP (IN"
        " of) (NP (NNS hearings) ))))) (. .) ))"
    )
    assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

    # Check we can handle sentences without verbs.
    annotation = annotated_sentences[2]
    assert annotation.document_id == "test/test/03/test_003"
    assert annotation.sentence_id == 0
    assert annotation.words == ["Denise", "Dillon", "Headline", "News", "."]
    assert annotation.pos_tags == ["NNP", "NNP", "NNP", "NNP", "."]
    assert annotation.word_senses == [None, None, None, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, None, None]
    assert annotation.srl_frames == []
    assert annotation.named_entities == ["B-PERSON", "I-PERSON", "B-WORK_OF_ART",
                                         "I-WORK_OF_ART", "O"]
    assert annotation.predicate_lemmas == [None, None, None, None, None]
    assert annotation.speakers == [None, None, None, None, None]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(FRAG(NP (NNP Denise) "
        " (NNP Dillon) )(NP (NNP Headline) "
        "(NNP News) ) (. .) ))"
    )
    assert annotation.coref_spans == {(2, (0, 1))}

    # Check we can handle sentences with 2 identical verbs.
    annotation = annotated_sentences[3]
    assert annotation.document_id == "test/test/04/test_004"
    assert annotation.sentence_id == 0
    assert annotation.words == ["and", "that", "wildness", "is", "still", "in", "him", ",",
                                "as", "it", "is", "with", "all", "children", "."]
    assert annotation.pos_tags == ["CC", "DT", "NN", "VBZ", "RB", "IN", "PRP", ",", "IN",
                                   "PRP", "VBZ", "IN", "DT", "NNS", "."]
    assert annotation.word_senses == [None, None, None, 4.0, None, None, None, None, None,
                                      None, 5.0, None, None, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, "01", None, None, None,
                                                 None, None, None, "01", None, None, None,
                                                 None]
    assert annotation.srl_frames == [
        ("is", ["B-ARGM-DIS", "B-ARG1", "I-ARG1", "B-V", "B-ARGM-TMP", "B-ARG2", "I-ARG2",
                "O", "B-ARGM-ADV", "I-ARGM-ADV", "I-ARGM-ADV", "I-ARGM-ADV", "I-ARGM-ADV",
                "I-ARGM-ADV", "O"]),
        ("is", ["O", "O", "O", "O", "O", "O", "O", "O", "O", "B-ARG1", "B-V", "B-ARG2",
                "I-ARG2", "I-ARG2", "O"]),
    ]
    assert annotation.named_entities == ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
                                         "O", "O", "O", "O", "O"]
    assert annotation.predicate_lemmas == [None, None, None, "be", None, None, None, None,
                                           None, None, "be", None, None, None, None]
    assert annotation.speakers == ["_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_",
                                   "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_",
                                   "_Avalon_", "_Avalon_", "_Avalon_", "_Avalon_",
                                   "_Avalon_", "_Avalon_", "_Avalon_"]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
        "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
        "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
        "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
        "children))))))) (. .)))"
    )
    assert annotation.coref_spans == {(14, (6, 6))}
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    reader = Ontonotes()
    file_path = self.FIXTURES_ROOT / 'coref' / 'coref.gold_conll'
    documents = list(reader.dataset_document_iterator(file_path))
    assert len(documents) == 2
            brat_span_pairs[(predicate, arg_key)] = arg_label
    return ' '.join(tokens), brat_spans, brat_span_pairs


if __name__ == '__main__':
    parser = argparse.ArgumentParser('convert conll 2012 format into brat format')
    parser.add_argument('--inp', type=str, required=True, help='input dir')
    parser.add_argument('--out', type=str, required=True, help='output dir')
    parser.add_argument('--merge', action='store_true', help='merge adjacent same sentences')
    args = parser.parse_args()

    ontonotes_reader = Ontonotes()
    print('reading OpenIE instances from dataset files at: {}. '
          'The same sentences must be successive'.format(args.inp))

    def doc_iter(n_sent):
        # Treat every n_sent sentences as a document for OpenIE
        # to reduce the number of files.
        doc: List[OntonotesSentence] = []
        for conll_file in ontonotes_reader.dataset_path_iterator(args.inp):
            for sent in ontonotes_reader.sentence_iterator(conll_file):
                same_as_last = False
                if args.merge and len(doc) > 0 and ' '.join(sent.words) == ' '.join(doc[-1].words):
                    same_as_last = True
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    reader = Ontonotes()
    file_path = 'tests/fixtures/coref/coref.gold_conll'
    documents = list(reader.dataset_document_iterator(file_path))
    assert len(documents) == 2
def _read_dataset(self, file_path: str, count_only: bool = False,
                  keep_idx: Optional[Set[int]] = None):
    """
    Yield instances from the file_path.

    Parameters
    ----------
    file_path: str, required
        The path to the data file.
    count_only: bool, optional (default=``False``)
        If True, no instances are returned and instead a dummy object is
        returned. This is useful for quickly counting the number of instances
        in the data file, since creating instances is relatively expensive.
    keep_idx: Set[int], optional (default=``None``)
        If not None, only yield instances whose index is in this set.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    # Reseed for reproducibility
    self._reseed(seed=self._seed)
    index = 0
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)

        text_sentences: List[List[str]] = [s.words for s in sentences]
        flattened_text_sentences: List[str] = [self._normalize_word(word)
                                               for text_sentence in text_sentences
                                               for word in text_sentence]
        sentence_arc_indices: List[Tuple[int, int]] = []
        sentence_labels: List[str] = []

        # Filter the clusters to only have single-token entities
        # TODO(nfliu): How do we handle spans here?
        filtered_clusters = filter_clusters(canonical_clusters, max_span_size=1)

        # Check if there are at least two clusters, each of which has at least 2 different items.
        # If not, then skip creating examples from this passage.
        counter = 0
        all_cluster_words = []
        all_cluster_unique_words = []
        for cluster in filtered_clusters:
            # Get the words that show up in the cluster
            cluster_words = list(tuple(flattened_text_sentences[index]
                                       for index in range(item[0], item[1] + 1))
                                 for item in cluster)
            all_cluster_words.append(cluster_words)
            cluster_unique_words = set(cluster_words)
            all_cluster_unique_words.append(cluster_unique_words)
            if len(set(cluster_words)) >= 2:
                counter += 1
        if counter < 2:
            continue
        if keep_idx is not None and index not in keep_idx:
            index += 1
            continue
        if count_only:
            yield 1
            continue

        # Contextualize the tokens if a Contextualizer was provided.
        # TODO (nfliu): How can we make this batched?
        # Would make contextualizers that use the GPU much faster.
        if self._contextualizer:
            token_representations = self._contextualizer([flattened_text_sentences])[0]
        else:
            token_representations = None

        # For each cluster with 2+ different items, make positive examples between each of
        # the different items that are different strings and make negative examples between
        # each of the different items and a random token from another cluster.
        assert ((len(filtered_clusters) == len(all_cluster_words)) &
                (len(all_cluster_words) == len(all_cluster_unique_words)))
        for cluster_index, (cluster_spans, cluster_words, cluster_unique_words) in enumerate(
                zip(filtered_clusters, all_cluster_words, all_cluster_unique_words)):
            # Don't make examples from this if there is only 1 unique item.
            if len(cluster_unique_words) < 2:
                continue
            # Get all combinations of cluster spans (a, b), where a occurs
            # in the text before b.
            all_coreferring_spans = []
            for parent_cluster_span in cluster_spans:
                for child_cluster_span in cluster_spans:
                    # Skip child_cluster_span if it occurs before the parent_span.
                    # TODO (nfliu): this is single-word specific
                    if child_cluster_span[0] < parent_cluster_span[0]:
                        continue
                    # Skip this (child_cluster_span, parent_cluster_span) pair if the words are identical
                    if (flattened_text_sentences[child_cluster_span[0]:child_cluster_span[1] + 1] ==
                            flattened_text_sentences[parent_cluster_span[0]:parent_cluster_span[1] + 1]):
                        continue
                    # Add to the set of coreference candidates
                    all_coreferring_spans.append((child_cluster_span, parent_cluster_span))

            # Take the coreference_candidates and generate positive and negative examples
            for (child_span, parent_span) in all_coreferring_spans:
                # TODO (nfliu): This is single-word specific, will have to change
                # if we generalize to spans
                sentence_arc_indices.append((child_span[0], parent_span[0]))
                sentence_labels.append("1")
                # Generate a negative example for the child.
                other_clusters = [cluster for i, cluster in enumerate(filtered_clusters)
                                  if i != cluster_index]
                negative_coreferent = self._sample_negative_coreferent(other_clusters,
                                                                       child_span[0])
                if negative_coreferent:
                    sentence_arc_indices.append((child_span[0], negative_coreferent[0]))
                    sentence_labels.append("0")

        yield self.text_to_instance(tokens=flattened_text_sentences,
                                    arc_indices=sentence_arc_indices,
                                    token_representations=token_representations,
                                    labels=sentence_labels)
        index += 1
def test_dataset_iterator(self):
    reader = Ontonotes()
    annotated_sentences = list(
        reader.dataset_iterator('tests/fixtures/conll_2012/subdomain/'))

    annotation = annotated_sentences[0]
    assert annotation.document_id == "test/test/01/test_001"
    assert annotation.sentence_id == 0
    assert annotation.words == ['Mali', 'government', 'officials', 'say', 'the', 'woman',
                                "'s", 'confession', 'was', 'forced', '.']
    assert annotation.pos_tags == ['NNP', 'NN', 'NNS', 'VBP', 'DT', 'NN', 'POS', 'NN',
                                   'VBD', 'JJ', '.']
    assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, '01', None, None, None,
                                                 None, '01', None, None]
    assert annotation.srl_frames == [
        ("say", ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1',
                 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O']),
        ("was", ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'B-V',
                 'B-ARG2', 'O'])
    ]
    assert annotation.named_entities == ['B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                         'O', 'O']
    assert annotation.predicate_lemmas == [None, None, 'official', 'say', None, 'man',
                                           None, None, 'be', None, None]
    assert annotation.speakers == [None, None, None, None, None, None, None, None, None,
                                   None, None]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(S(NP(NML (NNP Mali) (NN government) )"
        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
        " (DT the) (NN woman) (POS 's) ) (NN "
        "confession) )(VP (VBD was) (ADJP (JJ "
        "forced) ))))) (. .) ))")
    assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

    annotation = annotated_sentences[1]
    assert annotation.document_id == "test/test/02/test_002"
    assert annotation.sentence_id == 0
    assert annotation.words == ['The', 'prosecution', 'rested', 'its', 'case', 'last',
                                'month', 'after', 'four', 'months', 'of', 'hearings', '.']
    assert annotation.pos_tags == ['DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN', 'IN', 'CD',
                                   'NNS', 'IN', 'NNS', '.']
    assert annotation.word_senses == [None, 2, 5, None, 2, None, None, None, None, 1,
                                      None, 1, None]
    assert annotation.predicate_framenet_ids == [None, None, '01', None, None, None, None,
                                                 None, None, None, None, '01', None]
    assert annotation.srl_frames == [
        ('rested', ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'B-ARGM-TMP',
                    'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                    'I-ARGM-TMP', 'O']),
        ('hearings', ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-V', 'O'])
    ]
    assert annotation.named_entities == ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O',
                                         'B-DATE', 'I-DATE', 'O', 'O', 'O']
    assert annotation.predicate_lemmas == [None, 'prosecution', 'rest', None, 'case', None,
                                           None, None, None, 'month', None, 'hearing', None]
    assert annotation.speakers == [None, None, None, None, None, None, None, None, None,
                                   None, None, None, None]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(S(NP (DT The) (NN prosecution) )(VP "
        "(VBD rested) (NP (PRP$ its) (NN case) )"
        "(NP (JJ last) (NN month) )(PP (IN after) "
        "(NP(NP (CD four) (NNS months) )(PP (IN"
        " of) (NP (NNS hearings) ))))) (. .) ))")
    assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

    # Check we can handle sentences without verbs.
    annotation = annotated_sentences[2]
    assert annotation.document_id == 'test/test/03/test_003'
    assert annotation.sentence_id == 0
    assert annotation.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
    assert annotation.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
    assert annotation.word_senses == [None, None, None, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, None, None]
    assert annotation.srl_frames == []
    assert annotation.named_entities == ['B-PERSON', 'I-PERSON', 'B-WORK_OF_ART',
                                         'I-WORK_OF_ART', 'O']
    assert annotation.predicate_lemmas == [None, None, None, None, None]
    assert annotation.speakers == [None, None, None, None, None]
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP(FRAG(NP (NNP Denise) "
        " (NNP Dillon) )(NP (NNP Headline) "
        "(NNP News) ) (. .) ))")
    assert annotation.coref_spans == {(2, (0, 1))}

    # Check we can handle sentences with 2 identical verbs.
    annotation = annotated_sentences[3]
    assert annotation.document_id == 'test/test/04/test_004'
    assert annotation.sentence_id == 0
    assert annotation.words == ['and', 'that', 'wildness', 'is', 'still', 'in', 'him', ',',
                                'as', 'it', 'is', 'with', 'all', 'children', '.']
    assert annotation.pos_tags == ['CC', 'DT', 'NN', 'VBZ', 'RB', 'IN', 'PRP', ',', 'IN',
                                   'PRP', 'VBZ', 'IN', 'DT', 'NNS', '.']
    assert annotation.word_senses == [None, None, None, 4.0, None, None, None, None, None,
                                      None, 5.0, None, None, None, None]
    assert annotation.predicate_framenet_ids == [None, None, None, '01', None, None, None,
                                                 None, None, None, '01', None, None, None,
                                                 None]
    assert annotation.srl_frames == [
        ('is', ['B-ARGM-DIS', 'B-ARG1', 'I-ARG1', 'B-V', 'B-ARGM-TMP', 'B-ARG2', 'I-ARG2',
                'O', 'B-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV',
                'I-ARGM-ADV', 'O']),
        ('is', ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ARG1', 'B-V', 'B-ARG2',
                'I-ARG2', 'I-ARG2', 'O'])
    ]
    assert annotation.named_entities == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                         'O', 'O', 'O', 'O', 'O']
    assert annotation.predicate_lemmas == [None, None, None, 'be', None, None, None, None,
                                           None, None, 'be', None, None, None, None]
    assert annotation.speakers == ['_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
                                   '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
                                   '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
                                   '_Avalon_', '_Avalon_', '_Avalon_']
    assert annotation.parse_tree == Tree.fromstring(
        "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
        "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
        "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
        "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
        "children))))))) (. .)))")
    assert annotation.coref_spans == {(14, (6, 6))}
def __init__(self):
    self.onto_reader = Ontonotes()
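For orientation, a hedged usage sketch of the `Ontonotes` helper as it is exercised throughout these snippets. The data directory is a placeholder, and the import path shown is the one used by older AllenNLP releases (newer versions ship the class in `allennlp_models` instead):

# Import path for older AllenNLP releases; adjust for your version.
from allennlp.data.dataset_readers.dataset_utils import Ontonotes

reader = Ontonotes()
data_dir = "/path/to/conll-formatted-ontonotes-5.0/data/train"  # placeholder path

for conll_file in reader.dataset_path_iterator(data_dir):
    # Sentence-level access, as the SRL and NER readers above use.
    for sentence in reader.sentence_iterator(conll_file):
        print(sentence.document_id, sentence.words[:5], len(sentence.srl_frames))
    # Document-level access (a list of sentences per document), as the coref readers use.
    for document in reader.dataset_document_iterator(conll_file):
        print(document[0].document_id, sum(len(s.words) for s in document))
    break  # only inspect the first file in this sketch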