def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        new_sentences = [s.words for s in sentences]
        flattened_sentences = [
            self._normalize_word(word)
            for sentence in new_sentences
            for word in sentence
        ]

        def tokenizer(s: str):
            return self.token_indexer.wordpiece_tokenizer(s)

        flattened_sentences = tokenizer(" ".join(flattened_sentences))
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    i = 0
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        percent_user_spans = 0.0
        if self._simulate_user_inputs and i >= self._fully_labelled_threshold:
            percent_user_spans = 1.0
        i += 1
        yield self.text_to_instance([s.words for s in sentences],
                                    sentences[0].document_id,
                                    sentences[0].sentence_id,
                                    canonical_clusters,
                                    percent_user_spans)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        speakers = []
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
            speakers.append(sentence.speakers)
        doc_key = sentences[0].document_id
        genre = self.genres[doc_key[:2]]
        speakers = self.flatten(speakers)
        assert total_tokens == len(speakers)
        speaker_dict = {s: i for i, s in enumerate(set(speakers))}
        speaker_ids = np.array([speaker_dict[s] for s in speakers])
        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences],
                                    canonical_clusters, speaker_ids, genre)
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    instances = []
    ontonotes_reader = Ontonotes()
    for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        instance = self.text_to_instance([s.words for s in sentences], canonical_clusters)
        instances.append(instance)
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
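Every reader above hands the raw `span_id -> spans` mapping to `canonicalize_clusters` to obtain a list of document-level clusters. A minimal sketch of the merging behaviour that call is expected to perform is shown below; `merge_overlapping_clusters` is a hypothetical stand-in name, and this is an illustrative approximation rather than the library's actual implementation.

from typing import Dict, List, Set, Tuple


def merge_overlapping_clusters(
        clusters: Dict[int, List[Tuple[int, int]]]) -> List[List[Tuple[int, int]]]:
    # OntoNotes occasionally annotates the same span under two different cluster
    # ids, so any clusters that share a span are merged into a single cluster.
    merged: List[Set[Tuple[int, int]]] = []
    for spans in clusters.values():
        span_set = set(spans)
        overlapping = [cluster for cluster in merged if cluster & span_set]
        for cluster in overlapping:
            span_set |= cluster
            merged.remove(cluster)
        merged.append(span_set)
    return [sorted(cluster) for cluster in merged]


# Toy usage: cluster ids 0 and 1 share the span (5, 5), so they collapse into one.
raw = {0: [(0, 1), (5, 5)], 1: [(5, 5), (9, 10)], 2: [(20, 21)]}
print(merge_overlapping_clusters(raw))
# -> [[(0, 1), (5, 5), (9, 10)], [(20, 21)]]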
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    reader = Ontonotes()
    file_path = 'tests/fixtures/coref/coref.gold_conll'
    documents = list(reader.dataset_document_iterator(file_path))
    assert len(documents) == 2
def _read_dataset(self, file_path: str, count_only: bool = False, keep_idx: Optional[Set[int]] = None):
    """
    Yield instances from the file_path.

    Parameters
    ----------
    file_path: str, required
        The path to the data file.
    count_only: bool, optional (default=``False``)
        If True, no instances are returned and instead a dummy object is
        returned. This is useful for quickly counting the number of instances
        in the data file, since creating instances is relatively expensive.
    keep_idx: Set[int], optional (default=``None``)
        If not None, only yield instances whose index is in this set.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    # Reseed for reproducibility
    self._reseed(seed=self._seed)
    index = 0
    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        text_sentences: List[List[str]] = [s.words for s in sentences]
        flattened_text_sentences: List[str] = [
            self._normalize_word(word)
            for text_sentence in text_sentences
            for word in text_sentence
        ]
        sentence_arc_indices: List[Tuple[int, int]] = []
        sentence_labels: List[str] = []
        # Filter the clusters to only have single-token entities
        # TODO(nfliu): How do we handle spans here?
        filtered_clusters = filter_clusters(canonical_clusters, max_span_size=1)
        # Check if there are at least two clusters, each of which has at least
        # 2 different items. If not, then skip creating examples from this passage.
        counter = 0
        all_cluster_words = []
        all_cluster_unique_words = []
        for cluster in filtered_clusters:
            # Get the words that show up in the cluster
            cluster_words = list(
                tuple(flattened_text_sentences[index]
                      for index in range(item[0], item[1] + 1))
                for item in cluster)
            all_cluster_words.append(cluster_words)
            cluster_unique_words = set(cluster_words)
            all_cluster_unique_words.append(cluster_unique_words)
            if len(set(cluster_words)) >= 2:
                counter += 1
        if counter < 2:
            continue
        if keep_idx is not None and index not in keep_idx:
            index += 1
            continue
        if count_only:
            yield 1
            continue
        # Contextualize the tokens if a Contextualizer was provided.
        # TODO (nfliu): How can we make this batched?
        # Would make contextualizers that use the GPU much faster.
        if self._contextualizer:
            token_representations = self._contextualizer([flattened_text_sentences])[0]
        else:
            token_representations = None
        # For each cluster with 2+ different items, make positive examples between
        # each of the different items that are different strings and make negative
        # examples between each of the different items and a random token from
        # another cluster.
        assert ((len(filtered_clusters) == len(all_cluster_words)) &
                (len(all_cluster_words) == len(all_cluster_unique_words)))
        for cluster_index, (cluster_spans, cluster_words, cluster_unique_words) in enumerate(
                zip(filtered_clusters, all_cluster_words, all_cluster_unique_words)):
            # Don't make examples from this if there is only 1 unique item.
            if len(cluster_unique_words) < 2:
                continue
            # Get all combinations of cluster spans (a, b), where a occurs
            # in the text before b.
            all_coreferring_spans = []
            for parent_cluster_span in cluster_spans:
                for child_cluster_span in cluster_spans:
                    # Skip child_cluster_span if it occurs before the parent_span.
                    # TODO (nfliu): this is single-word specific
                    if child_cluster_span[0] < parent_cluster_span[0]:
                        continue
                    # Skip this (child_cluster_span, parent_cluster_span) pair
                    # if the words are identical
                    if (flattened_text_sentences[child_cluster_span[0]:child_cluster_span[1] + 1] ==
                            flattened_text_sentences[parent_cluster_span[0]:parent_cluster_span[1] + 1]):
                        continue
                    # Add to the set of coreference candidates
                    all_coreferring_spans.append((child_cluster_span, parent_cluster_span))
            # Take the coreference_candidates and generate positive and negative examples
            for (child_span, parent_span) in all_coreferring_spans:
                # TODO (nfliu): This is single-word specific, will have to change
                # if we generalize to spans
                sentence_arc_indices.append((child_span[0], parent_span[0]))
                sentence_labels.append("1")
                # Generate a negative example for the child.
                other_clusters = [
                    cluster for i, cluster in enumerate(filtered_clusters)
                    if i != cluster_index
                ]
                negative_coreferent = self._sample_negative_coreferent(other_clusters, child_span[0])
                if negative_coreferent:
                    sentence_arc_indices.append((child_span[0], negative_coreferent[0]))
                    sentence_labels.append("0")
        yield self.text_to_instance(
            tokens=flattened_text_sentences,
            arc_indices=sentence_arc_indices,
            token_representations=token_representations,
            labels=sentence_labels)
        index += 1
    return doc_str, spans, span_pairs


if __name__ == '__main__':
    parser = argparse.ArgumentParser('convert conll 2012 format into brat format')
    parser.add_argument('--inp', type=str, required=True, help='input dir')
    parser.add_argument('--out', type=str, required=True, help='output dir')
    args = parser.parse_args()

    print('reading coref instances from dataset files at: {}'.format(args.inp))
    avg_cluster_size = []
    ontonotes_reader = Ontonotes()
    for docid, doc in tqdm(enumerate(ontonotes_reader.dataset_document_iterator(args.inp))):
        docid += 1
        clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
        total_tokens = 0
        for sentence in doc:
            for typed_span in sentence.coref_spans:
                span_id, (start, end) = typed_span  # both start and end are inclusive
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)
        canonical_clusters = canonicalize_clusters(clusters)
        avg_cluster_size.extend([len(c) for c in canonical_clusters])
        doc_str, spans, span_pairs = cluster_to_brat([s.words for s in doc],
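As the trailing comment above notes, OntoNotes coref spans are inclusive on both ends, which is also why `ConllReader.get_coref_ents` further down adds one to the end index before slicing. A toy example of the arithmetic, with a made-up token list:

words = ["Barack", "Obama", "visited", "Paris", "."]
start, end = 0, 1                # inclusive token span from a coref annotation
mention = words[start:end + 1]   # Python slices exclude the end, so add 1 -> ["Barack", "Obama"]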
def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
    reader = Ontonotes()
    file_path = self.FIXTURES_ROOT / 'coref' / 'coref.gold_conll'
    documents = list(reader.dataset_document_iterator(file_path))
    assert len(documents) == 2
class ConllReader(Reader):
    def __init__(self):
        self.onto_reader = Ontonotes()

    def get_srl_ents(self, srl_frame):
        ents, rels = [], []
        begin = False
        arg = None
        start, end = -1, -1
        for idx, tag in enumerate(srl_frame):
            if tag.startswith("B") and not begin:
                begin = True
                start = idx
                arg = tag[2:]
            elif begin and tag.startswith("I"):
                assert arg == tag[2:], (arg, tag[2:])
                continue
            elif begin:
                if tag == "O":
                    begin = False
                    end = idx
                    ents.append((start, end))
                    rels.append((arg, (start, end)))
                    start = end = -1
                elif tag.startswith("B"):
                    end = idx
                    ents.append((start, end))
                    rels.append((arg, (start, end)))
                    start = idx
                    end = -1
                    arg = tag[2:]
        if len(rels) <= 1:
            return ents, []
        final_rels, verb_span = [], None
        for tag, mention in rels:
            if tag == "V":
                verb_span = mention
                break
        if not verb_span:
            # we did not find a verb
            return ents, final_rels
        for tag, mention in rels:
            if tag != "V":
                final_rels.append((tag, ("head", verb_span), ("dep", mention)))
        return ents, final_rels

    def get_coref_ents(self, cc):
        ents, clusters, rels = [], defaultdict(list), []
        for idx, mention in cc:
            mention = (
                mention[0],
                mention[1] + 1,
            )  # we add one to maintain python slicing
            ents.append(mention)
            clusters[idx].append(mention)
        for cluster in clusters.values():
            if len(cluster) <= 1:
                continue
            ant = cluster[0]
            for mention in cluster[1:]:
                rels.append(("COREF", ("head", ant), ("dep", mention)))
        return ents, rels

    def w2c(self, text, ents):
        doc = nlp(text)
        w2c = dict()
        for word in doc:
            w2c[word.i] = (word.idx, word.idx + len(word))
        cents = {ent: (w2c[ent[0]][0], w2c[ent[1] - 1][1]) for ent in ents}
        return w2c, cents

    def make_ter(self, sent):
        text = " ".join(sent.words)
        # one sentence can have multiple srl annotations
        # depending on how many verbs it has
        to_return = []
        for _, frame in sent.srl_frames:
            ents, rels = [], []
            e_srl, r_srl = self.get_srl_ents(frame)
            e_coref, r_coref = self.get_coref_ents(sent.coref_spans)
            ents.extend(e_srl + e_coref)
            rels.extend(r_srl + r_coref)
            _, cents = self.w2c(text, set(ents))
            ent_dict = {
                k: {
                    "id": "E" + str(n),
                    "cid": v
                }
                for n, (k, v) in enumerate(cents.items())
            }
            ents = [["E" + str(n), "", [list(v)]]
                    for n, (_, v) in enumerate(cents.items())]
            for idx, rel in enumerate(rels):
                (tag, (head, e1), (dep, e2)) = rel
                rels[idx] = [
                    "R" + str(idx),
                    tag,
                    [[head, ent_dict[e1]["id"]], [dep, ent_dict[e2]["id"]]],
                ]
            to_return.append([text, ents, rels])
        return to_return

    def read(self, fpath: str):
        for doc in self.onto_reader.dataset_document_iterator(fpath):
            for sent in doc:
                to_return = self.make_ter(sent)
                for text, ents, rels in to_return:
                    yield text, ents, rels
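For completeness, one plausible way to drive `ConllReader` end to end is sketched below. The fixture path is reused from the tests above, and the sketch assumes the surrounding module already provides the `Reader` base class and the `nlp` pipeline (presumably a loaded spaCy model) used in `w2c`.

reader = ConllReader()
for text, ents, rels in reader.read('tests/fixtures/coref/coref.gold_conll'):
    print(text)   # the raw sentence text
    print(ents)   # brat-style entities, e.g. ["E0", "", [[char_start, char_end]]]
    print(rels)   # brat-style relations, e.g. ["R0", tag, [["head", "E0"], ["dep", "E1"]]]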