def instance_stream(
    file_path: str,
    tokenizer: PretrainedTransformerTokenizer,
    token_indexers: Dict[str, PretrainedTransformerIndexer],
    model_input_size: int = 512,
    normalize: bool = False,
) -> Generator[Instance, None, None]:
    """Yield one `Instance` per sentence segment for each matching file in a directory.

    # Parameters

    file_path : `str`
        Directory whose entries are scanned. Only entries whose name contains
        `args.file_suffix` (a module-level setting) are read.
    tokenizer : `PretrainedTransformerTokenizer`
        Used to wordpiece-tokenize every sentence via `intra_word_tokenize`.
    token_indexers : `Dict[str, PretrainedTransformerIndexer]`
        Passed through unchanged to `construct_instance`.
    model_input_size : `int`, optional (default = `512`)
        Sentences with more wordpieces than this are split with
        `segment_long_sentence`.
    normalize : `bool`, optional (default = `False`)
        If true, every token is passed through `normalize_token` first.

    # Returns

    A generator of the instances built by `construct_instance`. Each instance is
    keyed by `[doc_key, sentence index]` and carries its segment number.
    """
    for entry in os.listdir(file_path):
        # Skip anything that does not carry the configured suffix ('.json', '.comm').
        if args.file_suffix not in entry:
            continue

        full_path = os.path.join(file_path, entry)
        if args.file_suffix == '.json':
            with open(full_path) as handle:
                payload = json.load(handle)
            sentences = payload['sentences']
            doc_key = payload['doc_key']
        else:
            document = CementDocument.from_communication_file(file_path=full_path)
            sentences = list(document.iterate_sentences())
            doc_key = str(document.comm.id)

        if normalize:
            sentences = [[normalize_token(tok) for tok in words] for words in sentences]

        # Tokenize the whole document first; each entry is
        # (list of wordpiece segments, per-word offsets, raw sentence).
        prepared: List[Tuple[List[List[Token]], List[Tuple[int, int]], List[str]]] = []
        for words in sentences:
            wordpieces, word_offsets = tokenizer.intra_word_tokenize(words)
            if len(wordpieces) > model_input_size:
                logger.info('Segmented long sentence.')
                segments = segment_long_sentence(wordpieces, model_input_size)
            else:
                segments = [wordpieces]
            prepared.append((segments, word_offsets, words))

        for sent_id, (segments, word_offsets, words) in enumerate(prepared):
            for segment_id, segment_tokens in enumerate(segments):
                yield construct_instance(
                    tokens=segment_tokens,
                    offsets=word_offsets,
                    key=[doc_key, str(sent_id)],
                    segment=segment_id,
                    raw_sentence=words,
                    token_indexers=token_indexers,
                )
def test_intra_word_tokenize(self):
    """Check wordpieces and per-word offsets for a single sentence and a sentence pair."""
    tokenizer = PretrainedTransformerTokenizer("bert-base-cased")

    # Single sentence: each offset is the inclusive wordpiece range of one input word.
    words = "A, [MASK] AllenNLP sentence.".split(" ")
    wordpieces, word_offsets = tokenizer.intra_word_tokenize(words)
    wordpiece_texts = [piece.text for piece in wordpieces]
    assert wordpiece_texts == [
        "[CLS]",
        "A",
        ",",
        "[MASK]",
        "Allen",
        "##NL",
        "##P",
        "sentence",
        ".",
        "[SEP]",
    ]
    assert word_offsets == [(1, 2), (3, 3), (4, 6), (7, 8)]

    # Sentence pair: the second sentence's offsets continue after the first [SEP].
    first_words = "A, [MASK] AllenNLP sentence.".split(" ")
    second_words = "A sentence.".split(" ")
    wordpieces, first_offsets, second_offsets = tokenizer.intra_word_tokenize_sentence_pair(
        first_words, second_words
    )
    wordpiece_texts = [piece.text for piece in wordpieces]
    assert wordpiece_texts == [
        "[CLS]",
        "A",
        ",",
        "[MASK]",
        "Allen",
        "##NL",
        "##P",
        "sentence",
        ".",
        "[SEP]",
        "A",
        "sentence",
        ".",
        "[SEP]",
    ]
    assert first_offsets == [(1, 2), (3, 3), (4, 6), (7, 8)]
    assert second_offsets == [(10, 10), (11, 12)]
def test_intra_word_tokenize_whitespaces(self):
    """Check that words yielding no wordpieces get a `None` offset."""
    tokenizer = PretrainedTransformerTokenizer("bert-base-cased")

    # The " " and "\u007f" entries are expected to produce no wordpieces at all.
    words = ["A,", " ", "[MASK]", "AllenNLP", "\u007f", "sentence."]
    wordpieces, word_offsets = tokenizer.intra_word_tokenize(words)
    wordpiece_texts = [piece.text for piece in wordpieces]

    assert wordpiece_texts == [
        "[CLS]",
        "A",
        ",",
        "[MASK]",
        "Allen",
        "##NL",
        "##P",
        "sentence",
        ".",
        "[SEP]",
    ]
    assert word_offsets == [(1, 2), None, (3, 3), (4, 6), None, (7, 8)]
def make_coref_instance(
    sentences: List[List[str]],
    token_indexers: Dict[str, TokenIndexer],
    max_span_width: int,
    gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    wordpiece_modeling_tokenizer: PretrainedTransformerTokenizer = None,
    max_sentences: int = None,
) -> Instance:
    """
    Build a coreference `Instance` from a tokenized document.

    # Parameters

    sentences : `List[List[str]]`, required.
        The document as a list of sentences, each a list of word strings.
    token_indexers : `Dict[str, TokenIndexer]`
        Indexers attached to the document `TextField`.
    max_span_width : `int`, required.
        The maximum width of candidate spans to consider.
    gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = `None`)
        Gold clusters as word-level spans with document-absolute indices. They are
        passed through `_canonicalize_clusters` before labels are assigned.
    wordpiece_modeling_tokenizer : `PretrainedTransformerTokenizer`, optional (default = `None`)
        If not `None`, the words are split into wordpieces with this tokenizer and
        all spans and cluster mentions are re-expressed in wordpiece indices. If
        `None`, modeling stays at the word level.
    max_sentences : `int`, optional (default = `None`)
        The maximum number of sentences to keep. By default all are kept. When the
        document is truncated, mentions ending past the kept text are dropped, as
        are clusters left empty by that.

    # Returns

    An `Instance` containing the following `Fields`:

    text : `TextField`
        The text of the full document.
    spans : `ListField[SpanField]`
        The candidate spans, indexed with respect to `text`.
    span_labels : `SequenceLabelField`, optional
        Present only when `gold_clusters` is given. The cluster id of each span in
        `spans`, or -1 for spans that belong to no cluster.
    metadata : `MetadataField`
        Holds `"original_text"` and, when gold clusters are given, `"clusters"`.
    """
    if max_sentences is not None and len(sentences) > max_sentences:
        sentences = sentences[:max_sentences]
        kept_word_count = sum(len(sentence) for sentence in sentences)

        if gold_clusters is not None:
            # Keep only mentions that end inside the retained text, then drop
            # clusters that lost all of their mentions.
            trimmed_clusters = [
                [mention for mention in cluster if mention[1] < kept_word_count]
                for cluster in gold_clusters
            ]
            gold_clusters = [cluster for cluster in trimmed_clusters if cluster]

    flattened_sentences = [_normalize_word(word) for sentence in sentences for word in sentence]

    use_wordpieces = wordpiece_modeling_tokenizer is not None
    if use_wordpieces:
        flat_sentences_tokens, offsets = wordpiece_modeling_tokenizer.intra_word_tokenize(
            flattened_sentences
        )
        flattened_sentences = [token.text for token in flat_sentences_tokens]
    else:
        flat_sentences_tokens = [Token(word) for word in flattened_sentences]

    text_field = TextField(flat_sentences_tokens, token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        gold_clusters = _canonicalize_clusters(gold_clusters)
        if use_wordpieces:
            # Rewrite every mention in place from word indices to wordpiece indices.
            for cluster in gold_clusters:
                for position, (word_start, word_end) in enumerate(
                    (mention[0], mention[1]) for mention in list(cluster)
                ):
                    cluster[position] = (offsets[word_start][0], offsets[word_end][1])

        cluster_dict = {
            tuple(mention): cluster_id
            for cluster_id, cluster in enumerate(gold_clusters)
            for mention in cluster
        }

    spans: List[Field] = []
    span_labels: Optional[List[int]] = None
    if gold_clusters is not None:
        span_labels = []

    sentence_offset = 0
    for sentence in sentences:
        candidates = enumerate_spans(
            sentence, offset=sentence_offset, max_span_width=max_span_width
        )
        for start, end in candidates:
            if use_wordpieces:
                start = offsets[start][0]
                end = offsets[end][1]

                # `enumerate_spans` limits width in words, but the span width
                # embedding has only `max_span_width` entries and operates on
                # wordpiece lengths, so the limit is re-applied here. Spans that
                # would include special tokens are skipped as well.
                if (
                    end - start + 1 > max_span_width
                    or start < wordpiece_modeling_tokenizer.num_added_start_tokens
                    or end
                    >= len(flat_sentences_tokens)
                    - wordpiece_modeling_tokenizer.num_added_end_tokens
                ):
                    continue

            if span_labels is not None:
                span_labels.append(cluster_dict.get((start, end), -1))

            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)

    metadata: Dict[str, Any] = {"original_text": flattened_sentences}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": MetadataField(metadata),
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
def make_coref_instance(
    sentences: List[List[str]],
    token_indexers: Dict[str, TokenIndexer],
    max_span_width: int,
    document_id: Optional[str] = None,
    words: List[str] = None,
    gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    srl_frames: Optional[List[Tuple[int, List[Tuple[int, int, str]]]]] = None,
    include_srl: bool = False,
    named_entities: Optional[List[str]] = None,
    named_entity_spans: Optional[List[Tuple[int, int, str]]] = None,
    include_ner: bool = False,
    include_coref: bool = True,
    wordpiece_modeling_tokenizer: PretrainedTransformerTokenizer = None,
    max_sentences: int = None,
    remove_singleton_clusters: bool = True,
    span_label_map: Dict[Tuple[int, int], str] = None,
    language: str = None,
    sentence_objects: List[OntonotesSentence] = None,
    parallel_sentences: List[List[str]] = None,
) -> Optional[Instance]:
    """
    Build a multi-task (coreference / SRL / NER) `Instance` from a tokenized document.

    # Parameters

    sentences : `List[List[str]]`, required.
        A list of lists representing the tokenised words and sentences in the document.
    token_indexers : `Dict[str, TokenIndexer]`
        This is used to index the words in the document. See :class:`TokenIndexer`.
    max_span_width : `int`, required.
        The maximum width of candidate spans to consider.
    document_id : `Optional[str]`, optional (default = `None`)
        Stored verbatim in the metadata under `"document_id"`.
    words : `List[str]`, optional (default = `None`)
        If given, replaces the flattened word list derived from `sentences` as the
        document text. Span enumeration still follows `sentences`.
    gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = `None`)
        A list of all clusters in the document, represented as word spans with absolute
        indices in the entire document. Each cluster contains some number of spans, which can
        be nested and overlap. If there are exact matches between clusters, they will be
        resolved using `_canonicalize_clusters`.
    srl_frames : `Optional[List[Tuple[int, List[Tuple[int, int, str]]]]]`, optional (default = `None`)
        One `(predicate_index, [(start, end, arg_type), ...])` entry per predicate.
    include_srl : `bool`, optional (default = `False`)
        Whether to add the SRL fields (requires `srl_frames`).
    named_entities : `Optional[List[str]]`, optional (default = `None`)
        Per-word entity tags. `B-`/`I-` prefixes are assumed by the wordpiece
        conversion, which rewrites continuation pieces as `"I-" + tag[2:]`.
    named_entity_spans : `Optional[List[Tuple[int, int, str]]]`, optional (default = `None`)
        `(start, end, label)` triples used to label the candidate spans. When given,
        the sequence-style SRL fields are not added.
    include_ner : `bool`, optional (default = `False`)
        Whether to add the NER sequence labels (requires `named_entities`).
    include_coref : `bool`, optional (default = `True`)
        Whether to add the `span_labels` field.
    wordpiece_modeling_tokenizer: `PretrainedTransformerTokenizer`, optional (default = `None`)
        If not None, this dataset reader does subword tokenization using the supplied tokenizer
        and distribute the labels to the resulting wordpieces. All the modeling will be based on
        wordpieces. If `None` (default), the modeling will be on the word-level.
    max_sentences: `int`, optional (default = `None`)
        The maximum number of sentences in each document to keep. By default keeps all sentences.
    remove_singleton_clusters : `bool`, optional (default = `True`)
        Some datasets contain clusters that are singletons (i.e. no coreferents). This option
        allows the removal of them.
    span_label_map : `Dict[Tuple[int, int], str]`, optional (default = `None`)
        If given (and `gold_clusters` is given), each span's label is taken from this
        map instead of the cluster id, with `"O"` for spans not in the map.
    language : `str`, optional (default = `None`)
        Stored in the metadata. The value `"arabic"` additionally strips everything
        after the first `#` in each word and applies `clean_arabic_text`.
    sentence_objects : `List[OntonotesSentence]`, optional (default = `None`)
        Stored verbatim in the metadata under `"sentence_objects"`.
    parallel_sentences : `List[List[str]]`, optional (default = `None`)
        A second document, normalized like `sentences`, exposed as the
        `parallel_text` field.

    # Returns

    `None` if no candidate span survives, otherwise an `Instance` containing the
    following `Fields`:

    text : `TextField`
        The text of the full document.
    spans : `ListField[SpanField]`
        A ListField containing the spans represented as `SpanFields`
        with respect to the document text.
    span_labels : `SequenceLabelField`, optional
        The id of the cluster which each possible span belongs to, or -1 if it does
        not belong to a cluster (or the `span_label_map` label, see above). As these
        labels have variable length (it depends on how many spans we are considering),
        we represent this as a `SequenceLabelField` with respect to the spans `ListField`.
    metadata : `MetadataField`
        Text, sentence offsets, span index maps and any task-specific extras.

    plus, depending on the arguments, `parallel_text`, `srl_labels`,
    `srl_seq_labels`, `srl_seq_indices`, `srl_seq_predicates`, `word_span_mask`,
    `ner_seq_labels` and `ner_span_labels`.
    """
    if max_sentences is not None and len(sentences) > max_sentences:
        sentences = sentences[:max_sentences]
        total_length = sum(len(sentence) for sentence in sentences)

        if gold_clusters is not None:
            # Drop mentions that end past the truncated text, and clusters emptied by that.
            new_gold_clusters = []
            for cluster in gold_clusters:
                new_cluster = [mention for mention in cluster if mention[1] < total_length]
                if new_cluster:
                    new_gold_clusters.append(new_cluster)
            gold_clusters = new_gold_clusters

    flattened_sentences = [_normalize_word(word) for sentence in sentences for word in sentence]
    sentences = [[_normalize_word(word) for word in sentence] for sentence in sentences]
    if parallel_sentences is not None:
        parallel_sentences = [
            [_normalize_word(word) for word in sentence] for sentence in parallel_sentences
        ]
        flattened_parallel_sentences = [
            word for sentence in parallel_sentences for word in sentence
        ]
    if words is not None:
        flattened_sentences = [_normalize_word(word) for word in words]

    if language is not None and language == "arabic":
        flattened_sentences = [
            clean_arabic_text(word.split("#")[0]) for word in flattened_sentences
        ]
        sentences = [
            [clean_arabic_text(word.split("#")[0]) for word in sentence]
            for sentence in sentences
        ]
        if parallel_sentences is not None:
            parallel_sentences = [
                [clean_arabic_text(word.split("#")[0]) for word in sentence]
                for sentence in parallel_sentences
            ]
            flattened_parallel_sentences = [
                word for sentence in parallel_sentences for word in sentence
            ]

    if wordpiece_modeling_tokenizer is not None:
        flat_sentences_tokens, offsets = wordpiece_modeling_tokenizer.intra_word_tokenize(
            flattened_sentences
        )
        flattened_sentences = [t.text for t in flat_sentences_tokens]
        if parallel_sentences is not None:
            # Tokenize the *parallel* words, and discard their offsets: `offsets`
            # must keep describing the main document, because clusters, spans and
            # NER tags below are all mapped through it.
            flat_parallel_sentences_tokens, _ = wordpiece_modeling_tokenizer.intra_word_tokenize(
                flattened_parallel_sentences
            )
    else:
        flat_sentences_tokens = [Token(word) for word in flattened_sentences]
        if parallel_sentences is not None:
            flat_parallel_sentences_tokens = [
                Token(word) for word in flattened_parallel_sentences
            ]

    text_field = TextField(flat_sentences_tokens, token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        gold_clusters = _canonicalize_clusters(gold_clusters)
        if remove_singleton_clusters:
            gold_clusters = [cluster for cluster in gold_clusters if len(cluster) > 1]

        if wordpiece_modeling_tokenizer is not None:
            # Re-express every mention in wordpiece indices (in place).
            for cluster in gold_clusters:
                for mention_id, mention in enumerate(cluster):
                    start = offsets[mention[0]][0]
                    end = offsets[mention[1]][1]
                    cluster[mention_id] = (start, end)

        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    # (start, end) -> position of that span in `spans`.
    span_index_map: Dict[Tuple[int, int], int] = {}
    # word index -> indices (into `spans`) of all spans from the same sentence.
    token_same_sentence_spans: Dict[int, List[int]] = {}
    # word index -> (first, last) word index of the enclosing sentence.
    token_sentence_start_end_map: Dict[int, Tuple[int, int]] = {}
    # sentence index -> the (start, end) spans generated for that sentence.
    sentence_index_span_map: Dict[int, List[Tuple[int, int]]] = {}
    span_labels: Optional[List[Union[int, str]]] = [] if gold_clusters is not None else None

    last_token_index = len(flat_sentences_tokens) - 1
    sentence_offset = 0
    sentence_offsets = []
    for sent_index, sentence in enumerate(sentences):
        sentence_spans = []
        sentence_index_span_map[sent_index] = []
        for start, end in enumerate_spans(
            sentence, offset=sentence_offset, max_span_width=max_span_width
        ):
            if wordpiece_modeling_tokenizer is not None:
                start = offsets[start][0]
                end = offsets[end][1]

                # `enumerate_spans` uses word-level width limit; here we apply it to wordpieces
                # We have to do this check here because we use a span width embedding that has
                # only `max_span_width` entries, and since we are doing wordpiece
                # modeling, the span width embedding operates on wordpiece lengths. So a check
                # here is necessary or else we wouldn't know how many entries there would be.
                if end - start + 1 > max_span_width:
                    continue
                # We also don't generate spans that contain special tokens
                if start < len(wordpiece_modeling_tokenizer.single_sequence_start_tokens):
                    continue
                if end >= len(flat_sentences_tokens) - len(
                    wordpiece_modeling_tokenizer.single_sequence_end_tokens
                ):
                    continue

            # Spans reaching past the text (possible when `words` is shorter than
            # `sentences`) are skipped *before* a label is recorded, so that
            # `span_labels` and `spans` always stay the same length.
            if end > last_token_index:
                continue

            if span_labels is not None:
                label: Union[int, str] = cluster_dict.get((start, end), -1)
                if span_label_map is not None:
                    label = span_label_map.get((start, end), "O")
                span_labels.append(label)

            span_index_map[(start, end)] = len(spans)
            sentence_spans.append(len(spans))
            spans.append(SpanField(start, end, text_field))
            sentence_index_span_map[sent_index].append((start, end))

        sentence_end = sentence_offset + len(sentence) - 1
        for word_index in range(sentence_offset, sentence_offset + len(sentence)):
            token_same_sentence_spans[word_index] = sentence_spans
            token_sentence_start_end_map[word_index] = (sentence_offset, sentence_end)
        sentence_offsets.append(sentence_offset)
        sentence_offset += len(sentence)

    if not spans:
        return None

    span_field = ListField(spans)

    # NOTE: `metadata` is wrapped below but mutated afterwards; the later keys are
    # visible through the field only because the same dict object is shared.
    metadata: Dict[str, Any] = {
        "original_text": flattened_sentences,
        "sentence_offsets": sentence_offsets,
        "sentences": sentences,
        "sentence_index_span_map": sentence_index_span_map,
        "span_index_map": span_index_map,
    }
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters
    if language is not None:
        metadata["language"] = language
    if sentence_objects is not None:
        metadata["sentence_objects"] = sentence_objects
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None and include_coref:
        fields["span_labels"] = SequenceLabelField(
            span_labels, span_field, label_namespace="span_labels"
        )
    if parallel_sentences is not None:
        fields["parallel_text"] = TextField(flat_parallel_sentences_tokens, token_indexers)

    if include_srl and srl_frames is not None:
        # --- Span-based SRL: (predicate word, argument span) pairs -------------
        predicate_span_pairs = []
        pair_labels = []
        filtered_srl_frames = []
        for predicate_index, arguments in srl_frames:
            filtered_arguments = []
            covered_spans = set()
            for (start, end, arg_type) in arguments:
                if (start, end) in span_index_map and (start, end) not in covered_spans:
                    # The predicate itself is not an argument of its own frame.
                    if start == predicate_index == end:
                        continue
                    predicate_span_pairs.append(
                        (predicate_index, span_index_map[(start, end)])
                    )
                    pair_labels.append(arg_type)
                    filtered_arguments.append((start, end, arg_type))
                    covered_spans.add((start, end))
            arguments_without_predicate = [arg for arg in arguments if arg[-1] != "V"]
            if len(arguments_without_predicate) > 0:
                filtered_srl_frames.append((predicate_index, arguments_without_predicate))
            # Diagnostic output kept from the original implementation:
            # duplicated argument spans, and frames dropped by the filter above.
            if len({arg[:2] for arg in arguments}) < len(arguments):
                print(predicate_index, arguments)
                print(flattened_sentences)
            if (
                len(filtered_srl_frames) < len(srl_frames)
                and predicate_index == srl_frames[-1][0]
            ):
                print(predicate_index, arguments, filtered_arguments)
                print('B', srl_frames, filtered_srl_frames)
                print(flattened_sentences)
        fields["srl_labels"] = AsymmetricAdjacencyField(
            predicate_span_pairs,
            text_field,
            span_field,
            labels=pair_labels,
            label_namespace="srl_labels",
        )

        # --- Sequence-style SRL: one BIO-tagged sentence per predicate ---------
        srl_seq_label_fields = []
        srl_seq_labels = []
        srl_seq_indices = []
        srl_seq_words = []
        predicate_indices = []
        max_seq_length = 0
        for predicate_index, arguments in srl_frames:
            if predicate_index >= len(flat_sentences_tokens):
                continue
            sentence_start, sentence_end = token_sentence_start_end_map[predicate_index]
            seq_labels = ["O" for _ in range(sentence_start, sentence_end + 1)]
            seq_labels[predicate_index - sentence_start] = "B-V"
            for (start, end, arg_type) in arguments:
                # Skip arguments overlapping anything that is already tagged.
                if any(
                    seq_labels[idx - sentence_start] != "O" for idx in range(start, end + 1)
                ):
                    continue
                seq_labels[start - sentence_start] = "B-" + arg_type
                for i in range(start + 1, end + 1):
                    seq_labels[i - sentence_start] = "I-" + arg_type
            srl_seq_indices.append(list(range(sentence_start, sentence_end + 1)))
            sentence_field = TextField(
                flat_sentences_tokens[sentence_start:sentence_end + 1], token_indexers
            )
            seq_label_field = SequenceLabelField(
                seq_labels, sentence_field, label_namespace="srl_seq_labels"
            )
            srl_seq_label_fields.append(seq_label_field)
            predicate_indices.append(predicate_index)
            srl_seq_labels.append(seq_labels)
            srl_seq_words.append(list(flattened_sentences[sentence_start:sentence_end + 1]))
            max_seq_length = max(max_seq_length, sentence_end + 1 - sentence_start)

        if len(srl_seq_label_fields) > 0 and named_entity_spans is None:
            fields["srl_seq_labels"] = ListField(srl_seq_label_fields)
            # Right-pad every index row with -1 so the rows form a rectangular array.
            srl_seq_indices = [
                seq + [-1 for _ in range(max_seq_length - len(seq))] for seq in srl_seq_indices
            ]
            fields["srl_seq_indices"] = ArrayField(
                np.array(srl_seq_indices, dtype=np.int64), dtype=np.int64, padding_value=-1
            )
            fields["srl_seq_predicates"] = ArrayField(
                np.array(predicate_indices, dtype=np.int64), dtype=np.int64, padding_value=-1
            )
        metadata["srl_seq_labels"] = srl_seq_labels
        metadata["srl_seq_words"] = srl_seq_words
        metadata["srl_frames"] = filtered_srl_frames

        # Pair every token with all spans of its own sentence.
        # NOTE(review): the lookup table is keyed by word index, so this presumably
        # expects word-level modeling (one token per word) -- confirm.
        word_span_coincidence = []
        for token in range(len(flat_sentences_tokens)):
            for span_index in token_same_sentence_spans[token]:
                word_span_coincidence.append((token, span_index))
        fields["word_span_mask"] = AsymmetricAdjacencyField(
            word_span_coincidence, text_field, span_field, padding_value=0
        )

    if include_ner and named_entities is not None:
        # Collapse tag types that are not modeled separately.
        remap = {
            "B-OTHER": "O",
            "I-OTHER": "O",
            "B-NUMBER": "B-QUANTITY",
            "I-NUMBER": "I-QUANTITY",
        }
        named_entities = [ent if ent not in remap else remap[ent] for ent in named_entities]
        if wordpiece_modeling_tokenizer is not None:
            # Spread each word-level tag over the word's wordpieces: the first piece
            # keeps the tag, the remaining pieces become "I-<type>".
            converted_named_entities = ["O" for _ in flat_sentences_tokens]
            for index, ne in enumerate(named_entities):
                if ne != "O":
                    converted_named_entities[offsets[index][0]] = ne
                    for i in range(offsets[index][0] + 1, offsets[index][1] + 1):
                        converted_named_entities[i] = "I-" + ne[2:]
            named_entities = converted_named_entities
        fields["ner_seq_labels"] = SequenceLabelField(
            named_entities[:len(flat_sentences_tokens)],
            text_field,
            label_namespace="ner_seq_labels",
        )
        metadata["ner_seq_labels"] = named_entities[:len(flat_sentences_tokens)]

    if named_entity_spans is not None:
        ner_span_label_map = {(start, end): label for (start, end, label) in named_entity_spans}
        ner_span_labels = [None for _ in span_index_map]
        for span in span_index_map:
            if span in ner_span_label_map:
                ner_span_labels[span_index_map[span]] = ner_span_label_map[span]
            else:
                ner_span_labels[span_index_map[span]] = "None"
        fields["ner_span_labels"] = SequenceLabelField(
            ner_span_labels, span_field, label_namespace="ner_span_labels"
        )

    metadata["document_id"] = document_id
    return Instance(fields)