def ensure_batch_predictions_are_consistent(self):
    """
    Check that the model produces the same outputs for an instance whether it is
    run on its own or as part of the full batch of ``self.instances``.

    The ``'loss'`` output is skipped; tensor outputs are compared with an absolute
    tolerance of ``1e-6`` and every other output is compared with ``==``.
    """
    self.model.eval()

    def _run_model(instances):
        # Pad the given instances to their own maximum lengths and run the model.
        dataset = Batch(instances)
        return self.model(**dataset.as_tensor_dict(dataset.get_padding_lengths()))

    per_instance_outputs = [_run_model([instance]) for instance in self.instances]
    batch_outputs = _run_model(self.instances)

    atol = 1e-6
    for index, outputs in enumerate(per_instance_outputs):
        for key, single_value in outputs.items():
            if key == 'loss':
                # The loss is too unstable to compare; agreement on everything
                # else is considered sufficient.
                continue
            single_value = single_value[0]
            batch_value = batch_outputs[key][index]
            if not isinstance(single_value, torch.Tensor):
                assert single_value == batch_value, key
                continue
            if single_value.size() != batch_value.size():
                # Crop the batched output down to the shape of the single output
                # before comparing.
                crop = tuple(slice(0, size) for size in single_value.size())
                batch_value = batch_value[crop]
            assert_allclose(single_value.data.numpy(),
                            batch_value.data.numpy(),
                            atol=atol,
                            err_msg=key)
def ensure_batch_predictions_are_consistent(self):
    """
    Check that the model produces the same outputs for an instance whether it is
    run on its own or as part of the full batch of ``self.instances``.

    Tensors are built with ``for_training=False``.  The ``'loss'`` output is
    skipped; ``Variable`` outputs are compared with an absolute tolerance of
    ``1e-6`` and every other output is compared with ``==``.
    """
    self.model.eval()

    def _run_model(instances):
        # Pad the given instances to their own maximum lengths and run the model.
        dataset = Batch(instances)
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
        return self.model(**tensors)

    per_instance_outputs = [_run_model([instance]) for instance in self.instances]
    batch_outputs = _run_model(self.instances)

    atol = 1e-6
    for index, outputs in enumerate(per_instance_outputs):
        for key, single_value in outputs.items():
            if key == 'loss':
                # The loss is too unstable to compare; agreement on everything
                # else is considered sufficient.
                continue
            single_value = single_value[0]
            batch_value = batch_outputs[key][index]
            if not isinstance(single_value, torch.autograd.Variable):
                assert single_value == batch_value, key
                continue
            if single_value.size() != batch_value.size():
                # Crop the batched output down to the shape of the single output
                # before comparing.
                crop = tuple(slice(0, size) for size in single_value.size())
                batch_value = batch_value[crop]
            assert_allclose(single_value.data.numpy(),
                            batch_value.data.numpy(),
                            atol=atol,
                            err_msg=key)
def test_offsets_with_tokenized_text_base(self, transformer_name):
    """Index two whitespace-tokenized sentences and check one offset per token."""
    token_indexer = TransformerIndexer(model_name=transformer_name,
                                       do_lowercase=False)
    sentences = (
        "the quickest quick brown fox jumped over the lazy dog",
        "the quick brown fox jumped over the laziest lazy elmo",
    )
    token_lists = [[Token(word) for word in sentence.split()]
                   for sentence in sentences]
    vocab = Vocabulary()
    instances = [
        Instance({"tokens": TextField(words, {"transformer": token_indexer})})
        for words in token_lists
    ]

    batch = Batch(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    offsets = tensor_dict["tokens"]['transformer-offsets']

    # Each token should be represented by exactly one sub-word, so there must be
    # as many offsets as tokens.
    for row, words in enumerate(token_lists):
        assert len(offsets[row]) == len(words)
def test_encode_decode_with_raw_text_base(self, transformer_name):
    """Index two raw sentences (one ``Token`` each) and check they decode back unchanged."""
    token_indexer = TransformerIndexer(model_name=transformer_name,
                                       do_lowercase=False)
    sent0 = "the quickest quick brown fox jumped over the lazy dog"
    sent1 = "the quick brown fox jumped over the laziest lazy elmo"
    raw_sentences = (sent0, sent1)
    vocab = Vocabulary()
    instances = [
        Instance({"tokens": TextField([Token(sentence)], {"transformer": token_indexer})})
        for sentence in raw_sentences
    ]

    batch = Batch(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    input_ids = tensor_dict["tokens"]['transformer']

    # Decoding the indexed ids (without special tokens) should give back the
    # original sentence.
    for row, sentence in enumerate(raw_sentences):
        piece_ids = [piece.item() for piece in input_ids[row]]
        assert sentence == token_indexer.tokenizer.decode(
            piece_ids, skip_special_tokens=True)
def forward(self, tree: Tree, label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    """
    Run ``self.biattentive_cell`` over every phrase collected from ``tree``.

    Parameters
    ----------
    tree : ``Tree``
        Passed to ``self.collect_phase`` to gather the phrases to encode.
    label : ``torch.LongTensor``, optional (default=None)
        Not used by this method.

    Returns
    -------
    Dict[str, torch.Tensor]
        The cell's output dict, with ``'weight'``, ``'mu'`` and ``'var'`` reshaped
        and every entry re-ordered along its first dimension.
    """
    phrases = []
    self.collect_phase(tree, phrases)

    # Turn every phrase into an instance, then sort by padding length.
    unsorted_instances = [self.text_to_instance(phrase) for phrase in phrases]
    idx, instances = sort_by_padding(unsorted_instances,
                                     [("tokens", "num_tokens")],
                                     self.vocab)

    batch = Batch(instances)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    # NOTE(review): the target device is hard-coded to 0.
    tensor_dict = move_to_device(tensor_dict, 0)
    output = self.biattentive_cell(**tensor_dict)

    # 'weight' comes back as a 2-D tensor; split its second dimension into
    # (labels, component_num) and give 'mu' / 'var' an extra gaussian_dim axis.
    batch_size, flat_width = output['weight'].size()
    num_labels = flat_width // self.component_num
    mixture_shape = (batch_size, num_labels, self.component_num)
    output['weight'] = output['weight'].reshape(*mixture_shape)
    output['mu'] = output['mu'].reshape(*mixture_shape, self.gaussian_dim)
    output['var'] = output['var'].reshape(*mixture_shape, self.gaussian_dim)

    # Invert the permutation in ``idx`` -- presumably this restores the
    # pre-sorting order of the phrases; verify against ``sort_by_padding``.
    restore = list(range(len(instances)))
    for sorted_pos, original_pos in enumerate(idx):
        restore[original_pos] = sorted_pos
    for name, tensor in output.items():
        output[name] = torch.stack([tensor[i] for i in restore])
    return output
def remove_tokens(self, attentions, metadata, threshold, labels):
    """
    Rebuild the batch after dropping the highest-attention tokens of each sentence.

    For every sentence, ``ceil(threshold * number_of_removable_tokens)`` tokens
    with the largest attention are dropped.  Tokens whose ``always_keep_mask``
    entry is 1 are never dropped.

    Parameters
    ----------
    attentions :
        Tensor of attention scores, indexed as ``attentions[b][token]``.
    metadata :
        One dict per sentence providing ``"tokens"`` and ``'always_keep_mask'``;
        ``metadata[0]["convert_tokens_to_instance"]`` is used to build instances.
    threshold :
        Fraction of the removable tokens to drop.
    labels :
        Not used by this method.

    Returns
    -------
    dict
        The ``"document"`` tensors of the re-indexed batch, moved to
        ``attentions.device``.
    """
    attentions_cpu = attentions.cpu().data.numpy()
    sentences = [meta["tokens"] for meta in metadata]
    instances = []
    for b in range(attentions_cpu.shape[0]):
        sentence = list(sentences[b])
        always_keep_mask = metadata[b]['always_keep_mask']
        # Push protected tokens to the bottom of the ranking so they are never
        # among the highest-attention tokens selected for removal.
        attn = attentions_cpu[b][:len(sentence)] + always_keep_mask * -10000
        max_length = math.ceil((1 - always_keep_mask).sum() * threshold)

        order = np.argsort(attn)
        # Keep everything except the ``max_length`` highest-scoring positions.
        # An explicit stop index is used because ``order[:-max_length]`` is
        # ``order[:0]`` (empty) when ``max_length == 0``, which would drop every
        # removable token instead of none.
        keep_count = max(len(order) - max_length, 0)
        top_ind = order[:keep_count]

        new_tokens = [
            x for i, x in enumerate(sentence)
            if i in top_ind or always_keep_mask[i] == 1
        ]
        instances += metadata[0]["convert_tokens_to_instance"](new_tokens, None)

    batch = Batch(instances)
    batch.index_instances(self._vocabulary)
    padding_lengths = batch.get_padding_lengths()
    batch = batch.as_tensor_dict(padding_lengths)
    return {
        k: v.to(attentions.device)
        for k, v in batch["document"].items()
    }
def test_forward(self):
    """Run the model's ``forward`` on a batch of ``self.instances`` and print the result."""
    batch_dialogues = Batch(self.instances)
    padding_lengths = batch_dialogues.get_padding_lengths()
    tensors = batch_dialogues.as_tensor_dict(padding_lengths)
    res = self.model.forward(**tensors)
    print(res)
def test_padding_for_equal_length_indices(self):
    """Index one sentence and check the wordpiece ids (with 16/17 at the ends) and offsets."""
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    # Expected wordpiece ids, word by word:
    #           2   3     5     6   8      9    2   14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    text_field = TextField(tokenizer.tokenize(sentence), {"bert": self.token_indexer})
    vocab = Vocabulary()

    batch = Batch([Instance({"tokens": text_field})])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]

    expected_ids = [[16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]]
    expected_offsets = [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
    assert indexed["bert"].tolist() == expected_ids
    assert indexed["bert-offsets"].tolist() == expected_offsets
def test_embeddings(self, transformer_name, gold_offsets: torch.LongTensor,
                    use_starting_offsets):
    """
    Check that embedding with offsets equals selecting ``gold_offsets`` from the
    full transformer output via ``get_select_embedding``.
    """
    self.token_indexer = TransformerIndexer(
        model_name=transformer_name,
        do_lowercase=False,
        use_starting_offsets=use_starting_offsets)
    self.transformer_embedder = TransformerEmbedder(
        model_name=transformer_name, trainable=False)

    sentences = (
        "the quickest quick brown fox jumped over the lazy dog",
        "the quick brown fox jumped over the laziest lazy elmo",
    )
    token_lists = [[Token(word) for word in sentence.split()]
                   for sentence in sentences]
    vocab = Vocabulary()
    instances = [
        Instance({"tokens": TextField(words, {"transformer": self.token_indexer})})
        for words in token_lists
    ]

    batch = Batch(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]
    input_ids = indexed['transformer']
    offsets = indexed['transformer-offsets']
    transformer_mask = indexed['transformer-mask']

    # Embeddings selected by the embedder itself, using the indexer's offsets.
    test_select_embeddings = self.transformer_embedder(input_ids, offsets, transformer_mask)
    # Embeddings for every wordpiece, from which the gold offsets are selected.
    transformer_vectors = self.transformer_embedder(token_ids=input_ids, mask=transformer_mask)
    gold_select_embeddings = get_select_embedding(transformer_vectors, gold_offsets)

    assert gold_select_embeddings.equal(test_select_embeddings)
def test_padding_for_equal_length_indices(self):
    """Index one sentence and check the wordpiece ids and zero-based offsets."""
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    # Expected wordpiece ids, word by word:
    #           2   3     5     6   8      9    2   14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    text_field = TextField(tokenizer.tokenize(sentence), {"bert": self.token_indexer})
    vocab = Vocabulary()

    batch = Batch([Instance({"tokens": text_field})])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]

    expected_ids = [[2, 3, 5, 6, 8, 9, 2, 14, 12]]
    expected_offsets = [[0, 1, 2, 3, 4, 5, 6, 7, 8]]
    assert indexed["bert"].tolist() == expected_ids
    assert indexed["bert-offsets"].tolist() == expected_offsets
def test_sliding_window_with_batch(self):
    """
    Embed a two-instance batch (one sentence, and the same sentence three times)
    with ``max_pieces=8`` and no truncation; only checks that a result is returned.
    """
    bert_fixtures = self.FIXTURES_ROOT / 'bert'
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown fox jumped over the lazy dog"
    words = tokenizer.tokenize(sentence)
    vocab = Vocabulary()

    token_indexer = PretrainedBertIndexer(str(bert_fixtures / 'vocab.txt'),
                                          truncate_long_sequences=False,
                                          max_pieces=8)
    config = BertConfig(str(bert_fixtures / 'config.json'))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    short_instance = Instance({"tokens": TextField(words, {"bert": token_indexer})})
    long_instance = Instance(
        {"tokens": TextField(words + words + words, {"bert": token_indexer})})

    batch = Batch([short_instance, long_instance])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]

    bert_vectors = token_embedder(indexed["bert"], offsets=indexed["bert-offsets"])
    assert bert_vectors is not None
def _regenerate_tokens(self, metadata, sample_z):
    """
    Rebuild the batch keeping only the tokens selected by ``sample_z``.

    For each sentence, the first token is always kept and every other token is
    kept when its mask value equals 1.  The surviving tokens are converted to
    instances with ``metadata[0]["convert_tokens_to_instance"]``, indexed with
    ``self._vocabulary`` and returned as the ``"document"`` tensors on
    ``sample_z.device``.
    """
    masks = sample_z.cpu().data.numpy()
    all_words = [meta["tokens"] for meta in metadata]

    assert len(all_words) == len(masks)
    assert max(len(words) for words in all_words) == masks.shape[1]

    instances = []
    new_tokens = []
    for words, mask, meta in zip(all_words, masks, metadata):
        # Drop the mask entries that correspond to padding.
        mask = mask[:len(words)]
        new_words = []
        for position, (word, keep) in enumerate(zip(words, mask)):
            if position == 0 or keep == 1:
                new_words.append(word)

        new_tokens.append(new_words)
        # NOTE(review): every metadata entry receives the same accumulating list
        # of all sentences' tokens rather than just its own ``new_words`` --
        # confirm this is intended.
        meta["new_tokens"] = new_tokens

        instances.extend(metadata[0]["convert_tokens_to_instance"](new_words, None))

    batch = Batch(instances)
    batch.index_instances(self._vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())
    return {k: v.to(sample_z.device) for k, v in tensors["document"].items()}
def predict(instances: List[Instance]) -> List[float]:
    """Output BERT NSP next sentence probability for a list of instances.

    Instances are processed in chunks of ``args.batch_size``.  The forward pass
    runs under ``torch.no_grad()`` because only the probabilities are needed, so
    no autograd graph is kept for each batch.

    Parameters
    ----------
    instances : List[Instance]

    Returns
    -------
    List[float]
        BERT NSP scores in range [0, 1].
    """
    # Loop invariants: the target device and the progress-bar length.
    device = torch.device(f'cuda:{GPU_ID}')
    num_batches = math.ceil(len(instances) / args.batch_size)

    scores = []
    with torch.no_grad():
        for batch_instance in tqdm(batch(instances, batch_size=args.batch_size),
                                   total=num_batches,
                                   desc='Predicting'):
            batch_ins = Batch(batch_instance)
            batch_ins.index_instances(VOCAB)
            tensor_dict = batch_ins.as_tensor_dict(batch_ins.get_padding_lengths())
            tokens = tensor_dict["tokens"]

            input_ids = tokens['bert'].to(device)
            token_type_ids = tokens['bert-type-ids'].to(device)
            # Positions whose id is 0 are masked out of attention.
            input_mask = (input_ids != 0).long()

            cls_out = BERT_NEXT_SENTENCE.forward(input_ids=input_ids,
                                                 token_type_ids=token_type_ids,
                                                 attention_mask=input_mask)
            probs = F.softmax(cls_out, dim=-1)
            # Column 0 of the softmax output is used as the score.
            scores.extend(probs[:, 0].detach().cpu().numpy().tolist())
    return scores
def instances_to_batch(instances, model, for_training, cuda_device=0):
    """
    Index ``instances`` with ``model.vocab`` and return them as a padded tensor dict.

    ``cuda_device`` and ``for_training`` are forwarded unchanged to
    ``Batch.as_tensor_dict``.
    """
    indexed_batch = Batch(instances)
    indexed_batch.index_instances(model.vocab)
    return indexed_batch.as_tensor_dict(indexed_batch.get_padding_lengths(),
                                        cuda_device=cuda_device,
                                        for_training=for_training)
def read_squad_allennlp(file_path):
    '''Read SQuAD data, build a vocabulary, and print the shapes of the padded batch.

    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    reader = SquadReader(token_indexers={
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars"),
    })
    instances = reader.read(file_path)

    # Show the question field of the first instance only.
    for instance in instances:
        first_question = instance.fields['question']
        print(first_question)
        print(type(first_question))
        break

    vocab = Vocabulary.from_instances(instances)
    # Index -> token mappings for the word and character namespaces.
    index_to_word = vocab.get_index_to_token_vocabulary("token_ids")
    index_to_char = vocab.get_index_to_token_vocabulary("token_chars")
    print(len(index_to_word))
    print(len(index_to_char))
    print(index_to_char)

    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)

    tensor_dict = batch.as_tensor_dict(padding_lengths)
    for field_name in ('passage', 'question'):
        print(tensor_dict[field_name]['tokens'].shape)
        print(tensor_dict[field_name]['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
def read_squad_word_char(file_path):
    """
    Read SQuAD data from ``file_path`` with word and character indexers, build a
    vocabulary, and print the vocabulary sizes and the shapes of the padded batch.
    """
    reader = SquadReader(token_indexers={
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars"),
    })
    instances = reader.read(file_path)

    vocab = Vocabulary.from_instances(instances)
    # Index -> token mappings for the word and character namespaces.
    index_to_word = vocab.get_index_to_token_vocabulary("token_ids")
    index_to_char = vocab.get_index_to_token_vocabulary("token_chars")
    print(len(index_to_word))
    print(len(index_to_char))
    print(index_to_char)

    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)

    tensor_dict = batch.as_tensor_dict(padding_lengths)
    for field_name in ('passage', 'question'):
        print(tensor_dict[field_name]['tokens'].shape)
        print(tensor_dict[field_name]['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
def test_read(self, lazy):
    """
    Read the ``dev.tsv`` fixture, index it as one batch and print the resulting
    BERT tensors for the first instance.  ``lazy`` is not used in the body.
    """
    bert_indexer = PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)
    reader = GLUESST2DatasetReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={'bert': bert_indexer},
        skip_label_indexing=False
    )
    instances = ensure_list(reader.read(str(self.FIXTURES_ROOT / 'dev.tsv')))

    # Inspect the first example before indexing.
    example = instances[0]
    words = [t.text for t in example.fields['tokens']]
    label = example.fields['label'].label
    print(label)
    print(words)

    batch = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]

    print(indexed['mask'].tolist()[0])
    print(indexed["bert"].tolist()[0])
    print([vocab.get_token_from_index(i, "bert")
           for i in indexed["bert"].tolist()[0]])
    print(len(indexed['bert'][0]))
    print(indexed["bert-offsets"].tolist()[0])
    print(indexed['bert-type-ids'].tolist()[0])
def test_squad_with_unwordpieceable_passage(self):
    """
    Build two reading-comprehension instances (one passage contains non-ASCII
    text) and check that BERT embedding of passage and question runs without error.
    """
    tokenizer = SpacyTokenizer()
    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = (
        "There were four major HDTV systems tested by SMPTE in the late 1970s, "
        "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
    )
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = (
        "Broca, being what today would be called a neurosurgeon, "
        "had taken an interest in the pathology of speech. He wanted "
        "to localize the difference between man and the other animals, "
        "which appeared to reside in speech. He discovered the speech "
        "center of the human brain, today called Broca's area after him. "
        "His interest was mainly in Biological anthropology, but a German "
        "philosopher specializing in psychology, Theodor Waitz, took up the "
        "theme of general and social anthropology in his six-volume work, "
        "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
        """soon translated as "The Anthropology of Primitive Peoples". """
        "The last two volumes were published posthumously."
    )
    question2 = "What did Broca discover in the human brain?"

    from allennlp.data.dataset_readers.reading_comprehension.util import (
        make_reading_comprehension_instance,
    )

    def _make_instance(question, passage):
        # Tokenize the question first, then the passage, as positional arguments.
        return make_reading_comprehension_instance(
            tokenizer.tokenize(question),
            tokenizer.tokenize(passage),
            {"bert": token_indexer},
            passage,
        )

    instance1 = _make_instance(question1, passage1)
    instance2 = _make_instance(question2, passage2)

    vocab = Vocabulary()
    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    qtokens = tensor_dict["question"]
    ptokens = tensor_dict["passage"]

    # A freshly initialised BERT is enough: only successful execution matters.
    model = BertModel(BertConfig(len(token_indexer.vocab)))
    embedder = BertEmbedder(model)

    _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
    _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
def ensure_batch_predictions_are_consistent(
        self, keys_to_ignore: Iterable[str] = ()):
    """
    Ensures that the model gives the same outputs for an instance whether it is
    run alone or inside the full batch of ``self.instances``.  Output keys that
    contain ``"loss"`` and keys listed in ``keys_to_ignore`` are not compared.

    Parameters
    ----------
    keys_to_ignore : ``Iterable[str]``, optional (default=())
        Names of metrics that should not be taken into account, e.g. "batch_weight".
    """
    self.model.eval()

    def _run_model(instances):
        # Pad the given instances to their own maximum lengths and run the model.
        dataset = Batch(instances)
        return self.model(**dataset.as_tensor_dict(dataset.get_padding_lengths()))

    per_instance_outputs = [_run_model([instance]) for instance in self.instances]
    batch_outputs = _run_model(self.instances)

    atol = 1e-6
    for index, outputs in enumerate(per_instance_outputs):
        for key, single_value in outputs.items():
            # Losses are too unstable to compare; agreement on everything else
            # is considered sufficient.
            if "loss" in key or key in keys_to_ignore:
                continue
            single_value = single_value[0]
            batch_value = batch_outputs[key][index]
            if not isinstance(single_value, torch.Tensor):
                assert single_value == batch_value, key
                continue
            if single_value.size() != batch_value.size():
                # Crop the batched output down to the shape of the single output
                # before comparing.
                crop = tuple(slice(0, size) for size in single_value.size())
                batch_value = batch_value[crop]
            assert_allclose(
                single_value.data.numpy(),
                batch_value.data.numpy(),
                atol=atol,
                err_msg=key,
            )
def test_end_to_end(self):
    """
    Index a ``[CLS] sentence1 [SEP] sentence2`` pair as one field and check the
    mask, wordpiece ids, wordpieces, offsets, type ids and embedding shape.
    """
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    sentence1 = "The quickest quick brown fox jumped over the lazy dog"
    sentence2 = "The quick brown fox jumped over the laziest lazy elmo"
    tokens1 = tokenizer.tokenize(sentence1)
    tokens2 = tokenizer.tokenize(sentence2)
    assert len(tokens1) == 10
    assert len(tokens2) == 10

    pair_tokens = [Token('[CLS]')] + tokens1 + [Token('[SEP]')] + tokens2
    assert len(pair_tokens) == 22

    vocab = Vocabulary()
    instance = Instance(
        {"sentence_pair": TextField(pair_tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["sentence_pair"]

    # One mask entry per input token.
    assert indexed['mask'].tolist()[0] == [1] * 22

    wordpiece_ids = indexed["bert"].tolist()[0]
    assert wordpiece_ids == [
        101, 1996, 4248, 4355, 4248, 2829, 4419, 5598, 2058, 1996, 13971,
        3899, 102, 1996, 4248, 2829, 4419, 5598, 2058, 1996, 2474, 14272,
        3367, 13971, 17709, 2080
    ]

    wordpieces = [vocab.get_token_from_index(i, "bert") for i in wordpiece_ids]
    assert wordpieces == [
        '[CLS]', 'the', 'quick', '##est', 'quick', 'brown', 'fox', 'jumped',
        'over', 'the', 'lazy', 'dog', '[SEP]', 'the', 'quick', 'brown',
        'fox', 'jumped', 'over', 'the', 'la', '##zie', '##st', 'lazy',
        'elm', '##o'
    ]

    # 22 tokens expand to 26 wordpieces.
    assert len(indexed['bert'][0]) == 26

    assert indexed["bert-offsets"].tolist()[0] == [
        0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        22, 23, 25
    ]

    assert indexed['bert-type-ids'].tolist()[0] == [0] * 13 + [1] * 13

    bert_vectors = self.token_embedder(
        indexed["bert"],
        offsets=indexed["bert-offsets"],
        token_type_ids=indexed['bert-type-ids'])
    assert list(bert_vectors.shape) == [1, 22, 768]
def test_end_to_end(self):
    """
    Tokenize, index and embed two sentences with the ``bert-base-uncased``
    pretrained-transformer tokenizer, indexer and embedder, checking tokens,
    tensor shape, mask and embedding size.
    """
    model_name = "bert-base-uncased"
    tokenizer = PretrainedTransformerTokenizer(model_name=model_name)
    token_indexer = PretrainedTransformerIndexer(model_name=model_name)

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = [
        "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
    ]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = [
        "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
    ]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    embedder_config = {
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer",
                "model_name": "bert-base-uncased"
            }
        },
        "embedder_to_indexer_map": {
            "bert": ["bert", "mask"]
        },
        "allow_unmatched_keys": True,
    }
    params = Params(embedder_config)
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                        params=params)

    instances = [
        Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        for tokens in (tokens1, tokens2)
    ]

    batch = Batch(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]

    max_length = max(len(tokens1), len(tokens2))
    assert indexed["bert"].shape == (2, max_length)

    # Attention mask: the shorter sentence is padded with two zeros.
    assert indexed["mask"].tolist() == [[1] * 9, [1] * 7 + [0] * 2]

    bert_vectors = token_embedder(indexed)
    assert bert_vectors.size() == (2, 9, 768)
def ensure_batch_predictions_are_consistent(
        self, keys_to_ignore: Iterable[str] = ()):
    """
    Ensures that the model gives the same outputs for an instance whether it is
    run alone or inside the full batch of ``self.instances``.  Output keys that
    contain ``'loss'`` and keys listed in ``keys_to_ignore`` are not compared.

    Parameters
    ----------
    keys_to_ignore : ``Iterable[str]``, optional (default=())
        Names of metrics that should not be taken into account, e.g. "batch_weight".
    """
    self.model.eval()

    def _run_model(instances):
        # Pad the given instances to their own maximum lengths and run the model.
        dataset = Batch(instances)
        return self.model(**dataset.as_tensor_dict(dataset.get_padding_lengths()))

    per_instance_outputs = [_run_model([instance]) for instance in self.instances]
    batch_outputs = _run_model(self.instances)

    atol = 1e-6
    for index, outputs in enumerate(per_instance_outputs):
        for key, single_value in outputs.items():
            # Losses are too unstable to compare; agreement on everything else
            # is considered sufficient.
            if 'loss' in key or key in keys_to_ignore:
                continue
            single_value = single_value[0]
            batch_value = batch_outputs[key][index]
            if not isinstance(single_value, torch.Tensor):
                assert single_value == batch_value, key
                continue
            if single_value.size() != batch_value.size():
                # Crop the batched output down to the shape of the single output
                # before comparing.
                crop = tuple(slice(0, size) for size in single_value.size())
                batch_value = batch_value[crop]
            assert_allclose(single_value.data.numpy(),
                            batch_value.data.numpy(),
                            atol=atol,
                            err_msg=key)
def test_as_tensor_dict(self):
    """Check the padded token ids produced for the ``text1`` and ``text2`` fields."""
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())

    text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
    text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

    expected_text1 = numpy.array([[2, 3, 4, 5, 6],
                                  [1, 3, 4, 5, 6]])
    # The second row of text2 is padded with zeros.
    expected_text2 = numpy.array([[2, 3, 4, 1, 5, 6],
                                  [2, 3, 1, 0, 0, 0]])
    numpy.testing.assert_array_almost_equal(text1, expected_text1)
    numpy.testing.assert_array_almost_equal(text2, expected_text2)
def test_padding_lengths_uses_max_instance_lengths(self):
    """Check the padding lengths computed for ``self.instances``."""
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    expected_lengths = {
        u"text1": {u"num_tokens": 5},
        u"text2": {u"num_tokens": 6},
    }
    assert dataset.get_padding_lengths() == expected_lengths
def test_as_tensor_dict(self):
    """Check the padded token ids produced for the ``text1`` and ``text2`` fields."""
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())

    text1 = tensors[u"text1"][u"tokens"].detach().cpu().numpy()
    text2 = tensors[u"text2"][u"tokens"].detach().cpu().numpy()

    expected_text1 = numpy.array([[2, 3, 4, 5, 6],
                                  [1, 3, 4, 5, 6]])
    # The second row of text2 is padded with zeros.
    expected_text2 = numpy.array([[2, 3, 4, 1, 5, 6],
                                  [2, 3, 1, 0, 0, 0]])
    numpy.testing.assert_array_almost_equal(text1, expected_text1)
    numpy.testing.assert_array_almost_equal(text2, expected_text2)
def preprocess(self, data, result_flag=0):
    """
    Convert reviews into a padded tensor dict.

    Each review's ``'text'`` is turned into an instance by
    ``self.datareader.text_to_instance``.  When ``result_flag`` is truthy the
    label comes from ``self.get_label(review)``; otherwise a placeholder label of
    ``-1`` is used (prediction mode).
    """
    instances = [
        self.datareader.text_to_instance(
            review['text'],
            self.get_label(review) if result_flag else -1,
        )
        for review in data
    ]
    data_batch = Batch(instances)
    data_batch.index_instances(self.vocab)
    return data_batch.as_tensor_dict(data_batch.get_padding_lengths())
def test_end_to_end(self):
    """
    Index two sentences with ``self.token_indexer`` and check wordpiece ids,
    offsets, and embedding shapes with and without offsets, for both the default
    embedder and a ``top_layer_only`` embedder.
    """
    tokenizer = BertPreTokenizer()

    # Expected wordpiece ids:
    #            2   3    4   3     5     6   8      9    2   14   12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    #            2   3     5     6   8      9    2  15 10 11  14   1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    token_lists = [tokenizer.tokenize(sentence1), tokenizer.tokenize(sentence2)]

    vocab = Vocabulary()
    instances = [
        Instance({"tokens": TextField(words, {"bert": self.token_indexer})})
        for words in token_lists
    ]

    batch = Batch(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]

    # 16 = [CLS], 17 = [SEP]
    assert indexed["bert"].tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
        [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
    ]
    assert indexed["bert-offsets"].tolist() == [
        [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
    ]

    # Without offsets: 14 vectors ([CLS] + 12 wordpieces + [SEP]).
    bert_vectors = self.token_embedder(indexed["bert"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    # With offsets: one vector per original token, i.e. 10.
    bert_vectors = self.token_embedder(indexed["bert"], offsets=indexed["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]

    # Same checks with top_layer_only = True.
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(indexed["bert"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    bert_vectors = tlo_embedder(indexed["bert"], offsets=indexed["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]
def regenerate_tokens(self, tokens_list, metadata, device):
    """
    Build a tensor batch from lists of tokens.

    Each entry of ``tokens_list`` is converted with
    ``metadata[0]["convert_tokens_to_instance"]``; the instances are indexed with
    the wrapped model's vocabulary and the ``"document"`` tensors are returned on
    ``device``.
    """
    instances = [
        metadata[0]["convert_tokens_to_instance"](words)
        for words in tokens_list
    ]
    batch = Batch(instances)
    batch.index_instances(self._model["model"]._vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())
    return {k: v.to(device) for k, v in tensors["document"].items()}
def ensure_batch_predictions_are_consistent(self):
    """
    Check that the model produces the same outputs for an instance whether it is
    run on its own or as part of the full batch of ``self.instances``.

    Tensors are built with ``for_training=False``.  The ``'loss'`` output is
    skipped.  When a batched ``Variable`` output has a different size from the
    single-instance one, it is cropped along its first dimension; only 1-D and
    2-D outputs are supported in that case.

    Raises
    ------
    NotImplementedError
        If a size-mismatched batched output has more than two dimensions.
    """
    self.model.eval()

    def _run_model(instances):
        # Pad the given instances to their own maximum lengths and run the model.
        dataset = Batch(instances)
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(),
                                         for_training=False)
        return self.model(**tensors)

    per_instance_outputs = [_run_model([instance]) for instance in self.instances]
    batch_outputs = _run_model(self.instances)

    atol = 1e-6
    for index, outputs in enumerate(per_instance_outputs):
        for key, single_value in outputs.items():
            if key == 'loss':
                # The loss is too unstable to compare; agreement on everything
                # else is considered sufficient.
                continue
            single_value = single_value[0]
            batch_value = batch_outputs[key][index]
            if not isinstance(single_value, torch.autograd.Variable):
                assert single_value == batch_value, key
                continue
            if single_value.size() != batch_value.size():
                # Probably a sequence model whose batched output carries padded
                # elements.  A general fix is complicated, so only the simple
                # cases that actually occur are handled.
                num_tokens = single_value.size(0)
                rank = batch_value.dim()
                if rank == 1:
                    batch_value = batch_value[:num_tokens]
                elif rank == 2:
                    batch_value = batch_value[:num_tokens, :]
                else:
                    raise NotImplementedError
            assert_allclose(single_value.data.numpy(),
                            batch_value.data.numpy(),
                            atol=atol,
                            err_msg=key)
def generate_tokens(self, new_tokens, metadata, labels):
    """
    Build a tensor batch from token lists and their per-option labels.

    For each ``(tokens, instance_labels)`` pair, the labels for keys ``"A"``
    through ``"E"`` are passed with the tokens to
    ``metadata[0]["convert_tokens_to_instance"]``.  The resulting instances are
    indexed with ``self._vocabulary`` and the ``"document"`` tensors are returned
    on ``self._vector.device``.
    """
    option_keys = ["A", "B", "C", "D", "E"]
    instances = []
    for tokens, instance_labels in zip(new_tokens, labels):
        option_labels = [instance_labels[k] for k in option_keys]
        instances.extend(
            metadata[0]["convert_tokens_to_instance"](tokens, option_labels))

    batch = Batch(instances)
    batch.index_instances(self._vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())
    return {k: v.to(self._vector.device) for k, v in tensors["document"].items()}
def test_squad_with_unwordpieceable_passage(self):
    """
    Build two reading-comprehension instances (one passage contains non-ASCII
    text) and check that BERT embedding of passage and question runs without error.
    """
    # pylint: disable=line-too-long
    tokenizer = WordTokenizer()
    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = ("Broca, being what today would be called a neurosurgeon, "
                "had taken an interest in the pathology of speech. He wanted "
                "to localize the difference between man and the other animals, "
                "which appeared to reside in speech. He discovered the speech "
                "center of the human brain, today called Broca's area after him. "
                "His interest was mainly in Biological anthropology, but a German "
                "philosopher specializing in psychology, Theodor Waitz, took up the "
                "theme of general and social anthropology in his six-volume work, "
                "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                """soon translated as "The Anthropology of Primitive Peoples". """
                "The last two volumes were published posthumously.")
    question2 = "What did Broca discover in the human brain?"

    from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

    def _make_instance(question, passage):
        # Tokenize the question first, then the passage, as positional arguments.
        return make_reading_comprehension_instance(tokenizer.tokenize(question),
                                                   tokenizer.tokenize(passage),
                                                   {"bert": token_indexer},
                                                   passage)

    instance1 = _make_instance(question1, passage1)
    instance2 = _make_instance(question2, passage2)

    vocab = Vocabulary()
    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    qtokens = tensor_dict["question"]
    ptokens = tensor_dict["passage"]

    # A freshly initialised BERT is enough: only successful execution matters.
    model = BertModel(BertConfig(len(token_indexer.vocab)))
    embedder = BertEmbedder(model)

    _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
    _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
def test_sliding_window(self):
    """
    Index one sentence with ``max_pieces=8`` and no truncation, then check the
    windowed wordpiece ids, the offsets, and the embedding shapes with and
    without offsets and token type ids.
    """
    bert_fixtures = self.FIXTURES_ROOT / "bert"
    tokenizer = BertPreTokenizer()
    sentence = "the quickest quick brown fox jumped over the lazy dog"
    words = tokenizer.tokenize(sentence)
    vocab = Vocabulary()

    token_indexer = PretrainedBertIndexer(str(bert_fixtures / "vocab.txt"),
                                          truncate_long_sequences=False,
                                          max_pieces=8)
    config = BertConfig(str(bert_fixtures / "config.json"))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(words, {"bert": token_indexer})})
    batch = Batch([instance])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    indexed = tensor_dict["tokens"]

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    expected_ids = [[
        16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2,
        14, 12, 17
    ]]
    expected_offsets = [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
    assert indexed["bert"].tolist() == expected_ids
    assert indexed["bert-offsets"].tolist() == expected_offsets

    bert_vectors = token_embedder(indexed["bert"])
    assert list(bert_vectors.shape) == [1, 13, 12]

    # Testing without token_type_ids
    bert_vectors = token_embedder(indexed["bert"],
                                  offsets=indexed["bert-offsets"])
    assert list(bert_vectors.shape) == [1, 10, 12]

    # Testing with token_type_ids
    bert_vectors = token_embedder(indexed["bert"],
                                  offsets=indexed["bert-offsets"],
                                  token_type_ids=indexed["bert-type-ids"])
    assert list(bert_vectors.shape) == [1, 10, 12]
def test_end_to_end(self):
    """Index a two-sentence batch and check ids, offsets and embedder output shapes.

    Uses ``self.token_indexer``, ``self.token_embedder`` and ``self.bert_model``
    prepared elsewhere on the test case.  The same shape checks are run for the
    default embedder and for one built with ``top_layer_only=True``.
    """
    word_tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    # Expected word-piece ids for the first sentence:
    #   2 3 4 3 5 6 8 9 2 14 12
    first_words = word_tokenizer.tokenize("the quickest quick brown fox jumped over the lazy dog")

    # Expected word-piece ids for the second sentence:
    #   2 3 5 6 8 9 2 15 10 11 14 1
    second_words = word_tokenizer.tokenize("the quick brown fox jumped over the laziest lazy elmo")

    vocabulary = Vocabulary()

    pair = Batch([
        Instance({"tokens": TextField(first_words, {"bert": self.token_indexer})}),
        Instance({"tokens": TextField(second_words, {"bert": self.token_indexer})}),
    ])
    pair.index_instances(vocabulary)
    indexed = pair.as_tensor_dict(pair.get_padding_lengths())["tokens"]

    expected_ids = [
        [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 0],
        [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1],
    ]
    expected_offsets = [
        [0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        [0, 1, 2, 3, 4, 5, 6, 9, 10, 11],
    ]
    assert indexed["bert"].tolist() == expected_ids
    assert indexed["bert-offsets"].tolist() == expected_offsets

    def check_output_shapes(embedder):
        # No offsets, should get 12 vectors back.
        without_offsets = embedder(indexed["bert"])
        assert list(without_offsets.shape) == [2, 12, 12]

        # Offsets, should get 10 vectors back.
        with_offsets = embedder(indexed["bert"], offsets=indexed["bert-offsets"])
        assert list(with_offsets.shape) == [2, 10, 12]

    check_output_shapes(self.token_embedder)

    ## Now try top_layer_only = True
    check_output_shapes(BertEmbedder(self.bert_model, top_layer_only=True))
def test_end_to_end_with_higher_order_inputs(self):
    """Check embedder output shapes when text fields are nested inside ``ListField``s.

    The first instance wraps one text field and the second wraps two.  The
    shape checks are run for ``self.token_embedder`` and for an embedder built
    with ``top_layer_only=True``.
    """
    word_tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    indexers = {"bert": self.token_indexer}

    # Expected word-piece ids: 2 3 4 3 5 6 8 9 2 14 12
    first_field = TextField(
        word_tokenizer.tokenize("the quickest quick brown fox jumped over the lazy dog"),
        indexers,
    )

    # Expected word-piece ids: 2 3 5 6 8 9 2 15 10 11 14 1
    second_field = TextField(
        word_tokenizer.tokenize("the quick brown fox jumped over the laziest lazy elmo"),
        indexers,
    )

    # Expected word-piece ids: 2 5 15 10 11 6
    third_field = TextField(
        word_tokenizer.tokenize("the brown laziest fox"),
        indexers,
    )

    vocabulary = Vocabulary()

    nested = Batch([
        Instance({"tokens": ListField([first_field])}),
        Instance({"tokens": ListField([second_field, third_field])}),
    ])
    nested.index_instances(vocabulary)
    indexed = nested.as_tensor_dict(nested.get_padding_lengths(), verbose=True)["tokens"]

    def check_output_shapes(embedder):
        # No offsets, should get 12 vectors back.
        without_offsets = embedder(indexed["bert"])
        assert list(without_offsets.shape) == [2, 2, 12, 12]

        # Offsets, should get 10 vectors back.
        with_offsets = embedder(indexed["bert"], offsets=indexed["bert-offsets"])
        assert list(with_offsets.shape) == [2, 2, 10, 12]

    check_output_shapes(self.token_embedder)

    ## Now try top_layer_only = True
    check_output_shapes(BertEmbedder(self.bert_model, top_layer_only=True))
def bert_vector(self):
    """Run ``self.text`` through the BERT model and return its output.

    Relies on ``tokenizer``, ``token_indexer``, ``vocab`` and ``model`` being
    available from the enclosing scope (they are not defined in this method).
    """
    # Split on runs of non-word characters, then re-join the pieces with single spaces.
    pieces = re.split(r'\W+', self.text)
    normalized = ' '.join(pieces)

    field = TextField(tokenizer.tokenize(normalized), {'bert': token_indexer})
    single = Batch([Instance({"tokens": field})])
    single.index_instances(vocab)

    indexed = single.as_tensor_dict(single.get_padding_lengths())["tokens"]
    return model(indexed["bert"])
def test_read(self, lazy):
    """Read the SNLI sample fixture with a BERT indexer and check the first instances.

    Asserts the label, weight and token texts of the first instance and the
    weights of the next two, then indexes the whole batch and prints the
    first row of several of the resulting tensors for inspection.

    NOTE(review): ``lazy`` is accepted but never passed on to the reader —
    confirm whether that is intended.
    """
    bert_indexer = PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)
    reader = SnliReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={'bert': bert_indexer},
    )
    sample_path = str(self.FIXTURES_ROOT / 'snli_1.0_sample.jsonl')
    instances = ensure_list(reader.read(sample_path))

    first_fields = instances[0].fields
    token_texts = [token.text for token in first_fields['tokens'].tokens]
    label = first_fields['label'].label
    weight = first_fields['weight'].weight

    assert label == 'neutral'
    assert weight == 1
    assert instances[1].fields['weight'].weight == 0.5
    assert instances[2].fields['weight'].weight == 1

    # The two sentences of the pair, joined by a single separator token.
    first_sentence = [
        'a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken',
        'down', 'airplane', '.',
    ]
    second_sentence = [
        'a', 'person', 'is', 'training', 'his', 'horse', 'for', 'a',
        'competition', '.',
    ]
    assert token_texts == first_sentence + ['[SEP]'] + second_sentence

    batch = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    batch.index_instances(vocab)
    indexed = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    # Debug output: first row of each tensor produced for the "tokens" field.
    print(indexed['mask'].tolist()[0])
    first_row_ids = indexed["bert"].tolist()[0]
    print(first_row_ids)
    print([vocab.get_token_from_index(index, "bert") for index in first_row_ids])
    print(len(indexed['bert'][0]))
    print(indexed["bert-offsets"].tolist()[0])
    print(indexed['bert-type-ids'].tolist()[0])
def test_max_length(self):
    """Run the embedder on a 1000-word sentence; the test passes if nothing raises.

    A fresh ``BertModel`` is built from a config sized to the vocabulary of
    ``self.token_indexer``.  No assertion is made on the output.
    """
    vocab_size = len(self.token_indexer.vocab)
    embedder = BertEmbedder(BertModel(BertConfig(vocab_size)))

    word_tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    long_sentence = "the " * 1000
    words = word_tokenizer.tokenize(long_sentence)

    vocabulary = Vocabulary()

    batch = Batch([Instance({"tokens": TextField(words, {"bert": self.token_indexer})})])
    batch.index_instances(vocabulary)

    indexed = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]
    # Offsets are passed positionally here, exactly as the call is exercised by this test.
    embedder(indexed["bert"], indexed["bert-offsets"])
# Optionally dump per-epoch artefacts for the training video.
# NOTE(review): the original layout of this fragment was lost; both `pf` calls
# are assumed to belong to the `if` body, and `i` is assumed to be the index of
# an enclosing loop that is not visible here — confirm against the full script.
if create_video_training:
    pf.create_image_weights_epoch(model, video_fotograms_folder2, i)
    pf.create_Bayesian_analysis_charts_simplified(
        model, train_dataset, validation_dataset,
        tr_data_loss, val_data_loss, KL_loss,
        video_fotograms_folder4, i + 1)

# Kept from the original author as a usage note (not executed):
#   output = model(tensor_dict["text_field"], tensor_dict["tags_field"])
#   loss = output["loss"]
# We can get the loss because we gave the labels as input.

# ############## Use the trained model ######################
# We use an already implemented predictor that takes the model and how to
# preprocess the data.

# Example input; each character of the name becomes one token.
name_exmaple = "Carlos Sanchez"
tokens_list = list(name_exmaple)

# Build a single-instance batch (no tags supplied) and index it with the vocabulary.
Instance_test = reader.generate_instance(tokens_list, None)
batch = Batch([Instance_test])
batch.index_instances(vocab)

padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)

# Switch the model to evaluation mode before predicting.
model.eval()
tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
# Pick the highest-scoring entry along the last axis and map it back to label strings.
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids])
def test_padding_lengths_uses_max_instance_lengths(self):
    """Check the padding lengths reported for a batch built from ``self.instances``.

    The batch is indexed with ``self.vocab`` and its padding lengths are
    compared against fixed per-field values (presumably the maxima over the
    fixture instances — verify against the fixture setup).
    """
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)

    expected = {
        "text1": {"num_tokens": 5, "tokens_length": 5},
        "text2": {"num_tokens": 6, "tokens_length": 6},
    }
    assert batch.get_padding_lengths() == expected