def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    # Attention mask
    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, False, False],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
def test_end_to_end_t5(
    self,
    train_parameters: bool,
    last_layer_only: bool,
    gradient_checkpointing: bool,
):
    tokenizer = PretrainedTransformerTokenizer(model_name="patrickvonplaten/t5-tiny-random")
    token_indexer = PretrainedTransformerIndexer(model_name="patrickvonplaten/t5-tiny-random")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["▁Allen", "N", "LP", "▁is", "▁great", "</s>"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "patrickvonplaten/t5-tiny-random",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                    "gradient_checkpointing": gradient_checkpointing,
                    "sub_module": "encoder",
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    # Attention mask
    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, False, False],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 8, 64)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
def ensure_batch_predictions_are_consistent(self, keys_to_ignore: Iterable[str] = ()):
    """
    Ensures that the model performs the same on a batch of instances as on individual instances.
    Ignores metrics matching the regexp .*loss.* and those specified explicitly.

    # Parameters

    keys_to_ignore : `Iterable[str]`, optional (default=`()`)
        Names of metrics that should not be taken into account, e.g. "batch_weight".
    """
    self.model.eval()
    single_predictions = []
    for i, instance in enumerate(self.instances):
        dataset = Batch([instance])
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
        result = self.model(**tensors)
        single_predictions.append(result)

    full_dataset = Batch(self.instances)
    batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
    batch_predictions = self.model(**batch_tensors)
    for i, instance_predictions in enumerate(single_predictions):
        for key, single_predicted in instance_predictions.items():
            tolerance = 1e-6
            if "loss" in key:
                # Loss is particularly unstable; we'll just be satisfied if everything else is
                # close.
                continue
            if key in keys_to_ignore:
                continue
            single_predicted = single_predicted[0]
            batch_predicted = batch_predictions[key][i]
            if isinstance(single_predicted, torch.Tensor):
                if single_predicted.size() != batch_predicted.size():
                    slices = tuple(slice(0, size) for size in single_predicted.size())
                    batch_predicted = batch_predicted[slices]
                assert_allclose(
                    single_predicted.data.numpy(),
                    batch_predicted.data.numpy(),
                    atol=tolerance,
                    err_msg=key,
                )
            else:
                assert single_predicted == batch_predicted, key
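# Usage sketch (illustrative, not part of the original code): this consistency check is
# normally invoked from a model test that subclasses ModelTestCase and has already loaded a
# model and instances via set_up_model. The class name and fixture paths below are
# hypothetical placeholders.
from allennlp.common.testing import ModelTestCase


class SimpleTaggerConsistencyTest(ModelTestCase):
    def setup_method(self):
        super().setup_method()
        # Hypothetical fixture files; substitute a real config and dataset.
        self.set_up_model(
            self.FIXTURES_ROOT / "simple_tagger" / "experiment.json",
            self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv",
        )

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()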
def test_long_sequence_splitting_end_to_end(self):
    # Mostly the same as the end_to_end test (except for adding max_length=4),
    # because we don't want this splitting behavior to change input/output format.
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased",
                    "max_length": 4,
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))
    # Adds n_segments * 2 special tokens
    segment_concat_length = int(math.ceil(max_length / 4)) * 2 + max_length

    assert tokens["bert"]["token_ids"].shape == (2, segment_concat_length)
    assert tokens["bert"]["mask"].tolist() == [
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
    ]
    assert tokens["bert"]["segment_concat_mask"].tolist() == [
        [1] * segment_concat_length,
        [1] * (segment_concat_length - 4) + [0] * 4,  # 4 is hard-coded length difference
    ]

    # Attention mask
    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
def transform_collate(
    vocab,  # use vocab to index the transformed instances
    reader,  # call the reader's function to transform instances
    transform: Callable,
    instances: List[Instance],
) -> TensorDict:
    new_instances = reader.transform_instances(transform, instances)
    batch = Batch(new_instances)
    batch.index_instances(vocab)
    return batch.as_tensor_dict(batch.get_padding_lengths())
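# Illustrative helper (hypothetical, not part of the original code): transform_collate takes
# extra arguments beyond the batch, so it is meant to be partially applied until it matches
# the single-argument collate_fn signature that torch.utils.data.DataLoader expects. A plain
# list of Instances works as a map-style dataset.
def make_transform_loader(vocab, reader, transform, instances, batch_size=32):
    from functools import partial

    from torch.utils.data import DataLoader

    # Bind vocab, reader, and transform; the DataLoader supplies the list of instances.
    collate_fn = partial(transform_collate, vocab, reader, transform)
    return DataLoader(instances, batch_size=batch_size, collate_fn=collate_fn)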
def test_end_to_end_for_first_sub_token_embedding(self, sub_token_mode: str):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]

    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                    "sub_token_mode": sub_token_mode,
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, False],
        [True, True, True, True, True, True],
    ]
    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [2, 2], [3, 5], [6, 6], [7, 7], [0, 0]],
        [[1, 3], [4, 4], [5, 5], [6, 6], [7, 8], [9, 9]],
    ]

    # Attention mask
    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
def test_end_to_end(self):
    tokenizer = BertPreTokenizer()

    # wordpiece ids: 2 3 4 3 5 6 8 9 2 14 12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)

    # wordpiece ids: 2 3 5 6 8 9 2 15 10 11 14 1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)

    vocab = Vocabulary()

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # 16 = [CLS], 17 = [SEP]
    assert tokens["input_ids"].tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
        [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
    ]
    assert tokens["offsets"].tolist() == [
        [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
    ]

    # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
    bert_vectors = self.token_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]

    # Now try top_layer_only = True
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    bert_vectors = tlo_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]
def test_as_tensor_dict(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    padding_lengths = dataset.get_padding_lengths()
    tensors = dataset.as_tensor_dict(padding_lengths)
    text1 = tensors["text1"]["tokens"]["tokens"].detach().cpu().numpy()
    text2 = tensors["text2"]["tokens"]["tokens"].detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(
        text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]])
    )
    numpy.testing.assert_array_almost_equal(
        text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]])
    )
def test_padding_lengths_uses_max_instance_lengths(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    padding_lengths = dataset.get_padding_lengths()
    assert padding_lengths == {
        "text1": {"tokens___tokens": 5},
        "text2": {"tokens___tokens": 6},
    }
def test_sliding_window(self):
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(
        str(vocab_path), truncate_long_sequences=False, max_pieces=8
    )

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    assert tokens["input_ids"].tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2, 14, 12, 17]
    ]
    assert tokens["offsets"].tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

    bert_vectors = token_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [1, 13, 12]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [1, 10, 12]

    # Testing with token_type_ids
    bert_vectors = token_embedder(
        tokens["input_ids"], offsets=tokens["offsets"], token_type_ids=tokens["token_type_ids"]
    )
    assert list(bert_vectors.shape) == [1, 10, 12]
def test_end_to_end_with_higher_order_inputs(self):
    tokenizer = BertPreTokenizer()

    # wordpiece ids: 2 3 4 3 5 6 8 9 2 14 12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)
    text_field1 = TextField(tokens1, {"bert": self.token_indexer})

    # wordpiece ids: 2 3 5 6 8 9 2 15 10 11 14 1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)
    text_field2 = TextField(tokens2, {"bert": self.token_indexer})

    # wordpiece ids: 2 5 15 10 11 6
    sentence3 = "the brown laziest fox"
    tokens3 = tokenizer.tokenize(sentence3)
    text_field3 = TextField(tokens3, {"bert": self.token_indexer})

    vocab = Vocabulary()

    instance1 = Instance({"tokens": ListField([text_field1])})
    instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
    tokens = tensor_dict["tokens"]["bert"]

    # No offsets, should get 14 vectors back ([CLS] + 12 wordpieces + [SEP])
    bert_vectors = self.token_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 2, 14, 12]

    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 2, 10, 12]

    # Now try top_layer_only = True
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 2, 14, 12]

    bert_vectors = tlo_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 2, 10, 12]
def test_token_without_wordpieces(self):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", "", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "", "great"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
        [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
    assert all(bert_vectors[0, 1] == 0)
    assert all(bert_vectors[1, 1] == 0)
def test_sliding_window_with_batch(self):
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(
        str(vocab_path), truncate_long_sequences=False, max_pieces=8
    )

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
    instance2 = Instance(
        {"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})}
    )

    batch = Batch([instance, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert bert_vectors is not None

    # Testing with token_type_ids
    bert_vectors = token_embedder(
        tokens["input_ids"], offsets=tokens["offsets"], token_type_ids=tokens["token_type_ids"]
    )
    assert bert_vectors is not None
def test_max_length(self):
    config = BertConfig(len(self.token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    tokenizer = BertPreTokenizer()
    sentence = "the " * 1000
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]
    embedder(tokens["input_ids"], tokens["offsets"])
def test_throws_error_on_incorrect_sub_token_mode(self, sub_token_mode: str):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]

    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                    "sub_token_mode": sub_token_mode,
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    with pytest.raises(ConfigurationError):
        token_embedder(tokens)
def test_exotic_tokens_no_nan_grads(self):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", "", "AllenNLP", "sentence", "."]
    sentence2 = ["A", "\uf732\uf730\uf730\uf733", "AllenNLP", "sentence", "."]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]

    vocab = Vocabulary()

    token_embedder = BasicTextFieldEmbedder(
        {"bert": PretrainedTransformerMismatchedEmbedder("bert-base-uncased")}
    )

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    bert_vectors = token_embedder(tokens)
    test_loss = bert_vectors.mean()

    test_loss.backward()

    for name, param in token_embedder.named_parameters():
        grad = param.grad
        assert (grad is None) or (not torch.any(torch.isnan(grad)).item())
def test_padding_for_equal_length_indices(self):
    tokenizer = BertPreTokenizer()

    # wordpiece ids: 2 3 5 6 8 9 2 14 12
    sentence = "the quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    assert tokens["input_ids"].tolist() == [[16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]]
    assert tokens["offsets"].tolist() == [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
def test_squad_with_unwordpieceable_passage(self):
    tokenizer = SpacyTokenizer()

    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = (
        "There were four major HDTV systems tested by SMPTE in the late 1970s, "
        "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
    )
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = (
        "Broca, being what today would be called a neurosurgeon, "
        "had taken an interest in the pathology of speech. He wanted "
        "to localize the difference between man and the other animals, "
        "which appeared to reside in speech. He discovered the speech "
        "center of the human brain, today called Broca's area after him. "
        "His interest was mainly in Biological anthropology, but a German "
        "philosopher specializing in psychology, Theodor Waitz, took up the "
        "theme of general and social anthropology in his six-volume work, "
        "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
        """soon translated as "The Anthropology of Primitive Peoples". """
        "The last two volumes were published posthumously."
    )
    question2 = "What did Broca discover in the human brain?"

    def make_reading_comprehension_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
    ) -> Instance:
        metadata = {
            "original_passage": passage_text,
            "token_offsets": [
                (token.idx, token.idx + len(token.text)) for token in passage_tokens
            ],
            "question_tokens": [token.text for token in question_tokens],
            "passage_tokens": [token.text for token in passage_tokens],
        }
        fields = {
            "passage": TextField(passage_tokens, token_indexers),
            "question": TextField(question_tokens, token_indexers),
            "metadata": MetadataField(metadata),
        }
        return Instance(fields)

    instance1 = make_reading_comprehension_instance(
        tokenizer.tokenize(question1),
        tokenizer.tokenize(passage1),
        {"bert": token_indexer},
        passage1,
    )
    instance2 = make_reading_comprehension_instance(
        tokenizer.tokenize(question2),
        tokenizer.tokenize(passage2),
        {"bert": token_indexer},
        passage2,
    )

    vocab = Vocabulary()

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    qtokens = tensor_dict["question"]["bert"]
    ptokens = tensor_dict["passage"]["bert"]

    config = BertConfig(len(token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    _ = embedder(ptokens["input_ids"], offsets=ptokens["offsets"])
    _ = embedder(qtokens["input_ids"], offsets=qtokens["offsets"])
def allennlp_collate(instances: List[Instance]) -> TensorDict:
    batch = Batch(instances)
    return batch.as_tensor_dict(batch.get_padding_lengths())
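# Usage sketch (hypothetical, not part of the original code): allennlp_collate can be handed
# straight to a PyTorch DataLoader as its collate_fn, provided the instances have already been
# indexed against a vocabulary (the function itself does not call index_instances).
def make_basic_loader(instances: List[Instance], batch_size: int = 2):
    from torch.utils.data import DataLoader

    return DataLoader(instances, batch_size=batch_size, collate_fn=allennlp_collate)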
def _get_training_tensors(self):
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    padding_lengths = batch.get_padding_lengths()
    return batch.as_tensor_dict(padding_lengths)
def sentence_removal_collate(
    vocab: Vocabulary, instances: List[Instance], probability_of_modified_text: float = 1
) -> TensorDict:
    augmented_instances = []
    for instance in instances:
        # Pick one sentence to remove from the instance's text.
        sentences = instance["metadata"]["sentences"]
        removed_sentence_index = random.randint(0, len(sentences) - 1)
        removed_sentence_length = len(sentences[removed_sentence_index])
        modified_sentences = (
            sentences[:removed_sentence_index] + sentences[removed_sentence_index + 1 :]
        )
        words = [Token(word) for sentence in modified_sentences for word in sentence]
        sentence_index_span_map = instance["metadata"]["sentence_index_span_map"]
        # Spans before the removed sentence keep their offsets; spans after it are shifted
        # left by the length of the removed sentence.
        spans = [
            span
            for sent_index in range(removed_sentence_index)
            for span in sentence_index_span_map[sent_index]
        ] + [
            (span[0] - removed_sentence_length, span[1] - removed_sentence_length)
            for sent_index in range(removed_sentence_index + 1, len(sentences))
            for span in sentence_index_span_map[sent_index]
        ]
        if (
            len(spans) > 0
            and len(sentences) > 1
            and random.random() < probability_of_modified_text
        ):
            instance.add_field(
                "modified_text", TextField(words, instance["text"]._token_indexers)
            )
            spans = [SpanField(span[0], span[1], instance["modified_text"]) for span in spans]
            instance.add_field("modified_spans", ListField(spans))
            instance["metadata"].metadata["removed_text_start"] = sum(
                len(s) for s in sentences[:removed_sentence_index]
            )
            instance["metadata"].metadata["removed_text_end"] = (
                instance["metadata"].metadata["removed_text_start"] + removed_sentence_length
            )
            instance["metadata"].metadata["modified_span_indices"] = [
                i
                for i in range(len(instance["spans"].field_list))
                if instance["spans"].field_list[i].span_start
                < instance["metadata"].metadata["removed_text_start"]
                or instance["spans"].field_list[i].span_start
                >= instance["metadata"].metadata["removed_text_end"]
            ]
            instance["modified_text"].index(vocab)
            instance["metadata"].metadata["modified_text_loss"] = True
            augmented_instances.append(instance)
            # Also keep a copy of the instance that computes loss on the original text.
            instance2 = deepcopy(instance)
            instance2["metadata"].metadata["modified_text_loss"] = False
            augmented_instances.append(instance2)
        else:
            # No modification: reuse the original text and spans unchanged.
            instance.add_field("modified_text", instance["text"])
            instance.add_field("modified_spans", instance["spans"])
            instance["metadata"].metadata["modified_span_indices"] = list(
                range(len(instance["spans"].field_list))
            )
            instance["metadata"].metadata["modified_text_loss"] = True
            augmented_instances.append(instance)
    batch = Batch(augmented_instances)
    return batch.as_tensor_dict(batch.get_padding_lengths())
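# Usage note (hypothetical wiring, not part of the original code): when the modified-text
# branch fires, two instances are appended per input instance, so the resulting batch can be
# up to twice the size of the input list. As with the other collate functions above, bind the
# extra arguments first to obtain a single-argument collate_fn, e.g.:
#
#     from functools import partial
#
#     collate_fn = partial(sentence_removal_collate, vocab, probability_of_modified_text=0.5)
#     tensor_dict = collate_fn(instances)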