def test_elmo_empty_token_list(self):
    indexer = CustomELMoTokenCharactersIndexer()
    indexer = {'elmo': indexer}
    tokens_1 = TextField([Token('Apple')], indexer)
    targets_1 = ListField([TextField([Token('Apple')], indexer)])
    tokens_2 = TextField([Token('Screen'), Token('device')], indexer)
    targets_2 = ListField([
        TextField([Token('Screen')], indexer),
        TextField([Token('Device')], indexer)
    ])
    instance_1 = Instance({'tokens': tokens_1, 'targets': targets_1})
    instance_2 = Instance({'tokens': tokens_2, 'targets': targets_2})
    a_batch = Batch([instance_1, instance_2])
    a_batch.index_instances(Vocabulary())
    batch_tensor = a_batch.as_tensor_dict()
    elmo_target_token_indices = batch_tensor['targets']['elmo']['tokens']
    empty_target = elmo_target_token_indices[0][1].numpy()
    np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
    non_empty_targets = [
        elmo_target_token_indices[0][0],
        elmo_target_token_indices[1][0],
        elmo_target_token_indices[1][1],
    ]
    for non_empty_target in non_empty_targets:
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target)
def forward_on_instances(self, instances: List[Instance],
                         **kwargs) -> List[Dict[str, np.ndarray]]:
    # An exact copy of the original method, but supports kwargs.
    batch_size = len(instances)
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
        outputs = self.make_output_human_readable(self(**model_input, **kwargs))

        instance_separated_output: List[Dict[str, np.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.Tensor):
                if output.dim() == 0:
                    output = output.unsqueeze(0)
                if output.size(0) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                output = output.detach().cpu().numpy()
            elif len(output) != batch_size:
                self._maybe_warn_for_unseparable_batches(name)
                continue
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
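# A minimal usage sketch of the kwargs-forwarding variant above. `MyModel` and the
# `teacher_forcing` keyword are assumptions standing in for whatever subclass defines
# this method and whatever extra arguments its forward() accepts; any keyword passed
# here is forwarded straight into forward() alongside the indexed batch.
#
#     model = MyModel(vocab)
#     per_instance = model.forward_on_instances(instances, teacher_forcing=False)
#     assert len(per_instance) == len(instances)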
def test_elmo_empty_token_list(self):
    # Basic test
    indexer = ELMoTokenCharactersIndexer()
    assert {"elmo_tokens": []} == indexer.get_empty_token_list()

    # Real world test
    indexer = {"elmo": indexer}
    tokens_1 = TextField([Token("Apple")], indexer)
    targets_1 = ListField([TextField([Token("Apple")], indexer)])
    tokens_2 = TextField([Token("Screen"), Token("device")], indexer)
    targets_2 = ListField([
        TextField([Token("Screen")], indexer),
        TextField([Token("Device")], indexer)
    ])
    instance_1 = Instance({"tokens": tokens_1, "targets": targets_1})
    instance_2 = Instance({"tokens": tokens_2, "targets": targets_2})
    a_batch = Batch([instance_1, instance_2])
    a_batch.index_instances(Vocabulary())
    batch_tensor = a_batch.as_tensor_dict()
    elmo_target_token_indices = batch_tensor["targets"]["elmo"]["elmo_tokens"]
    # The TextField that is empty should have been created using the
    # `get_empty_token_list` and then padded with zeros.
    empty_target = elmo_target_token_indices[0][1].numpy()
    np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
    non_empty_targets = [
        elmo_target_token_indices[0][0],
        elmo_target_token_indices[1][0],
        elmo_target_token_indices[1][1],
    ]
    for non_empty_target in non_empty_targets:
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target)
def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer",
                "model_name": "bert-base-uncased",
                "train_parameters": train_parameters,
                "last_layer_only": last_layer_only,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, False, False],
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences
    with encoded characters (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()["elmo"]["character_ids"]["tokens"]
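# Usage sketch for `batch_to_ids`: ELMo's character indexer encodes every token as
# exactly 50 character ids, and sentences are padded to the longest one in the
# batch, so two sentences of lengths 3 and 2 come back as a (2, 3, 50) tensor.
character_ids = batch_to_ids([["First", "sentence", "."], ["Another", "one"]])
assert character_ids.shape == (2, 3, 50)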
def test_end_to_end_t5(
    self,
    train_parameters: bool,
    last_layer_only: bool,
    gradient_checkpointing: bool,
):
    tokenizer = PretrainedTransformerTokenizer(model_name="patrickvonplaten/t5-tiny-random")
    token_indexer = PretrainedTransformerIndexer(model_name="patrickvonplaten/t5-tiny-random")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["▁Allen", "N", "LP", "▁is", "▁great", "</s>"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "patrickvonplaten/t5-tiny-random",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                    "gradient_checkpointing": gradient_checkpointing,
                    "sub_module": "encoder",
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, False, False],
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 8, 64)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    training_tensors = dataset.as_tensor_dict()
    output_dict = self.model(**training_tensors)
    probs = output_dict["class_probabilities"]
    assert probs.size() == (2, 7, self.model.vocab.get_vocab_size("labels"))
def convert_documents_to_batch(
        self, documents: List[Tuple[List[Token], List[Token]]],
        vocabulary) -> Dict[str, Any]:
    batch = Batch([self.convert_tokens_to_instance(tokens) for tokens in documents])
    batch.index_instances(vocabulary)
    batch = batch.as_tensor_dict()
    return batch["document"]
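# Hypothetical call pattern for the helper above (`processor` and the tokenized
# `documents` are assumptions): each element of `documents` is handed to
# `convert_tokens_to_instance`, and only the batched tensors of the "document"
# field come back.
#
#     document_tensors = processor.convert_documents_to_batch(documents, vocab)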
def test_long_sequence_splitting_end_to_end(self):
    # Mostly the same as the end-to-end test (except for adding max_length=4),
    # because we don't want this splitting behavior to change input/output format.
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)

    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer",
                "model_name": "bert-base-uncased",
                "max_length": 4,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    # Adds n_segments * 2 special tokens.
    segment_concat_length = int(math.ceil(max_length / 4)) * 2 + max_length
    assert tokens["bert"]["token_ids"].shape == (2, segment_concat_length)

    assert tokens["bert"]["mask"].tolist() == [
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
    ]
    assert tokens["bert"]["segment_concat_mask"].tolist() == [
        [1] * segment_concat_length,
        [1] * (segment_concat_length - 4) + [0] * 4,  # 4 is the hard-coded length difference
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
def transform_collate(
        vocab,  # use the vocab to index the transformed instances
        reader,  # call the reader's function to transform instances
        transform: Callable,
        instances: List[Instance]) -> TensorDict:
    new_instances = reader.transform_instances(transform, instances)
    batch = Batch(new_instances)
    batch.index_instances(vocab)
    ret = batch.as_tensor_dict(batch.get_padding_lengths())
    return ret
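# Sketch of how `transform_collate` is typically wired up: binding its first three
# arguments turns it into a collate_fn for a PyTorch DataLoader. The `reader` and
# `my_transform` names are assumptions; `reader` just needs the
# `transform_instances` method used above.
from functools import partial
from torch.utils.data import DataLoader

loader = DataLoader(
    instances,  # a list of (not yet indexed) Instances
    batch_size=32,
    collate_fn=partial(transform_collate, vocab, reader, my_transform),
)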
def test_as_tensor_dict(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    padding_lengths = dataset.get_padding_lengths()
    tensors = dataset.as_tensor_dict(padding_lengths)
    text1 = tensors["text1"]["tokens"]["tokens"].detach().cpu().numpy()
    text2 = tensors["text2"]["tokens"]["tokens"].detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(
        text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
    numpy.testing.assert_array_almost_equal(
        text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))
def test_end_to_end_for_first_sub_token_embedding(self, sub_token_mode: str):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]

    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer_mismatched",
                "model_name": "bert-base-uncased",
                "sub_token_mode": sub_token_mode,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, False],
        [True, True, True, True, True, True],
    ]  # Attention mask
    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [2, 2], [3, 5], [6, 6], [7, 7], [0, 0]],
        [[1, 3], [4, 4], [5, 5], [6, 6], [7, 8], [9, 9]],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
def test_padding_lengths_uses_max_instance_lengths(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    padding_lengths = dataset.get_padding_lengths()
    assert padding_lengths == {
        "text1": {"tokens___tokens": 5},
        "text2": {"tokens___tokens": 6},
    }
def test_end_to_end(self):
    tokenizer = BertPreTokenizer()

    # 2 3 4 3 5 6 8 9 2 14 12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)

    # 2 3 5 6 8 9 2 15 10 11 14 1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)

    vocab = Vocabulary()

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # 16 = [CLS], 17 = [SEP]
    assert tokens["input_ids"].tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
        [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
    ]
    assert tokens["offsets"].tolist() == [
        [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
    ]

    # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP]).
    bert_vectors = self.token_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]

    # Now try top_layer_only = True.
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    bert_vectors = tlo_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]
def get_gradients(
    self, instances: List[Instance]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Gets the gradients of the loss with respect to the model inputs.

    # Parameters

    instances : `List[Instance]`

    # Returns

    `Tuple[Dict[str, Any], Dict[str, Any]]`
        The first item is a Dict of gradient entries for each input.
        The keys have the form `{grad_input_1: ..., grad_input_2: ...}`
        up to the number of inputs given. The second item is the model's output.

    # Notes

    Takes a `JsonDict` representing the inputs of the model and converts
    them to [`Instances`](../data/instance.md), sends these through
    the model [`forward`](../models/model.md#forward) function after registering hooks on
    the embedding layer of the model. Calls `backward` on the loss and then
    removes the hooks.
    """
    embedding_gradients: List[Tensor] = []
    hooks: List[RemovableHandle] = self._register_embedding_gradient_hooks(embedding_gradients)

    dataset = Batch(instances)
    dataset.index_instances(self._model.vocab)
    dataset_tensor_dict = util.move_to_device(dataset.as_tensor_dict(), self.cuda_device)
    # To bypass "RuntimeError: cudnn RNN backward can only be called in training mode"
    with backends.cudnn.flags(enabled=False):
        outputs = self._model.make_output_human_readable(
            self._model.forward(**dataset_tensor_dict)  # type: ignore
        )

        loss = outputs["loss"]
        self._model.zero_grad()
        loss.backward()

    for hook in hooks:
        hook.remove()

    grad_dict = dict()
    for idx, grad in enumerate(embedding_gradients):
        key = "grad_input_" + str(idx + 1)
        grad_dict[key] = grad.detach().cpu().numpy()

    return grad_dict, outputs
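# Typical call, assuming `predictor` is an instance of the predictor class above and
# `instance` came from its dataset reader (both assumptions). One gradient array is
# returned per registered embedding hook, keyed "grad_input_1", "grad_input_2", ...
#
#     grads, outputs = predictor.get_gradients([instance])
#     print(grads["grad_input_1"].shape)  # (batch_size, sequence_length, embedding_dim)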
def forward_on_instances(
        self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result. Before returning the result, we convert any
    `torch.Tensors` into numpy arrays and separate the batched output into a list of
    individual dicts per instance. Note that typically this will be faster on a GPU
    (and conditionally, on a CPU) than repeated calls to :func:`forward_on_instance`.

    # Parameters

    instances : List[Instance], required
        The instances to run the model on.

    # Returns

    A list of the model's output for each instance.
    """
    batch_size = len(instances)
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.Tensor):
                # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                # This occurs with batch size 1, because we still want to include the loss in that case.
                if output.dim() == 0:
                    output = output.unsqueeze(0)
                if output.size(0) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                output = output.detach().cpu().numpy()
            elif len(output) != batch_size:
                self._maybe_warn_for_unseparable_batches(name)
                continue
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
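# Minimal sketch of the documented contract (`model` and the instances are
# assumptions): batched prediction returns one dict per instance, with every
# separable tensor already converted to a numpy array.
#
#     per_instance = model.forward_on_instances([instance_a, instance_b])
#     assert len(per_instance) == 2
#     print(sorted(per_instance[0].keys()))  # e.g. ["class_probabilities", "loss"]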
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    training_tensors = dataset.as_tensor_dict()
    output_dict = self.model(**training_tensors)
    tags = output_dict["tags"]
    assert len(tags) == 2
    assert len(tags[0]) == 7
    assert len(tags[1]) == 7
    for example_tags in tags:
        for tag_id in example_tags:
            tag = self.model.vocab.get_token_from_index(tag_id, namespace="labels")
            assert tag in {"O", "I-ORG", "I-PER", "I-LOC"}
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer, "tokens": indexer2})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]
def test_sliding_window(self):
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=False,
                                          max_pieces=8)

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    assert tokens["input_ids"].tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2, 14, 12, 17]
    ]
    assert tokens["offsets"].tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

    bert_vectors = token_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [1, 13, 12]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [1, 10, 12]

    # Testing with token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"],
                                  offsets=tokens["offsets"],
                                  token_type_ids=tokens["token_type_ids"])
    assert list(bert_vectors.shape) == [1, 10, 12]
def _sentences_to_ids(self, sentences):
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an Instance.
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()["elmo"]["character_ids"]["elmo_tokens"]
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # `os.listdir` never returns None, so check for a non-empty listing instead.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    instances = [
        instance
        for key, dataset in all_datasets.items()
        for instance in dataset
        if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    log_frozen_and_tunable_parameter_names(model)
def forward_on_instances(
        self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Omits the original method's complex and tedious per-output separation checks,
    since they could leave the model with no outputs at all. Note that the batched,
    decoded output dict is returned as-is rather than separated per instance.

    :param instances:
    :return:
    """
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
        outputs = self.decode(self(**model_input))
        return outputs
def instances_to_captum_inputs(self, labeled_instances):
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        batch = Batch(labeled_instances)
        batch.index_instances(self.vocab)
        model_input = util.move_to_device(batch.as_tensor_dict(), cuda_device)

        input_ids = model_input["tokens"]["tokens"]["token_ids"]
        label = model_input["label"]
        attention_mask = model_input["tokens"]["tokens"]["mask"]

        embedded_tokens = self.embeddings(input_ids)
        output_dict = {}
        output_dict["embedding"] = embedded_tokens

        return (embedded_tokens,), None, (attention_mask, label, output_dict)
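# Hedged sketch of how the tuple above plugs into Captum. `model` and
# `captum_forward` are assumptions: Captum's IntegratedGradients expects a forward
# function that takes the embedded tokens plus the extra args returned above.
from captum.attr import IntegratedGradients

inputs, baselines, additional = model.instances_to_captum_inputs(labeled_instances)
ig = IntegratedGradients(captum_forward)  # a wrapper around the model, assumed
attributions = ig.attribute(
    inputs,
    baselines=baselines,  # None here means Captum uses a zero baseline
    additional_forward_args=additional,
    target=0,  # class index to attribute; an arbitrary choice for the sketch
)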
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # `os.listdir` never returns None, so check for a non-empty listing instead.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    instances: Iterable[Instance] = (
        instance
        for key, dataset in all_datasets.items()
        if key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
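# Typical invocation (the config path and serialization directory are placeholders):
#
#     params = Params.from_file("experiment.jsonnet")
#     vocab = make_vocab_from_params(params, "serialization_dir/", print_statistics=True)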
def test_end_to_end_with_higher_order_inputs(self):
    tokenizer = BertPreTokenizer()

    # 2 3 4 3 5 6 8 9 2 14 12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)
    text_field1 = TextField(tokens1, {"bert": self.token_indexer})

    # 2 3 5 6 8 9 2 15 10 11 14 1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)
    text_field2 = TextField(tokens2, {"bert": self.token_indexer})

    # 2 5 15 10 11 6
    sentence3 = "the brown laziest fox"
    tokens3 = tokenizer.tokenize(sentence3)
    text_field3 = TextField(tokens3, {"bert": self.token_indexer})

    vocab = Vocabulary()

    instance1 = Instance({"tokens": ListField([text_field1])})
    instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
    tokens = tensor_dict["tokens"]["bert"]

    # No offsets, should get 14 vectors back ([CLS] + 12 wordpieces + [SEP]).
    bert_vectors = self.token_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 2, 14, 12]

    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 2, 10, 12]

    # Now try top_layer_only = True.
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["input_ids"])
    assert list(bert_vectors.shape) == [2, 2, 14, 12]

    bert_vectors = tlo_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert list(bert_vectors.shape) == [2, 2, 10, 12]
def preprocess(self, token_batch):
    seq_lens = [len(sequence) for sequence in token_batch if sequence]
    if not seq_lens:
        return []
    max_len = min(max(seq_lens), self.max_len)
    batches = []
    for indexer in self.indexers:
        batch = []
        for sequence in token_batch:
            tokens = sequence[:max_len]
            tokens = [Token(token) for token in ['$START'] + tokens]
            batch.append(Instance({'tokens': TextField(tokens, indexer)}))
        batch = Batch(batch)
        batch.index_instances(self.vocab)
        batches.append(batch)
    return batches
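# Hypothetical usage of `preprocess` (the `proc` object, with its `indexers`,
# `vocab`, and `max_len` attributes, is an assumption): one indexed Batch comes
# back per indexer, ready to be turned into tensors.
#
#     batches = proc.preprocess([["Hello", "world", "!"], ["Short"]])
#     for batch in batches:
#         tensors = batch.as_tensor_dict(batch.get_padding_lengths())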
class TestLSTMPointerForRewrite(TestCase):
    def setUp(self) -> None:
        super().setUp()
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
        dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        # Get the dataset reader.
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(dataset_file))
        # If vocabulary parameters are present, build the vocabulary from them.
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        # Load the model.
        self.model = Model.from_params(params=params["model"], vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))

    def test_model_can_train_save_and_load(self):
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        # Test train and save.
        model = train_model_from_file(self.param_file, save_dir)
        # Test load.
        loaded_model = load_archive(archive_file, cuda_device=-1).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # Make sure the state dicts (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
def setup_model(params_file, dataset_file):
    params = Params.from_file(params_file)

    reader = ToxicReader()
    instances = reader.read(str(dataset_file))

    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    vocab.save_to_files("new_vocab2")

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    print(dataset.as_tensor_dict())
def test_token_without_wordpieces(self):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", "", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "", "great"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]
    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer_mismatched",
                "model_name": "bert-base-uncased",
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
        [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
    # The empty-string tokens produce no wordpieces, so their vectors are all zeros.
    assert all(bert_vectors[0, 1] == 0)
    assert all(bert_vectors[1, 1] == 0)
def test_sliding_window_with_batch(self):
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=False,
                                          max_pieces=8)

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

    batch = Batch([instance, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert bert_vectors is not None

    # Testing with token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"],
                                  offsets=tokens["offsets"],
                                  token_type_ids=tokens["token_type_ids"])
    assert bert_vectors is not None