def test_forward_pass_runs_correctly(self):
    """
    Check to make sure a forward pass on an ensemble of two identical copies of a model yields the
    same results as the model itself.
    """
    bidaf_ensemble = BidafEnsemble([self.model, self.model])

    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()

    bidaf_output_dict = self.model(**training_tensors)
    ensemble_output_dict = bidaf_ensemble(**training_tensors)

    metrics = self.model.get_metrics(reset=True)

    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was an
    # error with using the evaluation script, this will fail.  This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official evaluation
    # script.
    assert metrics['f1'] > 0

    assert torch.equal(ensemble_output_dict['best_span'], bidaf_output_dict['best_span'])
    assert ensemble_output_dict['best_span_str'] == bidaf_output_dict['best_span_str']
def test_padding_for_equal_length_indices(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #            2   3     5     6    8      9    2   14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"].tolist() == [
            [2, 3, 5, 6, 8, 9, 2, 14, 12]
    ]

    assert tokens["bert-offsets"].tolist() == [
            [0, 1, 2, 3, 4, 5, 6, 7, 8]
    ]
def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any
    ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
    batched output into a list of individual dicts per instance.  Note that typically
    this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
    :func:`forward_on_instance`.
    """
    dataset = Batch(instances)
    dataset.index_instances(self.vocab)
    model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
    outputs = self.decode(self(**model_input))

    instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
    for name, output in list(outputs.items()):
        if isinstance(output, torch.autograd.Variable):
            output = output.data.cpu().numpy()
        outputs[name] = output
        for instance_output, batch_element in zip(instance_separated_output, output):
            instance_output[name] = batch_element
    return instance_separated_output
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
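A minimal usage sketch (added for illustration, not part of the original listing): assuming the `batch_to_ids` helper above and its AllenNLP/PyTorch imports are in scope, each word is encoded as ELMo's fixed 50-character id vector, so the result is shaped (batch size, longest sentence, 50).

sentences = [["the", "quick", "fox"], ["a", "dog"]]
character_ids = batch_to_ids(sentences)
# Shorter sentences are zero-padded up to the longest sentence in the batch.
print(character_ids.shape)  # expected: torch.Size([2, 3, 50])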
def test_as_tensor_dict(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    padding_lengths = dataset.get_padding_lengths()
    tensors = dataset.as_tensor_dict(padding_lengths)
    text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
    text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

    numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                [1, 3, 4, 5, 6]]))
    numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                [2, 3, 1, 0, 0, 0]]))
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    training_tensors = dataset.as_tensor_dict()
    output_dict = self.model(**training_tensors)
    tags = output_dict['tags']
    assert len(tags) == 2
    assert len(tags[0]) == 7
    assert len(tags[1]) == 7
    for example_tags in tags:
        for tag_id in example_tags:
            tag = self.model.vocab.get_token_from_index(tag_id, namespace="labels")
            assert tag in {'O', 'I-ORG', 'I-PER', 'I-LOC'}
def test_squad_with_unwordpieceable_passage(self):
    # pylint: disable=line-too-long
    tokenizer = WordTokenizer()

    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = ("Broca, being what today would be called a neurosurgeon, "
                "had taken an interest in the pathology of speech. He wanted "
                "to localize the difference between man and the other animals, "
                "which appeared to reside in speech. He discovered the speech "
                "center of the human brain, today called Broca's area after him. "
                "His interest was mainly in Biological anthropology, but a German "
                "philosopher specializing in psychology, Theodor Waitz, took up the "
                "theme of general and social anthropology in his six-volume work, "
                "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                """soon translated as "The Anthropology of Primitive Peoples". """
                "The last two volumes were published posthumously.")
    question2 = "What did Broca discover in the human brain?"

    from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

    instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                    tokenizer.tokenize(passage1),
                                                    {"bert": token_indexer},
                                                    passage1)

    instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                    tokenizer.tokenize(passage2),
                                                    {"bert": token_indexer},
                                                    passage2)

    vocab = Vocabulary()

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    qtokens = tensor_dict["question"]
    ptokens = tensor_dict["passage"]

    config = BertConfig(len(token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
    _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
def forward_on_instances(self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any ``torch.Tensors``
    into numpy arrays and separate the batched output into a list of individual dicts per
    instance.  Note that typically this will be faster on a GPU (and conditionally, on a CPU)
    than repeated calls to :func:`forward_on_instance`.

    Parameters
    ----------
    instances : List[Instance], required
        The instances to run the model on.  The prediction device is determined internally
        via :func:`self._get_prediction_device`.

    Returns
    -------
    A list of the model's output for each instance.
    """
    batch_size = len(instances)
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.Tensor):
                # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                # This occurs with batch size 1, because we still want to include the loss in that case.
                if output.dim() == 0:
                    output = output.unsqueeze(0)

                if output.size(0) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                output = output.detach().cpu().numpy()
            elif len(output) != batch_size:
                self._maybe_warn_for_unseparable_batches(name)
                continue
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # os.listdir() never returns None, so check for a non-empty listing directly.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
def _sentences_to_ids(self, sentences):
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance.
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
def test_end_to_end(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #             2   3  4   3     5     6    8      9    2   14   12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)

    #             2   3     5     6    8      9    2   15 10 11   14  1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)

    vocab = Vocabulary()

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"].tolist() == [
            [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 0],
            [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
    ]

    assert tokens["bert-offsets"].tolist() == [
            [0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]
    ]

    # No offsets, should get 12 vectors back.
    bert_vectors = self.token_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 12, 12]

    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]

    # Now try top_layer_only = True
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 12, 12]

    bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer,
                                   'tokens': indexer2})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    for instance_list in self._memory_sized_lists(instances):
        # organizing instances per question
        instances_question_id = [
            instance.fields['metadata'].metadata['question_id']
            for instance in instance_list
        ]
        split_inds = [0]
        for ind in range(len(instances_question_id) - 1):
            if instances_question_id[ind] != instances_question_id[ind + 1]:
                split_inds.append(ind + 1)
        split_inds += [len(instances_question_id)]
        per_question_instances = [
            instance_list[split_inds[ind]:split_inds[ind + 1]]
            for ind in range(len(split_inds) - 1)
        ]

        # added by Mingzhu, batch shuffle, each batch only contains examples from one dataset.
        batch_dict = {}
        for question_instances in per_question_instances:
            set_name = question_instances[0].fields["metadata"]["dataset"]
            batch_dict.setdefault(set_name, [])

            instances_to_add = question_instances
            batch_dict[set_name] += instances_to_add

            for name, batch in batch_dict.items():
                if len(batch) + len(instances_to_add) > self._batch_size and len(batch) > 0:
                    batch = sorted(batch,
                                   key=lambda x: x.fields['metadata'].metadata['question_id'])
                    yield Batch(batch)
                    batch_dict[name] = []

        # yielding remainder batch
        for name, batch in batch_dict.items():
            if len(batch) > 0:
                batch = sorted(batch,
                               key=lambda x: x.fields['metadata'].metadata['question_id'])
                yield Batch(batch)
def forward(self, inputs, elmo_lstm_output):
    texts = self.inputs_to_texts(inputs)
    instances = self.texts_to_instances(texts)
    dataset = Batch(instances)
    dataset.index_instances(self.model.vocab)
    dp_inputs = util.move_to_device(dataset.as_tensor_dict(), self.cuda_device)
    words, pos_tags = dp_inputs['words'], dp_inputs['pos_tags']
    mask = get_text_field_mask(words)

    layer_activations = elmo_lstm_output['activations']
    mask_with_bos_eos = elmo_lstm_output['mask']

    # compute the elmo representations
    representations = []
    for i in range(len(self._scalar_mixes)):
        scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
        representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
        if self._keep_sentence_boundaries:
            processed_representation = representation_with_bos_eos
            processed_mask = mask_with_bos_eos
        else:
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                representation_with_bos_eos, mask_with_bos_eos)
            processed_representation = representation_without_bos_eos
            processed_mask = mask_without_bos_eos
        representations.append(self._dropout(processed_representation))

    # reshape if necessary
    mask = processed_mask
    elmo_representations = representations

    embedded_text_input = elmo_representations[0]
    embedded_pos_tags = self.model._pos_tag_embedding(pos_tags)
    embedded_text_input = torch.cat([embedded_text_input, embedded_pos_tags], -1)
    encoded_text = self.model.encoder(embedded_text_input, mask)

    return encoded_text.detach()
def setUp(self):
    self.token_lookup = {
        'Entity1': [['Robert', 'Logan'], ['Robby']],
        'Entity2': [['Jimmy']]
    }
    self.id_map_lookup = {
        'Entity1': {
            'Robert': 1,
            'Logan': 2,
            'Robby': 3
        },
        'Entity2': {
            'Jimmy': 1
        }
    }
    self.id_array_lookup = {
        'Entity1': np.array([[1, 2], [3, 0]], dtype=int),
        'Entity2': np.array([[1]], dtype=int)
    }
    self.token_to_entity_lookup = {
        'Robert': {'Entity1'},
        'Logan': {'Entity1'},
        'Robby': {'Entity1'},
        'Jimmy': {'Entity2'}
    }
    token_indexer = SingleIdTokenIndexer()
    entity_indexer = SingleIdTokenIndexer(namespace='entity_ids')
    text_field = TextField(
        [Token(t) for t in ['Robby', 'is', 'a', 'nickname', 'for', 'Robert']],
        {'tokens': token_indexer})
    entity_field = TextField(
        [Token(t) for t in ['Entity1', '', '', '', '', 'Entity1']],
        {'entity_ids': entity_indexer})
    self.instance = Instance({
        'tokens': text_field,
        'entity_identifiers': entity_field
    })
    self.dataset = Batch([self.instance])
    self.vocab = Vocabulary.from_instances(self.dataset)
    self.dataset.index_instances(self.vocab)
    super(AliasDatabaseTest, self).setUp()
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)
    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({
        "text1": text_field1,
        "text2": text_field2
    })])

    # Following 2 should give error: tokens1 is non-padded in original_vocab but not in instances
    params = Params({
        "directory_path": vocab_dir,
        "extend": True,
        "non_padded_namespaces": []
    })
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)

    # Following 2 should not give error: overlapping namespaces have same padding setting
    params = Params({
        "directory_path": vocab_dir,
        "extend": True,
        "non_padded_namespaces": ["tokens1"]
    })
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)

    # Following 2 should give error: tokens2 is padded in original_vocab but non-padded in the extension
    params = Params({
        "directory_path": vocab_dir,
        "extend": True,
        "non_padded_namespaces": ["tokens1", "tokens2"]
    })
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
class DialogQATest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model(self.FIXTURES_ROOT / 'dialog_qa' / 'experiment.json',
                          self.FIXTURES_ROOT / 'data' / 'quac_sample.json')
        self.batch = Batch(self.instances)
        self.batch.index_instances(self.vocab)

    def test_forward_pass_runs_correctly(self):
        training_tensors = self.batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)
        assert "best_span_str" in output_dict and "loss" in output_dict
        assert "followup" in output_dict and "yesno" in output_dict

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file, tolerance=1e-4)

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    # First break the dataset into memory-sized lists:
    for instance_list in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(instance_list)
        iterator = iter(instance_list)
        # Then break each memory-sized list into batches.
        for batch_instances in lazy_groups_of(iterator, self._batch_size * 30):
            yield Batch(batch_instances)
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    if self.counter is None:
        self.build_counter(instances)

    # First break the dataset into memory-sized lists:
    for instance_list in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(instance_list)
        iterator = iter(instance_list)
        excess: Deque[Instance] = deque()
        # Then break each memory-sized list into batches.
        for batch_instances in lazy_groups_of(iterator, self._batch_size):
            batch_instances = self.modify_batch_instances(batch_instances)
            for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                    batch_instances, excess):
                batch = Batch(possibly_smaller_batches)
                yield batch
        if excess:
            yield Batch(excess)
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    As you can see, we don't shuffle our objects here.
    """
    # First break the dataset into memory-sized lists:
    for instance_list in self._memory_sized_lists(instances):
        iterator = iter(instance_list)
        # Then break each memory-sized list into batches.
        for batch_instances in lazy_groups_of(iterator, self._batch_size):
            yield Batch(batch_instances)
def test_max_length(self):
    config = BertConfig(len(self.token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the " * 1000
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    embedder(tokens["bert"], tokens["bert-offsets"])
def predict_instance(self, instances: Tuple[Instance, Instance], num_samples: int = 100) -> JsonDict:
    conditioning_instance, generative_instance = instances

    self._model.eval()

    with torch.no_grad():
        # TODO: Make this a parameter somewhere
        num_samples = num_samples

        # Duplicate instances (to sample in parallel)
        cuda_device = self._model._get_prediction_device()

        conditioning_batch = Batch([conditioning_instance] * num_samples)
        conditioning_batch.index_instances(self._model.vocab)
        conditioning_batch = util.move_to_device(conditioning_batch.as_tensor_dict(), cuda_device)

        generative_batch = Batch([generative_instance] * num_samples)
        generative_batch.index_instances(self._model.vocab)
        generative_batch = util.move_to_device(generative_batch.as_tensor_dict(), cuda_device)

        # Sample annotations and generate next token
        self._model._use_shortlist = True
        conditioning_output = self._model.sample(**conditioning_batch, emit_tokens=False)
        logger.debug('clears condition generation')
        # self._model(**conditioning_output)  # Shouldn't need to do this, but just in case
        # logger.debug('clears reconditioning')
        generative_output = self._model.sample(**generative_batch, emit_tokens=True)
        logger.debug('clears generation')
        del conditioning_batch, generative_batch

        aggregate_word_probs = self._aggregate_word_probs(generative_output)
        logger.debug('clears word probs')

        return aggregate_word_probs
def predict(model, dataset_reader, test_file, output_file, cuda_device):
    gold_test_data = load_json(test_file)
    instances = dataset_reader.read(test_file)
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    iterator = BatchIterator()
    iterator._batch_size = 5

    # For long documents, loop over batches of sentences. Keep track of the
    # total length and append onto the end of the predictions for each sentence
    # batch.
    assert len(gold_test_data) == 1
    gold_data = gold_test_data[0]

    predictions = {}
    total_length = 0

    for sents in iterator(batch.instances, num_epochs=1, shuffle=False):
        sents = nn_util.move_to_device(sents, cuda_device)  # Put on GPU.
        sentence_lengths = [len(entry["sentence"]) for entry in sents["metadata"]]
        sentence_starts = np.cumsum(sentence_lengths) + total_length
        sentence_starts = np.roll(sentence_starts, 1)
        sentence_starts[0] = total_length
        pred = model(**sents)
        decoded = model.decode(pred)
        if total_length == 0:
            for k, v in decoded.items():
                predictions[decode_names[k]] = cleanup(k, v[decode_fields[k]], sentence_starts)
        else:
            for k, v in decoded.items():
                predictions[decode_names[k]] += cleanup(k, v[decode_fields[k]], sentence_starts)
        total_length += sum(sentence_lengths)

    res = {}
    res.update(gold_data)
    res.update(predictions)
    check_lengths(res)
    encoded = json.dumps(res, default=int)
    with open(output_file, "w") as f:
        f.write(encoded + "\n")
def forward_on_instances(self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any ``torch.Tensors``
    into numpy arrays and separate the batched output into a list of individual dicts per
    instance.  Note that typically this will be faster on a GPU (and conditionally, on a CPU)
    than repeated calls to :func:`forward_on_instance`.

    Parameters
    ----------
    instances : List[Instance], required
        The instances to run the model on.  The prediction device is determined internally
        via :func:`self._get_prediction_device`.

    Returns
    -------
    A list of the model's output for each instance.
    """
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.Tensor):
                output = output.detach().cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
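For context, a hedged usage sketch of `forward_on_instances` (illustrative only; `model` and `reader` are hypothetical stand-ins for a trained AllenNLP `Model` and its matching `DatasetReader`, and "validation.json" is a placeholder path): callers pass plain `Instance` objects and the method handles batching, indexing, device placement, and splitting the batched output back into per-instance dicts.

# Hypothetical names: `model` is a trained Model, `reader` a matching DatasetReader.
instances = list(reader.read("validation.json"))[:4]
per_instance_outputs = model.forward_on_instances(instances)
assert len(per_instance_outputs) == len(instances)
for output in per_instance_outputs:
    print(sorted(output.keys()))  # each entry is a dict of numpy arrays for one instance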
def _create_batches(self, instances, shuffle):
    # First break the dataset into memory-sized lists:
    for instance_list in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(instance_list)
        iterator = iter(instance_list)
        # Then break each memory-sized list into batches.
        for batch_instances in lazy_groups_of(iterator, self._batch_size):
            for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
                batch = Batch(possibly_smaller_batches)
                yield batch
def prepare_batch(tokens_batch, vocab, indexers, args):
    """ Do preprocessing for batch """
    instance_ls = []
    token_ls = []
    for tokens in tokens_batch:
        field = sentence_to_text_field(tokens, indexers)
        field.index(vocab)
        instance_ls.append(Instance({"input1": field}))
        token_ls.append(tokens)
    batch = Batch(instance_ls).as_tensor_dict()
    batch = move_to_device(batch, args.cuda)
    return batch, token_ls
def read_squad_word_char(file_path):
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")
    }
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    # print(word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
def remove_tokens(self, attentions, metadata, threshold, labels):
    attentions_cpu = attentions.cpu().data.numpy()
    sentences = [x["tokens"] for x in metadata]
    instances = []
    for b in range(attentions_cpu.shape[0]):
        sentence = [x for x in sentences[b]]
        always_keep_mask = metadata[b]['always_keep_mask']
        attn = attentions_cpu[b][:len(sentence)] + always_keep_mask * -10000
        max_length = math.ceil((1 - always_keep_mask).sum() * threshold)
        top_ind = np.argsort(attn)[:-max_length]
        new_tokens = [
            x for i, x in enumerate(sentence)
            if i in top_ind or always_keep_mask[i] == 1
        ]
        instances += metadata[0]["convert_tokens_to_instance"](new_tokens, None)

    batch = Batch(instances)
    batch.index_instances(self._vocabulary)
    padding_lengths = batch.get_padding_lengths()
    batch = batch.as_tensor_dict(padding_lengths)
    return {k: v.to(attentions.device) for k, v in batch["document"].items()}
def test_embeddings(self, transformer_name, gold_offsets: torch.LongTensor, use_starting_offsets):
    self.token_indexer = TransformerIndexer(model_name=transformer_name,
                                            do_lowercase=False,
                                            use_starting_offsets=use_starting_offsets)
    self.transformer_embedder = TransformerEmbedder(model_name=transformer_name, trainable=False)

    sent0 = "the quickest quick brown fox jumped over the lazy dog"
    sent1 = "the quick brown fox jumped over the laziest lazy elmo"

    tokens0 = sent0.split()
    tokens1 = sent1.split()
    tokens0 = [Token(token) for token in tokens0]
    tokens1 = [Token(token) for token in tokens1]
    vocab = Vocabulary()

    instance0 = Instance({"tokens": TextField(tokens0, {"transformer": self.token_indexer})})
    instance1 = Instance({"tokens": TextField(tokens1, {"transformer": self.token_indexer})})

    batch = Batch([instance0, instance1])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    input_ids = tokens['transformer']
    offsets = tokens['transformer-offsets']
    transformer_mask = tokens['transformer-mask']
    test_select_embeddings = self.transformer_embedder(input_ids, offsets, transformer_mask)

    transformer_vectors = self.transformer_embedder(token_ids=input_ids, mask=transformer_mask)
    gold_select_embeddings = get_select_embedding(transformer_vectors, gold_offsets)

    assert gold_select_embeddings.equal(test_select_embeddings)
def test_encode_decode_with_raw_text_base(self, transformer_name):
    token_indexer = TransformerIndexer(model_name=transformer_name, do_lowercase=False)

    sent0 = "the quickest quick brown fox jumped over the lazy dog"
    sent1 = "the quick brown fox jumped over the laziest lazy elmo"
    vocab = Vocabulary()

    instance1 = Instance({"tokens": TextField([Token(sent0)], {"transformer": token_indexer})})
    instance2 = Instance({"tokens": TextField([Token(sent1)], {"transformer": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    input_ids = tokens['transformer']
    input_ids_0 = [id.item() for id in input_ids[0]]
    input_ids_1 = [id.item() for id in input_ids[1]]

    # The original sentences should round-trip: decoding the indexed ids must reproduce them.
    assert sent0 == token_indexer.tokenizer.decode(input_ids_0, skip_special_tokens=True)
    assert sent1 == token_indexer.tokenizer.decode(input_ids_1, skip_special_tokens=True)
def instances_to_batch(instances, model, for_training, cuda_device=0):
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    padding_lengths = batch.get_padding_lengths()
    return batch.as_tensor_dict(padding_lengths,
                                cuda_device=cuda_device,
                                for_training=for_training)
def forward_on_instance(self, instance: SyncedFieldsInstance) -> Dict[str, str]:
    """
    Takes an :class:`~allennlp.data.instance.Instance`, which typically has raw text in it,
    converts that text into arrays using this model's :class:`Vocabulary`, passes those arrays
    through :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any ``torch.Tensors`` into
    numpy arrays and remove the batch dimension.
    """
    cuda_device = self._get_prediction_device()
    dataset = Batch([instance])
    dataset.index_instances(self.vocab)

    gt_has_oov = False
    dataset_tensor_dict = dataset.as_tensor_dict()
    if self.OOV_ID in dataset_tensor_dict["target_tokens"]["ids_with_unks"]:
        gt_has_oov = True

    model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
    output_ids = self.beam_search_decode(**model_input)

    output_words = []
    for _id in output_ids:
        if _id < self.vocab_size:
            output_words.append(self.vocab.get_token_from_index(_id))
        else:
            output_words.append(instance.oov_list[_id - self.vocab_size])

    assert output_words[0] == START_SYMBOL, "somehow the first symbol is not the START symbol. might be a bug"
    output_words = output_words[1:]
    if output_words[-1] == END_SYMBOL:
        output_words = output_words[:-1]

    return " ".join(output_words)
def test_offsets_with_tokenized_text_base(self, transformer_name):
    token_indexer = TransformerIndexer(model_name=transformer_name, do_lowercase=False)

    sent0 = "the quickest quick brown fox jumped over the lazy dog"
    sent1 = "the quick brown fox jumped over the laziest lazy elmo"
    sent0 = sent0.split()
    sent1 = sent1.split()
    tokens0 = [Token(token) for token in sent0]
    tokens1 = [Token(token) for token in sent1]
    vocab = Vocabulary()

    instance1 = Instance({"tokens": TextField(tokens0, {"transformer": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens1, {"transformer": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    # Each token should be represented by exactly one sub-word piece, so there is one offset per token.
    assert len(tokens['transformer-offsets'][0]) == len(tokens0)
    assert len(tokens['transformer-offsets'][1]) == len(tokens1)
def test_padding_for_equal_length_indices(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #            2   3     5     6    8      9    2   14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"].tolist() == [
            [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
    ]

    assert tokens["bert-offsets"].tolist() == [
            [1, 2, 3, 4, 5, 6, 7, 8, 9]
    ]
def forward_on_instances(self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any ``torch.Tensors``
    into numpy arrays and separate the batched output into a list of individual dicts per
    instance.  Note that typically this will be faster on a GPU (and conditionally, on a CPU)
    than repeated calls to :func:`forward_on_instance`.

    Parameters
    ----------
    instances : List[Instance], required
        The instances to run the model on.  The prediction device is determined internally
        via :func:`self._get_prediction_device`.

    Returns
    -------
    A list of the model's output for each instance.
    """
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
        outputs = self.decode(self(**model_input))

        instance_separated_output = []
        metadata = [x.fields["metadata"].metadata for x in dataset.instances]
        for res in export_output_data_arc_multi_choice_json(metadata, outputs):
            instance_separated_output.append(res)

        return instance_separated_output
def read_squad_allennlp(file_path):
    '''read data, build vocab, batch, padding, to idx
    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    for instance in instances:
        question = instance.fields['question']
        print(question)
        print(type(question))
        break
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    # print(word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
def predict(instances: List[Instance]) -> List[float]:
    """Output BERT NSP next sentence probability for a list of instances.

    Parameters
    ----------
    instances : List[Instance]

    Returns
    -------
    List[float]
        BERT NSP scores in range [0, 1].
    """
    scores = []
    for batch_instance in tqdm(batch(instances, batch_size=args.batch_size),
                               total=math.ceil(len(instances) / args.batch_size),
                               desc='Predicting'):
        batch_ins = Batch(batch_instance)
        batch_ins.index_instances(VOCAB)
        tensor_dict = batch_ins.as_tensor_dict(batch_ins.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        input_ids = tokens['bert'].to(torch.device(f'cuda:{GPU_ID}'))
        token_type_ids = tokens['bert-type-ids'].to(torch.device(f'cuda:{GPU_ID}'))
        input_mask = (input_ids != 0).long()
        cls_out = BERT_NEXT_SENTENCE.forward(input_ids=input_ids,
                                             token_type_ids=token_type_ids,
                                             attention_mask=input_mask)
        probs = F.softmax(cls_out, dim=-1)
        next_sentence_score = probs[:, 0].detach().cpu().numpy().tolist()
        scores += next_sentence_score
    return scores
def test_read(self, lazy):
    reader = GLUESST2DatasetReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={'bert': PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)},
        skip_label_indexing=False
    )
    instances = reader.read(str(self.FIXTURES_ROOT / 'dev.tsv'))
    instances = ensure_list(instances)

    example = instances[0]
    tokens = [t.text for t in example.fields['tokens']]
    label = example.fields['label'].label
    print(label)
    print(tokens)

    batch = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    print(tokens['mask'].tolist()[0])
    print(tokens["bert"].tolist()[0])
    print([vocab.get_token_from_index(i, "bert") for i in tokens["bert"].tolist()[0]])
    print(len(tokens['bert'][0]))
    print(tokens["bert-offsets"].tolist()[0])
    print(tokens['bert-type-ids'].tolist()[0])
def predict(archive_file, test_file, output_file, cuda_device, score_dir):
    import_submodules("dygie")
    gold_test_data = load_json(test_file)
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    instances = dataset_reader.read(test_file)

    batch = Batch(instances)
    batch.index_instances(model.vocab)

    iterator = DocumentIterator()
    with open(output_file, "w") as f:
        for doc, gold_data in zip(iterator(batch.instances, num_epochs=1, shuffle=False),
                                  gold_test_data):
            doc = nn_util.move_to_device(doc, cuda_device)  # Put on GPU.
            sentence_lengths = [len(entry["sentence"]) for entry in doc["metadata"]]
            sentence_starts = np.cumsum(sentence_lengths)
            sentence_starts = np.roll(sentence_starts, 1)
            sentence_starts[0] = 0
            pred = model(**doc)
            if score_dir is not None:
                dump_scores(doc, pred, score_dir)
            decoded = model.decode(pred)
            predictions = {}
            for k, v in decoded.items():
                predictions[decode_names[k]] = cleanup(k, v[decode_fields[k]], sentence_starts)
            res = {}
            res.update(gold_data)
            res.update(predictions)
            if "dataset" in res:
                del res["dataset"]
            check_lengths(res)
            encoded = json.dumps(res, default=int)
            f.write(encoded + "\n")
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    for instance_list in self._memory_sized_lists(instances):
        instance_list = [
            ins for ins in instance_list
            if random.random() < ins['metadata'].metadata['keep_prob']
        ]
        if len(self._sorting_keys) == 0:
            instance_list = sort_by_padding(instance_list,
                                            self._sorting_keys,
                                            self.vocab,
                                            self._padding_noise)
        batches = []
        excess: Deque[Instance] = deque()
        for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
            for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                    batch_instances, excess):
                batches.append(Batch(possibly_smaller_batches))
        if excess:
            batches.append(Batch(excess))

        # TODO(brendanr): Add multi-GPU friendly grouping, i.e. group
        # num_gpu batches together, shuffle and then expand the groups.
        # This guards against imbalanced batches across GPUs.
        move_to_front = self._biggest_batch_first and len(batches) > 1
        if move_to_front:
            # We'll actually pop the last _two_ batches, because the last one might not be full.
            last_batch = batches.pop()
            penultimate_batch = batches.pop()
        if shuffle:
            # NOTE: if shuffle is false, the data will still be in a different order
            # because of the bucket sorting.
            random.shuffle(batches)
        if move_to_front:
            batches.insert(0, penultimate_batch)
            batches.insert(0, last_batch)

        yield from batches
def ensure_batch_predictions_are_consistent(self):
    self.model.eval()
    single_predictions = []
    for i, instance in enumerate(self.instances):
        dataset = Batch([instance])
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
        result = self.model(**tensors)
        single_predictions.append(result)
    full_dataset = Batch(self.instances)
    batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
    batch_predictions = self.model(**batch_tensors)
    for i, instance_predictions in enumerate(single_predictions):
        for key, single_predicted in instance_predictions.items():
            tolerance = 1e-6
            if key == 'loss':
                # Loss is particularly unstable; we'll just be satisfied if everything else is
                # close.
                continue
            single_predicted = single_predicted[0]
            batch_predicted = batch_predictions[key][i]
            if isinstance(single_predicted, torch.autograd.Variable):
                if single_predicted.size() != batch_predicted.size():
                    slices = tuple(slice(0, size) for size in single_predicted.size())
                    batch_predicted = batch_predicted[slices]
                assert_allclose(single_predicted.data.numpy(),
                                batch_predicted.data.numpy(),
                                atol=tolerance,
                                err_msg=key)
            else:
                assert single_predicted == batch_predicted, key
def test_sliding_window_with_batch(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

    config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
    config = BertConfig(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

    batch = Batch([instance, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert bert_vectors is not None
def _regenerate_tokens(self, metadata, sample_z):
    sample_z_cpu = sample_z.cpu().data.numpy()
    tokens = [m["tokens"] for m in metadata]

    assert len(tokens) == len(sample_z_cpu)
    assert max([len(x) for x in tokens]) == sample_z_cpu.shape[1]

    instances = []
    new_tokens = []
    for words, mask, meta in zip(tokens, sample_z_cpu, metadata):
        mask = mask[:len(words)]
        new_words = [w for i, (w, m) in enumerate(zip(words, mask)) if i == 0 or m == 1]
        new_tokens.append(new_words)
        meta["new_tokens"] = new_tokens
        instance = metadata[0]["convert_tokens_to_instance"](new_words, None)
        instances += instance

    batch = Batch(instances)
    batch.index_instances(self._vocabulary)
    padding_lengths = batch.get_padding_lengths()

    batch = batch.as_tensor_dict(padding_lengths)
    return {k: v.to(sample_z.device) for k, v in batch["document"].items()}
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    instances = ensure_list(instances)
    instances_len = len(instances)
    num_batches = math.floor(instances_len / self._batch_size)
    # want all batches to be the same size
    stop = instances_len - instances_len % self._batch_size
    for batch_ind in range(num_batches):
        yield Batch(instances[batch_ind:stop:num_batches])
def test_forward_pass_runs_correctly(self):
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()
    output_dict = self.model(**training_tensors)

    metrics = self.model.get_metrics(reset=True)
    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was an
    # error with using the evaluation script, this will fail.  This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official evaluation
    # script.
    assert metrics['f1'] > 0

    span_start_probs = output_dict['span_start_probs'][0].data.numpy()
    span_end_probs = output_dict['span_end_probs'][0].data.numpy()
    assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
    assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
    span_start, span_end = tuple(output_dict['best_span'][0].data.numpy())
    assert span_start >= 0
    assert span_start <= span_end
    assert span_end < self.instances[0].fields['passage'].sequence_length()
    assert isinstance(output_dict['best_span_str'][0], str)
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    instances = reader.read(dataset_file)
    vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(self.vocab, params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    instances = reader.read(dataset_file)
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def ensure_batch_predictions_are_consistent(self, keys_to_ignore: Iterable[str] = ()):
    """
    Ensures that the model performs the same on a batch of instances as on individual instances.
    Ignores metrics matching the regexp .*loss.* and those specified explicitly.

    Parameters
    ----------
    keys_to_ignore : ``Iterable[str]``, optional (default=())
        Names of metrics that should not be taken into account, e.g. "batch_weight".
    """
    self.model.eval()
    single_predictions = []
    for i, instance in enumerate(self.instances):
        dataset = Batch([instance])
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
        result = self.model(**tensors)
        single_predictions.append(result)
    full_dataset = Batch(self.instances)
    batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
    batch_predictions = self.model(**batch_tensors)
    for i, instance_predictions in enumerate(single_predictions):
        for key, single_predicted in instance_predictions.items():
            tolerance = 1e-6
            if 'loss' in key:
                # Loss is particularly unstable; we'll just be satisfied if everything else is
                # close.
                continue
            if key in keys_to_ignore:
                continue
            single_predicted = single_predicted[0]
            batch_predicted = batch_predictions[key][i]
            if isinstance(single_predicted, torch.Tensor):
                if single_predicted.size() != batch_predicted.size():
                    slices = tuple(slice(0, size) for size in single_predicted.size())
                    batch_predicted = batch_predicted[slices]
                assert_allclose(single_predicted.data.numpy(),
                                batch_predicted.data.numpy(),
                                atol=tolerance,
                                err_msg=key)
            else:
                assert single_predicted == batch_predicted, key
class ModelTestCase(AllenNlpTestCase):
    """
    A subclass of :class:`~allennlp.common.testing.test_case.AllenNlpTestCase`
    with added methods for testing :class:`~allennlp.models.model.Model` subclasses.
    """
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = list(reader.read(dataset_file))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab, params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)

    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1,
                                             gradients_to_ignore: Set[str] = None,
                                             overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model

    def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6) -> None:
        if isinstance(field1, torch.Tensor):
            assert_allclose(field1.detach().cpu().numpy(),
                            field2.detach().cpu().numpy(),
                            rtol=tolerance,
                            err_msg=name)
        elif isinstance(field1, dict):
            assert field1.keys() == field2.keys()
            for key in field1:
                self.assert_fields_equal(field1[key],
                                         field2[key],
                                         tolerance=tolerance,
                                         name=name + '.' + str(key))
        elif isinstance(field1, (list, tuple)):
            assert len(field1) == len(field2)
            for i, (subfield1, subfield2) in enumerate(zip(field1, field2)):
                self.assert_fields_equal(subfield1,
                                         subfield2,
                                         tolerance=tolerance,
                                         name=name + f"[{i}]")
        elif isinstance(field1, (float, int)):
            assert_allclose([field1], [field2], rtol=tolerance, err_msg=name)
        else:
            if field1 != field2:
                for key in field1.__dict__:
                    print(key, getattr(field1, key) == getattr(field2, key))
            assert field1 == field2, f"{name}, {type(field1)}, {type(field2)}"

    @staticmethod
    def check_model_computes_gradients_correctly(model: Model,
                                                 model_batch: Dict[str, Union[Any, Dict[str, Any]]],
                                                 params_to_ignore: Set[str] = None):
        print("Checking gradients")
        model.zero_grad()
        result = model(**model_batch)
        result["loss"].backward()
        has_zero_or_none_grads = {}
        for name, parameter in model.named_parameters():
            zeros = torch.zeros(parameter.size())
            if params_to_ignore and name in params_to_ignore:
                continue
            if parameter.requires_grad:
                if parameter.grad is None:
                    has_zero_or_none_grads[name] = "No gradient computed (i.e parameter.grad is None)"
                elif parameter.grad.is_sparse or parameter.grad.data.is_sparse:
                    pass
                # Some parameters will only be partially updated,
                # like embeddings, so we just check that any gradient is non-zero.
                elif (parameter.grad.cpu() == zeros).all():
                    has_zero_or_none_grads[name] = f"zeros with shape ({tuple(parameter.grad.size())})"
            else:
                assert parameter.grad is None

        if has_zero_or_none_grads:
            for name, grad in has_zero_or_none_grads.items():
                print(f"Parameter: {name} had incorrect gradient: {grad}")
            raise Exception("Incorrect gradients found. See stdout for more info.")

    def ensure_batch_predictions_are_consistent(self):
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if 'loss' in key:
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.Tensor):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
    if create_video_training:
        pf.create_image_weights_epoch(model, video_fotograms_folder2, i)
        pf.create_Bayesian_analysis_charts_simplified(model, train_dataset, validation_dataset,
                                                      tr_data_loss, val_data_loss, KL_loss,
                                                      video_fotograms_folder4, i+1)

# output = model(tensor_dict["text_field"], tensor_dict["tags_field"])
# loss = output["loss"]
# We can get the loss (and gradients) because we passed the labels as input.

"""
############## Use the trained model ######################
We run the trained model on a new example, preprocessing it by hand in the
same way the training data was preprocessed.
"""
name_example = "Eat my motherfucking jeans"
name_example = "Carlos Sanchez"
tokens_list = [name_example[i] for i in range(len(name_example))]

Instance_test = reader.generate_instance(tokens_list, None)
batch = Batch([Instance_test])
batch.index_instances(vocab)
padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)

model.eval()
tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids])
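# A small helper that wraps the prediction steps above into one reusable function.
# This is only a sketch based on the snippet above; `predict_tags` is a hypothetical name,
# and `reader`, `vocab` and the 'tags_country' namespace are assumed to be the same
# objects used there.
def predict_tags(model, reader, vocab, text):
    tokens_list = [text[i] for i in range(len(text))]   # character-level tokens
    instance = reader.generate_instance(tokens_list, None)
    batch = Batch([instance])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
    model.eval()
    tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
    tag_ids = np.argmax(tag_logits, axis=-1)
    return [model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids]

# Example usage:
# print(predict_tags(model, reader, vocab, "Carlos Sanchez"))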
    return field

# Create the instances with the ELMo field
instances = []
for sentence in sentences:
    # Tokenize every word and build the ELMo text field.
    field = get_ELMO_text_field(sentence, indexer, tokenizer)
    instance = Instance({"elmo": field})
    print("Fields in instance: ", instance.fields)
    instances.append(instance)

### Create a batch of the instances
dataset = Batch(instances)

# Create an empty vocabulary. We do not need to build one from the dataset;
# the character indexer handles the token-to-id mapping on its own.
vocab = Vocabulary()

# Index the instances in the batch; ELMo will use these indices later.
dataset.index_instances(vocab)

"""
IMPORTANT: ELMo only needs a character vocabulary at its interface; it computes
everything else internally. Each ELMo word is padded to 50 character ids.
"""
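# A quick sanity check of the claim above: the character-id tensor should have shape
# (batch_size, max_sentence_length, 50), since every ELMo word is padded to 50 character ids.
# This assumes the indexer in get_ELMO_text_field was registered under the 'character_ids'
# key, as in the other ELMo snippets in this document.
character_ids = dataset.as_tensor_dict()['elmo']['character_ids']
print("Character ids shape: ", character_ids.shape)   # (n_sentences, longest_sentence, 50)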
class ModelTestCase(AllenNlpTestCase):
    """
    A subclass of :class:`~allennlp.common.testing.test_case.AllenNlpTestCase`
    with added methods for testing :class:`~allennlp.models.model.Model` subclasses.
    """
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(self.vocab, params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)

    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dicts (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate the params because DataIterator.from_params will consume them.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()

        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check that the loaded model's loss exists and that we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model

    def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6) -> None:
        if isinstance(field1, torch.autograd.Variable):
            assert_allclose(field1.data.cpu().numpy(),
                            field2.data.cpu().numpy(),
                            rtol=tolerance, err_msg=name)
        elif isinstance(field1, dict):
            assert field1.keys() == field2.keys()
            for key in field1:
                self.assert_fields_equal(field1[key], field2[key],
                                         tolerance=tolerance, name=name + '.' + key)
        elif isinstance(field1, (list, tuple)):
            assert len(field1) == len(field2)
            for i, (subfield1, subfield2) in enumerate(zip(field1, field2)):
                self.assert_fields_equal(subfield1, subfield2,
                                         tolerance=tolerance, name=name + f"[{i}]")
        else:
            assert field1 == field2

    @staticmethod
    def check_model_computes_gradients_correctly(model, model_batch):
        model.zero_grad()
        result = model(**model_batch)
        result["loss"].backward()
        has_zero_or_none_grads = {}
        for name, parameter in model.named_parameters():
            zeros = torch.zeros(parameter.size())
            if parameter.requires_grad:
                if parameter.grad is None:
                    has_zero_or_none_grads[name] = "No gradient computed (i.e. parameter.grad is None)"
                # Some parameters will only be partially updated,
                # like embeddings, so we just check that any gradient is non-zero.
                elif (parameter.grad.data.cpu() == zeros).all():
                    has_zero_or_none_grads[name] = f"zeros with shape ({tuple(parameter.grad.size())})"
            else:
                assert parameter.grad is None

        if has_zero_or_none_grads:
            for name, grad in has_zero_or_none_grads.items():
                print(f"Parameter: {name} had incorrect gradient: {grad}")
            raise Exception("Incorrect gradients found. See stdout for more info.")

    def ensure_batch_predictions_are_consistent(self):
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if key == 'loss':
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.autograd.Variable):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance, err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
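# The slicing trick used in ensure_batch_predictions_are_consistent, shown on its own:
# a batched prediction is padded to the longest instance in the batch, so it is cut back
# down to the single-instance size before comparing. Toy tensors only; this is a sketch,
# not tied to any particular model.
import torch

single_predicted = torch.rand(3, 4)          # prediction for one instance on its own
batch_predicted = torch.rand(5, 6)           # the same instance inside a padded batch
slices = tuple(slice(0, size) for size in single_predicted.size())
trimmed = batch_predicted[slices]            # shape (3, 4): the padded positions are dropped
assert trimmed.size() == single_predicted.size()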
    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}
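    # A companion sketch (not part of the original test): the padding lengths computed by
    # get_padding_lengths are what as_tensor_dict uses to pad each field, so the resulting
    # tensors take the maximum length over the batch. The inner "tokens" key assumes the
    # fields were indexed with a single-id "tokens" indexer, matching the assertions above.
    def test_tensors_use_padding_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensor_dict = dataset.as_tensor_dict(padding_lengths)
        assert tensor_dict["text1"]["tokens"].size()[-1] == padding_lengths["text1"]["num_tokens"]
        assert tensor_dict["text2"]["tokens"].size()[-1] == padding_lengths["text2"]["num_tokens"]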
    question_text = "What kind of test succeeded on its first attempt?"
    char_spans = [(6, 10)]
    instance = squad_reader.text_to_instance(question_text, passage_text, char_spans=char_spans)
    print("Keys instance: ", instance.fields.keys())
    # Batch the instances and index them using the vocabulary.
    instances = [instance]
else:
    instances = [train_dataset[0], train_dataset[1]]

## Create the batch ready to be used
dataset = Batch(instances)
dataset.index_instances(vocab)

print("-------------- DATASET EXAMPLE ---------------")
# Convert to tensors once and reuse the dictionary instead of recomputing it for every field.
tensor_dict = dataset.as_tensor_dict()
character_ids_passage = tensor_dict['passage']['character_ids']
character_ids_question = tensor_dict['question']['character_ids']
question = tensor_dict['question']
passage = tensor_dict['passage']
span_start = tensor_dict['span_start']
span_end = tensor_dict['span_end']
metadata = tensor_dict['metadata']

print("Shape of characters ids passage: ", character_ids_passage.shape)
print("Shape of characters ids question: ", character_ids_question.shape)
    def _get_training_tensors(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        return dataset.as_tensor_dict()
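    # A sketch (not from the original test class) of how such a helper is typically used:
    # build the training tensors once and feed them to the model with keyword expansion.
    # The test name is hypothetical; `self.model` follows the other test utilities here.
    def test_forward_produces_a_loss(self):
        training_tensors = self._get_training_tensors()
        output_dict = self.model(**training_tensors)
        assert "loss" in output_dict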
dataset_reader = DatasetReader.from_params(dataset_reader_params)

## Vocabulary ##
vocab = model.vocab

"""
############ Propagate an instance of text #############
"""
instance = dataset_reader.text_to_instance("What kind of test succeeded on its first attempt?",
                                           "One time I was writing a unit test, and it succeeded on the first attempt.",
                                           char_spans=[(6, 10)])
print("Keys instance: ", instance.fields.keys())

# Batch the instances and index them using the vocabulary.
instances = [instance]
dataset = Batch(instances)
dataset.index_instances(model.vocab)

# Create the index tensors from the vocabulary.
cuda_device = model._get_prediction_device()
model_input = dataset.as_tensor_dict(cuda_device=cuda_device)

# Propagate the sample and obtain the loss (since we passed the answer span as a label).
outputs = model(**model_input)
# The loss should be differentiable, so training could continue from here.
assert outputs["loss"].requires_grad
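# Since the loss is differentiable, we could backpropagate it and check which parameters
# receive gradients, much like check_model_computes_gradients_correctly above. This is
# only a sketch following on from the snippet above, not part of the original script.
model.zero_grad()
outputs["loss"].backward()
for name, parameter in model.named_parameters():
    if parameter.requires_grad:
        print(name, "has gradient:", parameter.grad is not None)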