def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    """
    Run the model on a batch of instances and return one output dict per instance.

    The instances are wrapped in a ``Dataset``, indexed with ``self.vocab``,
    converted to a tensor dict (with ``for_training=False``) and passed through
    the model itself and then ``self.decode()``.  Every output that is a
    ``torch.autograd.Variable`` is moved to the CPU and converted to a numpy
    array; any other output is left as it is.  Each output is then split along
    its first axis by iterating over it, so that element ``i`` lands in the
    dict belonging to instance ``i``.

    Batching the instances this way is typically faster than repeated calls to
    ``forward_on_instance``.

    Parameters
    ----------
    instances : the instances to run the model on.
    cuda_device : forwarded unchanged to ``Dataset.as_tensor_dict``.

    Returns
    -------
    A list with one ``{output_name: value}`` dict per instance, in input order.
    """
    batch = Dataset(instances)
    batch.index_instances(self.vocab)
    tensors = batch.as_tensor_dict(cuda_device=cuda_device, for_training=False)
    outputs = self.decode(self(**tensors))

    per_instance: List[Dict[str, numpy.ndarray]] = [{} for _ in batch.instances]
    # Iterate over a snapshot of the keys because ``outputs`` is updated in place.
    for name in list(outputs):
        value = outputs[name]
        if isinstance(value, torch.autograd.Variable):
            value = value.data.cpu().numpy()
        outputs[name] = value
        # zip stops at the shorter of the two, so an output with fewer entries
        # than there are instances only fills the leading dicts.
        for slot, element in zip(per_instance, value):
            slot[name] = element
    return per_instance
def token_to_elmo_id(token):
    """
    Build a single-token dataset for ``token`` and return its character ids.

    The token is wrapped in a one-element ``TextField`` keyed by
    ``'character_ids'`` (using the ``indexer`` from the enclosing scope), placed
    in an ``Instance`` under the ``"elmo"`` field and indexed against a fresh,
    empty ``Vocabulary``.  The return value is the ``'character_ids'`` entry of
    the ``'elmo'`` field in the resulting tensor dict.
    """
    elmo_field = TextField([Token(token)], {'character_ids': indexer})
    dataset = Dataset([Instance({"elmo": elmo_field})])
    vocab = Vocabulary()
    # Index each instance by hand instead of calling
    # ``dataset.index_instances(vocab)``, so that there's no progress bar.
    for item in dataset.instances:
        item.index_fields(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
def _yield_one_epoch(self, dataset: Dataset, shuffle: bool, cuda_device: int, for_training: bool):
    """
    Generate the tensor dicts for one pass over ``dataset``.

    ``self._create_batches(dataset, shuffle)`` supplies groups of instances.
    Each group is wrapped in its own ``Dataset``, its padding lengths and size
    are logged at debug level, and the group is yielded as a tensor dict padded
    to those lengths.  ``cuda_device`` and ``for_training`` are forwarded
    unchanged to ``as_tensor_dict``.
    """
    for instance_group in self._create_batches(dataset, shuffle):
        batch = Dataset(instance_group)
        lengths = batch.get_padding_lengths()
        logger.debug("Batch padding lengths: %s", str(lengths))
        logger.debug("Batch size: %d", len(batch.instances))
        yield batch.as_tensor_dict(lengths,
                                   cuda_device=cuda_device,
                                   for_training=for_training)
def ensure_batch_predictions_are_consistent(self):
    """
    Assert that the model predicts the same thing for an instance whether it is
    run alone or as part of the full batch.

    The model is switched to eval mode, run once per instance on a
    single-instance ``Dataset`` and once on all instances together.  For every
    output key except ``'loss'``, element 0 of the single-instance output is
    compared with element ``i`` of the batched output: ``Variable`` values must
    agree within an absolute tolerance of ``1e-6``; anything else must compare
    equal.

    Raises
    ------
    AssertionError
        If any compared output differs.
    NotImplementedError
        If the sizes differ and the batched value has more than two dimensions.
    """
    self.model.eval()

    # One forward pass per instance, each as a batch of size one.
    solo_outputs = []
    for instance in self.dataset.instances:
        singleton = Dataset([instance])
        solo_tensors = singleton.as_tensor_dict(singleton.get_padding_lengths(),
                                                for_training=False)
        solo_outputs.append(self.model(**solo_tensors))

    # One forward pass over everything, padded to the lengths of ``self.dataset``.
    everything = Dataset(list(self.dataset))
    joint_tensors = everything.as_tensor_dict(self.dataset.get_padding_lengths(),
                                              for_training=False)
    joint_outputs = self.model(**joint_tensors)

    tolerance = 1e-6
    for index, solo_output in enumerate(solo_outputs):
        for key, solo_value in solo_output.items():
            if key == 'loss':
                # Loss is particularly unstable, so it is not compared; it is
                # enough for everything else to be close.
                continue

            solo_value = solo_value[0]
            joint_value = joint_outputs[key][index]

            if not isinstance(solo_value, torch.autograd.Variable):
                assert solo_value == joint_value, key
                continue

            if solo_value.size() != joint_value.size():
                # Probably a sequence model whose batched output carries padded
                # elements.  Handling this in general is complicated, so only
                # the easy 1-d and 2-d cases are trimmed to the unpadded length.
                num_tokens = solo_value.size(0)
                rank = joint_value.dim()
                if rank == 1:
                    joint_value = joint_value[:num_tokens]
                elif rank == 2:
                    joint_value = joint_value[:num_tokens, :]
                else:
                    raise NotImplementedError

            assert_allclose(solo_value.data.numpy(),
                            joint_value.data.numpy(),
                            atol=tolerance,
                            err_msg=key)
def test_lazy_as_tensor_dict(self):
    """
    Check that a lazy dataset yields the same padded tensors on every pass.

    The lazy dataset is indexed once with ``self.vocab``.  It is then iterated
    ten times; on each pass its instances are collected into a ``Dataset``,
    converted to a tensor dict, and the ``"tokens"`` arrays of ``"text1"`` and
    ``"text2"`` are compared against fixed expected index arrays.
    """
    lazy_dataset = self.get_lazy_dataset()
    lazy_dataset.index_instances(self.vocab)

    expected_text1 = numpy.array([[2, 3, 4, 5, 6],
                                  [1, 3, 4, 5, 6]])
    expected_text2 = numpy.array([[2, 3, 4, 1, 5, 6],
                                  [2, 3, 1, 0, 0, 0]])

    # Repeat the whole round trip to exercise re-iteration of the lazy dataset.
    for _ in range(10):
        materialized = Dataset(list(lazy_dataset))
        tensor_dict = materialized.as_tensor_dict(materialized.get_padding_lengths())
        actual_text1 = tensor_dict["text1"]["tokens"].data.cpu().numpy()
        actual_text2 = tensor_dict["text2"]["tokens"].data.cpu().numpy()
        numpy.testing.assert_array_almost_equal(actual_text1, expected_text1)
        numpy.testing.assert_array_almost_equal(actual_text2, expected_text2)
def _sentences_to_ids(self, sentences):
    """
    Convert tokenized sentences into a batch of ELMo character ids.

    Each sentence (an iterable of token strings) becomes a ``TextField``
    indexed by a freshly created ``ELMoTokenCharactersIndexer`` and is wrapped
    in an ``Instance`` under the ``'elmo'`` field.  The instances are gathered
    into a ``Dataset``, indexed against an empty ``Vocabulary``, and the
    ``'character_ids'`` entry of the ``'elmo'`` field in the resulting tensor
    dict is returned.
    """
    indexer = ELMoTokenCharactersIndexer()
    # One TextField-backed instance per sentence, in input order.
    instances = [
        Instance({'elmo': TextField([Token(word) for word in sentence],
                                    {'character_ids': indexer})})
        for sentence in sentences
    ]
    dataset = Dataset(instances)
    dataset.index_instances(Vocabulary())
    tensor_dict = dataset.as_tensor_dict()
    return tensor_dict['elmo']['character_ids']
def batch_to_ids(batch):
    """
    Turn a batch of tokenized sentences into a batch of padded character ids.

    ``batch`` is a list of sentences, each a list of token strings.  Every
    sentence becomes a ``TextField`` (indexed with the ``indexer`` from the
    enclosing scope) inside an ``Instance`` under the ``"elmo"`` field.  The
    instances are indexed against an empty ``Vocabulary`` and the
    ``'character_ids'`` entry of the ``'elmo'`` field in the resulting tensor
    dict is returned.
    """
    def _to_instance(sentence):
        # Wrap one tokenized sentence as a single-field instance.
        words = [Token(word) for word in sentence]
        return Instance({"elmo": TextField(words, {'character_ids': indexer})})

    dataset = Dataset([_to_instance(sentence) for sentence in batch])
    dataset.index_instances(Vocabulary())
    tensor_dict = dataset.as_tensor_dict()
    return tensor_dict['elmo']['character_ids']