Example #1
 def _yield_one_epoch(self, dataset: Dataset, shuffle: bool):
     # Group the instances into batches, pad each batch to a common
     # shape, and yield one array dictionary per batch.
     grouped_instances = self._create_batches(dataset, shuffle)
     for group in grouped_instances:
         batch = Dataset(group)
         padding_lengths = batch.get_padding_lengths()
         logger.debug("Batch padding lengths: %s", str(padding_lengths))
         logger.debug("Batch size: %d", len(batch.instances))
         yield batch.as_array_dict(padding_lengths, verbose=False)
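The pattern above is generic enough to sketch without AllenNLP. Below is a minimal, self-contained version of the same epoch loop, assuming instances are plain lists of token ids; yield_one_epoch, BATCH_SIZE, and the zero-padding step are hypothetical stand-ins for _create_batches, get_padding_lengths, and as_array_dict, not the library's API.

import random
from typing import Iterator, List

BATCH_SIZE = 32  # hypothetical; the real iterator reads this from its config

def yield_one_epoch(instances: List[List[int]], shuffle: bool) -> Iterator[List[List[int]]]:
    """Yield zero-padded batches, one epoch's worth."""
    if shuffle:
        instances = random.sample(instances, len(instances))
    for start in range(0, len(instances), BATCH_SIZE):
        group = instances[start:start + BATCH_SIZE]
        # Pad every instance to the longest one in the group, mirroring
        # get_padding_lengths() followed by as_array_dict() above.
        max_len = max(len(inst) for inst in group)
        yield [inst + [0] * (max_len - len(inst)) for inst in group]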
Example #2
 def _yield_one_epoch(self, dataset: Dataset, shuffle: bool,
                      cuda_device: int, for_training: bool):
     # Same batching as Example #1, but each batch comes back as a
     # dictionary of torch tensors, optionally on the given CUDA device.
     grouped_instances = self._create_batches(dataset, shuffle)
     for group in grouped_instances:
         batch = Dataset(group)
         padding_lengths = batch.get_padding_lengths()
         logger.debug("Batch padding lengths: %s", str(padding_lengths))
         logger.debug("Batch size: %d", len(batch.instances))
         yield batch.as_tensor_dict(padding_lengths,
                                    cuda_device=cuda_device,
                                    for_training=for_training)
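The only difference from Example #1 is that batches come back as torch tensors, optionally moved to a GPU. A hedged sketch of that last step, with a hypothetical batch_to_tensor helper; the "-1 means CPU" convention matches AllenNLP's cuda_device argument.

import torch

def batch_to_tensor(padded_batch, cuda_device: int = -1) -> torch.Tensor:
    # padded_batch: a rectangular list of lists of token ids, as produced
    # by the padding step sketched under Example #1.
    tensor = torch.tensor(padded_batch, dtype=torch.long)
    if cuda_device >= 0:  # -1 conventionally means "stay on the CPU"
        tensor = tensor.cuda(cuda_device)
    return tensor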
Example #3
    def test_lazy_as_tensor_dict(self):
        lazy_dataset = self.get_lazy_dataset()
        lazy_dataset.index_instances(self.vocab)

        # Iterate repeatedly to verify that the lazy dataset can be
        # consumed more than once and always yields the same tensors.
        for _ in range(10):
            dataset = Dataset([instance for instance in lazy_dataset])
            padding_lengths = dataset.get_padding_lengths()
            tensors = dataset.as_tensor_dict(padding_lengths)
            text1 = tensors["text1"]["tokens"].data.cpu().numpy()
            text2 = tensors["text2"]["tokens"].data.cpu().numpy()

            numpy.testing.assert_array_almost_equal(
                text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
            numpy.testing.assert_array_almost_equal(
                text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))
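Why the test loops ten times: a lazy dataset is only useful if it can be consumed more than once. A minimal sketch of the underlying idea, using a hypothetical LazyInstances class backed by a generator *function* so that every iteration starts fresh.

from typing import Callable, Iterator

class LazyInstances:
    def __init__(self, instance_generator: Callable[[], Iterator]):
        self._generator = instance_generator

    def __iter__(self) -> Iterator:
        # A fresh generator per call, so repeated epochs all succeed.
        return self._generator()

lazy = LazyInstances(lambda: (i * i for i in range(3)))
assert list(lazy) == list(lazy) == [0, 1, 4]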
Example #4
 def ensure_batch_predictions_are_consistent(self):
     # Run the model over each instance individually, then over the whole
     # dataset as one batch, and check that the two sets of outputs agree.
     self.model.eval()
     single_predictions = []
     for instance in self.dataset.instances:
         dataset = Dataset([instance])
         tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
         result = self.model(**tensors)
         single_predictions.append(result)
     full_dataset = Dataset([instance for instance in self.dataset])
     batch_tensors = full_dataset.as_tensor_dict(self.dataset.get_padding_lengths(), for_training=False)
     batch_predictions = self.model(**batch_tensors)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.autograd.Variable):
                 if single_predicted.size() != batch_predicted.size():
                     # This is probably a sequence model, and our output shape has some padded
                     # elements in the batched case.  Fixing this in general is complicated;
                     # we'll just fix some easy cases that we actually have, for now.
                     num_tokens = single_predicted.size(0)
                     if batch_predicted.dim() == 1:
                         batch_predicted = batch_predicted[:num_tokens]
                     elif batch_predicted.dim() == 2:
                         batch_predicted = batch_predicted[:num_tokens, :]
                     else:
                         raise NotImplementedError
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
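The heart of the check, stripped of the model: a batched output row may carry trailing padding, so it is truncated to the single-instance length before comparing. A compact numpy sketch; the array values here are illustrative assumptions.

import numpy
from numpy.testing import assert_allclose

single = numpy.array([0.1, 0.9, 0.0])               # one instance, 3 tokens
batch_row = numpy.array([0.1, 0.9, 0.0, 0.0, 0.0])  # same instance, padded to 5

num_tokens = single.shape[0]
assert_allclose(single, batch_row[:num_tokens], atol=1e-6)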
Example #5
 def _yield_one_epoch(self, dataset: Dataset, shuffle: bool):
     # Variant of Example #1: identical batching, yielded via as_arrays().
     grouped_instances = self._create_batches(dataset, shuffle)
     for group in grouped_instances:
         batch = Dataset(group)
         padding_lengths = batch.get_padding_lengths()
         yield batch.as_arrays(padding_lengths, verbose=False)
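Taken together, as_arrays, as_array_dict, and as_tensor_dict across these examples appear to be successive names for the same operation in different AllenNLP versions: converting a padded batch into a dictionary of numpy arrays or torch tensors. If you are porting code between versions, check which method your installed release actually exposes.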