Example #1
0
  def make_evaluation_dataset(self,
                              features_file,
                              labels_file,
                              batch_size,
                              num_threads=1,
                              prefetch_buffer_size=None):
    """Builds a dataset to be used for evaluation.

    Args:
      features_file: The evaluation source file.
      labels_file: The evaluation target file.
      batch_size: The batch size to use.
      num_threads: The number of elements processed in parallel.
      prefetch_buffer_size: The number of batches to prefetch asynchronously. If
        ``None``, use an automatically tuned value.

    Returns:
      A ``tf.data.Dataset``.

    See Also:
      :func:`opennmt.data.inference_pipeline`
    """
    def _process(*element):
      # Features and labels are read from the two parallel files together.
      return self.make_features(element=element, training=False)

    eval_dataset = self.make_dataset([features_file, labels_file], training=False)
    pipeline = dataset_util.inference_pipeline(
        batch_size,
        process_fn=_process,
        num_threads=num_threads,
        prefetch_buffer_size=prefetch_buffer_size)
    return eval_dataset.apply(pipeline)
Example #2
0
    def make_evaluation_dataset(
        self,
        features_file,
        labels_file,
        batch_size,
        batch_type="examples",
        length_bucket_width=None,
        num_threads=1,
        prefetch_buffer_size=None,
    ):
        """Builds a dataset to be used for evaluation.

        Args:
          features_file: The evaluation source file.
          labels_file: The evaluation target file, or ``None`` to build the
            dataset from the features only.
          batch_size: The batch size to use.
          batch_type: The batching strategy to use: can be "examples" or "tokens".
          length_bucket_width: The width of the length buckets to select batch
            candidates from (for efficiency). Set ``None`` to not constrain batch
            formation.
          num_threads: The number of elements processed in parallel.
          prefetch_buffer_size: The number of batches to prefetch asynchronously. If
            ``None``, use an automatically tuned value.

        Returns:
          A ``tf.data.Dataset``.

        See Also:
          :func:`opennmt.data.inference_pipeline`
        """
        if labels_file is None:
            data_files = features_file
            length_fn = self.get_length
        else:
            data_files = [features_file, labels_file]
            # One length function per parallel file, in the same order.
            length_fn = [
                self.features_inputter.get_length,
                self.labels_inputter.get_length,
            ]

        def _featurize(*element):
            return self.make_features(
                element=misc.item_or_tuple(element), training=False)

        transform_fns = [
            lambda dataset: dataset.map(
                _featurize, num_parallel_calls=num_threads or 1)
        ]

        dataset = self.make_dataset(data_files, training=False)
        return dataset.apply(
            dataset_util.inference_pipeline(
                batch_size,
                batch_type=batch_type,
                transform_fns=transform_fns,
                length_bucket_width=length_bucket_width,
                length_fn=length_fn,
                num_threads=num_threads,
                prefetch_buffer_size=prefetch_buffer_size,
            ))
Example #3
0
    def make_inference_dataset(
        self,
        features_file,
        batch_size,
        batch_type="examples",
        length_bucket_width=None,
        num_threads=1,
        prefetch_buffer_size=None,
    ):
        """Builds a dataset to be used for inference.

        For evaluation and training datasets, see
        :class:`opennmt.inputters.ExampleInputter`.

        Args:
          features_file: The test file.
          batch_size: The batch size to use.
          batch_type: The batching strategy to use: can be "examples" or "tokens".
          length_bucket_width: The width of the length buckets to select batch
            candidates from (for efficiency). Set ``None`` to not constrain batch
            formation.
          num_threads: The number of elements processed in parallel.
          prefetch_buffer_size: The number of batches to prefetch asynchronously. If
            ``None``, use an automatically tuned value.

        Returns:
          A ``tf.data.Dataset``.

        See Also:
          :func:`opennmt.data.inference_pipeline`
        """
        def _featurize(*element):
            features = self.make_features(element=misc.item_or_tuple(element),
                                          training=False)
            # Unsupervised inputters always return a (features, labels) tuple;
            # inference only needs the features part.
            if isinstance(features, (list, tuple)):
                features = features[0]
            return features

        transform_fns = [
            lambda dataset: dataset.map(
                _featurize, num_parallel_calls=num_threads or 1)
        ]

        dataset = self.make_dataset(features_file, training=False)
        return dataset.apply(
            dataset_util.inference_pipeline(
                batch_size,
                batch_type=batch_type,
                transform_fns=transform_fns,
                length_bucket_width=length_bucket_width,
                length_fn=self.get_length,
                num_threads=num_threads,
                prefetch_buffer_size=prefetch_buffer_size,
            ))
  def testReorderInferDataset(self):
    """Checks that inference_pipeline groups by length bucket and tags each
    element with its original index."""
    lengths = [8, 2, 5, 6, 7, 1, 3, 9]
    dataset = tf.data.Dataset.from_tensor_slices(lengths)
    dataset = dataset.map(lambda x: {"length": x})
    dataset = dataset.apply(dataset_util.inference_pipeline(
        3, length_bucket_width=3, length_fn=lambda x: x["length"]))
    batches = list(iter(dataset))

    # (lengths, original indices) expected for each emitted batch.
    expected = [
        ([8, 6, 7], [0, 3, 4]),
        ([2, 1], [1, 5]),
        ([5, 3], [2, 6]),
        ([9], [7]),
    ]
    self.assertEqual(len(batches), len(expected))
    for batch, (length, index) in zip(batches, expected):
      self.assertAllEqual(batch["length"], length)
      self.assertAllEqual(batch["index"], index)
Example #5
0
 def make_evaluation_dataset(self,
                             features_file,
                             labels_file,
                             batch_size,
                             num_threads=1,
                             prefetch_buffer_size=None):
     """See :meth:`opennmt.inputters.ExampleInputter.make_evaluation_dataset`."""
     # labels_file is accepted for interface compatibility but unused here.
     _ = labels_file

     def _featurize(element):
         return self.make_features(element, training=False)

     dataset = self.make_dataset(features_file, training=False)
     pipeline = dataset_util.inference_pipeline(
         batch_size,
         process_fn=_featurize,
         num_threads=num_threads,
         prefetch_buffer_size=prefetch_buffer_size)
     return dataset.apply(pipeline)
Example #6
0
 def make_inference_dataset(self,
                            features_file,
                            batch_size,
                            length_bucket_width=None,
                            num_threads=1,
                            prefetch_buffer_size=None):
     """Builds a ``tf.data.Dataset`` for inference over ``features_file``."""
     def _featurize(element):
         # Keep only the first element returned by make_features
         # (presumably the features of a (features, labels) pair — see caller).
         return self.make_features(element, training=False)[0]

     dataset = self.make_dataset(features_file, training=False)
     return dataset.apply(
         dataset_util.inference_pipeline(
             batch_size,
             process_fn=_featurize,
             length_bucket_width=length_bucket_width,
             length_fn=self.get_length,
             num_threads=num_threads,
             prefetch_buffer_size=prefetch_buffer_size))
Example #7
0
    def make_inference_dataset(self,
                               features_file,
                               batch_size,
                               length_bucket_width=None,
                               num_threads=1,
                               prefetch_buffer_size=None):
        """Builds a dataset to be used for inference.

        For evaluation and training datasets, see
        :class:`opennmt.inputters.ExampleInputter`.

        Args:
          features_file: The test file.
          batch_size: The batch size to use.
          length_bucket_width: The width of the length buckets to select batch
            candidates from (for efficiency). Set ``None`` to not constrain batch
            formation.
          num_threads: The number of elements processed in parallel.
          prefetch_buffer_size: The number of batches to prefetch asynchronously. If
            ``None``, use an automatically tuned value.

        Returns:
          A ``tf.data.Dataset``.

        See Also:
          :func:`opennmt.data.inference_pipeline`
        """
        def _featurize(*element):
            return self.make_features(
                element=misc.item_or_tuple(element), training=False)

        transform_fns = [
            lambda dataset: dataset.map(
                _featurize, num_parallel_calls=num_threads or 4)
        ]
        dataset = self.make_dataset(features_file, training=False)
        return dataset.apply(
            dataset_util.inference_pipeline(
                batch_size,
                transform_fns=transform_fns,
                length_bucket_width=length_bucket_width,
                length_fn=self.get_length,
                num_threads=num_threads,
                prefetch_buffer_size=prefetch_buffer_size))
Example #8
0
 def make_evaluation_dataset(self,
                             features_file,
                             labels_file,
                             batch_size,
                             num_threads=1,
                             prefetch_buffer_size=None):
   """See :meth:`opennmt.inputters.ExampleInputter.make_evaluation_dataset`."""
   _ = labels_file  # Unused: this inputter evaluates on features only.
   dataset = self.make_dataset(features_file, training=False)

   def _featurize(x):
     return self.make_features(element=x, training=False)

   transforms = [
       lambda ds: ds.map(_featurize, num_parallel_calls=num_threads or 4)]
   return dataset.apply(dataset_util.inference_pipeline(
       batch_size,
       transform_fns=transforms,
       num_threads=num_threads,
       prefetch_buffer_size=prefetch_buffer_size))
Example #9
0
 def make_inference_dataset(self,
                            features_file,
                            batch_size,
                            length_bucket_width=None,
                            num_threads=1,
                            prefetch_buffer_size=None):
   """Builds a ``tf.data.Dataset`` for inference over ``features_file``."""
   dataset = self.make_dataset(features_file, training=False)

   def _featurize(x):
     # Keep only the first element returned by make_features.
     return self.make_features(element=x, training=False)[0]

   transforms = [
       lambda ds: ds.map(_featurize, num_parallel_calls=num_threads or 4)]
   return dataset.apply(dataset_util.inference_pipeline(
       batch_size,
       transform_fns=transforms,
       length_bucket_width=length_bucket_width,
       length_fn=self.get_length,
       num_threads=num_threads,
       prefetch_buffer_size=prefetch_buffer_size))
Example #10
0
    def make_evaluation_dataset(self,
                                features_file,
                                labels_file,
                                batch_size,
                                num_threads=1,
                                prefetch_buffer_size=None):
        """Builds a dataset to be used for evaluation.

        Args:
          features_file: The evaluation source file.
          labels_file: The evaluation target file, or ``None`` to build the
            dataset from the features only.
          batch_size: The batch size to use.
          num_threads: The number of elements processed in parallel.
          prefetch_buffer_size: The number of batches to prefetch asynchronously. If
            ``None``, use an automatically tuned value.

        Returns:
          A ``tf.data.Dataset``.

        See Also:
          :func:`opennmt.data.inference_pipeline`
        """
        data_files = (
            features_file if labels_file is None
            else [features_file, labels_file])

        def _featurize(*element):
            return self.make_features(
                element=misc.item_or_tuple(element), training=False)

        transform_fns = [
            lambda dataset: dataset.map(
                _featurize, num_parallel_calls=num_threads or 1)
        ]

        dataset = self.make_dataset(data_files, training=False)
        return dataset.apply(
            dataset_util.inference_pipeline(
                batch_size,
                transform_fns=transform_fns,
                num_threads=num_threads,
                prefetch_buffer_size=prefetch_buffer_size))