Example #1
    def _testFilterByLength(
        self,
        features_length,
        labels_length,
        maximum_features_length=None,
        maximum_labels_length=None,
        filtered=True,
    ):
        # Build a dataset with a single (features, labels) element holding the
        # given length constants.
        dataset = tf.data.Dataset.zip(
            (
                tf.data.Dataset.from_tensors(tf.constant(features_length)),
                tf.data.Dataset.from_tensors(tf.constant(labels_length)),
            )
        )
        # Apply the length filter under test; the length functions simply
        # return the lengths passed to this helper.
        dataset = dataset.apply(
            dataset_util.filter_examples_by_length(
                maximum_features_length=maximum_features_length,
                maximum_labels_length=maximum_labels_length,
                features_length_fn=lambda _: features_length,
                labels_length_fn=lambda _: labels_length,
            )
        )

        # If the example should be filtered out, the dataset is empty and the
        # first next() raises StopIteration; otherwise the element is returned.
        iterator = iter(dataset)
        if filtered:
            with self.assertRaises(StopIteration):
                next(iterator)
        else:
            next(iterator)
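As a rough sketch of how this helper might be driven, the concrete length values and expected outcomes below are illustrative assumptions, not part of the original test file:

    def testFilterExamplesByLength(self):
        # Hypothetical cases; values are illustrative only.
        # Feature sequence longer than the allowed maximum: example is dropped.
        self._testFilterByLength(6, 4, maximum_features_length=5, filtered=True)
        # Label sequence longer than the allowed maximum: example is dropped.
        self._testFilterByLength(4, 6, maximum_labels_length=5, filtered=True)
        # Both sequences within bounds: example is kept.
        self._testFilterByLength(
            4, 4, maximum_features_length=5, maximum_labels_length=5, filtered=False
        )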
Example #2
 def make_training_dataset(self,
                           features_file,
                           labels_file,
                           batch_size,
                           batch_type="examples",
                           batch_multiplier=1,
                           batch_size_multiple=1,
                           shuffle_buffer_size=None,
                           length_bucket_width=None,
                           maximum_features_length=None,
                           maximum_labels_length=None,
                           single_pass=False,
                           num_shards=1,
                           shard_index=0,
                           num_threads=4,
                           prefetch_buffer_size=None,
                           cardinality_multiple=1,
                           weights=None):
   """See :meth:`opennmt.inputters.ExampleInputter.make_training_dataset`."""
   _ = labels_file
   dataset = self.make_dataset(features_file, training=True)
   if weights is not None:
     dataset = (dataset, weights)
   transform_fns = []
   map_func = lambda x: self.make_features(element=x, training=True)
   transform_fns.append(lambda dataset:
                        dataset.map(map_func,
                                    num_parallel_calls=num_threads or 4))
   transform_fns.append(dataset_util.filter_examples_by_length(
       maximum_features_length=maximum_features_length,
       maximum_labels_length=maximum_labels_length,
       features_length_fn=self.get_length))
   dataset = dataset_util.training_pipeline(
       batch_size,
       batch_type=batch_type,
       batch_multiplier=batch_multiplier,
       batch_size_multiple=batch_size_multiple,
       transform_fns=transform_fns,
       length_bucket_width=length_bucket_width,
       single_pass=single_pass,
       num_shards=num_shards,
       shard_index=shard_index,
       num_threads=num_threads,
       shuffle_buffer_size=shuffle_buffer_size,
       prefetch_buffer_size=prefetch_buffer_size,
       cardinality_multiple=cardinality_multiple)(dataset)
   return dataset
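A minimal call-site sketch for this variant; the inputter object, file name, and hyperparameter values are assumptions, not taken from the code above:

# Hypothetical usage; "inputter" and "train.txt" are placeholders.
dataset = inputter.make_training_dataset(
    "train.txt",
    None,  # labels_file is discarded by this implementation
    batch_size=4096,
    batch_type="tokens",
    shuffle_buffer_size=500000,
    length_bucket_width=1,
    maximum_features_length=100)
for batch in dataset.take(1):
    print(batch)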
Example #3
    def make_training_dataset(self,
                              features_file,
                              labels_file,
                              batch_size,
                              batch_type="examples",
                              batch_multiplier=1,
                              batch_size_multiple=1,
                              shuffle_buffer_size=None,
                              length_bucket_width=None,
                              maximum_features_length=None,
                              maximum_labels_length=None,
                              single_pass=False,
                              num_shards=1,
                              shard_index=0,
                              num_threads=4,
                              prefetch_buffer_size=None,
                              cardinality_multiple=1,
                              weights=None):
        """Builds a dataset to be used for training. It supports the full training
        pipeline, including:

        * sharding
        * shuffling
        * filtering
        * bucketing
        * prefetching

        Args:
          features_file: The source file or a list of training source files.
          labels_file: The target file or a list of training target files.
          batch_size: The batch size to use.
          batch_type: The training batching strategy to use: can be "examples"
            or "tokens".
          batch_multiplier: The batch size multiplier to prepare splitting
            across replicated graph parts.
          batch_size_multiple: When :obj:`batch_type` is "tokens", ensure that
            the resulting batch size is a multiple of this value.
          shuffle_buffer_size: The number of elements from which to sample.
          length_bucket_width: The width of the length buckets to select batch
            candidates from (for efficiency). Set ``None`` to not constrain
            batch formation.
          maximum_features_length: The maximum length or list of maximum
            lengths of the features sequence(s). ``None`` to not constrain the
            length.
          maximum_labels_length: The maximum length of the labels sequence.
            ``None`` to not constrain the length.
          single_pass: If ``True``, makes a single pass over the training data.
          num_shards: The number of data shards (usually the number of workers
            in a distributed setting).
          shard_index: The shard index this data pipeline should read from.
          num_threads: The number of elements processed in parallel.
          prefetch_buffer_size: The number of batches to prefetch
            asynchronously. If ``None``, use an automatically tuned value.
          cardinality_multiple: Ensure that the dataset cardinality is a
            multiple of this value when :obj:`single_pass` is ``True``.
          weights: An optional list of weights to create a weighted dataset
            out of multiple training files.

        Returns:
          A ``tf.data.Dataset``.

        See Also:
          :func:`opennmt.data.training_pipeline`
        """
        dataset = self.make_dataset([features_file, labels_file],
                                    training=True)
        if weights is not None:
            dataset = (dataset, weights)
        transform_fns = []
        map_func = lambda *arg: self.make_features(element=arg, training=True)
        transform_fns.append(lambda dataset: dataset.map(
            map_func, num_parallel_calls=num_threads or 4))
        transform_fns.append(
            dataset_util.filter_examples_by_length(
                maximum_features_length=maximum_features_length,
                maximum_labels_length=maximum_labels_length,
                features_length_fn=self.features_inputter.get_length,
                labels_length_fn=self.labels_inputter.get_length))
        dataset = dataset_util.training_pipeline(
            batch_size,
            batch_type=batch_type,
            batch_multiplier=batch_multiplier,
            batch_size_multiple=batch_size_multiple,
            transform_fns=transform_fns,
            length_bucket_width=length_bucket_width,
            single_pass=single_pass,
            num_shards=num_shards,
            shard_index=shard_index,
            num_threads=num_threads,
            shuffle_buffer_size=shuffle_buffer_size,
            prefetch_buffer_size=prefetch_buffer_size,
            cardinality_multiple=cardinality_multiple)(dataset)
        return dataset
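And a hedged usage sketch for this parallel (source/target) variant; the inputter object, file names, and parameter values are placeholder assumptions:

# Hypothetical call site; "example_inputter", file names, and values are placeholders.
dataset = example_inputter.make_training_dataset(
    "train.en",
    "train.de",
    batch_size=3072,
    batch_type="tokens",
    shuffle_buffer_size=500000,
    length_bucket_width=1,
    maximum_features_length=100,
    maximum_labels_length=100,
    num_shards=2,   # e.g. 2 training workers
    shard_index=0,  # this pipeline reads shard 0
)
for batch in dataset.take(1):
    print(batch)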