Example #1
  def testBatchAndDropRemainder(self):
    components = (np.arange(7),
                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                  np.array(37.0) * np.arange(7))

    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    iterator = (
        dataset_ops.Dataset.from_tensor_slices(components).apply(
            batching.batch_and_drop_remainder(batch_size))
        .make_initializable_iterator())

    next_element = iterator.get_next()

    with self.test_session() as sess:
      for test_batch_size in [1, 3, 7, 10]:
        sess.run(iterator.initializer, feed_dict={batch_size: test_batch_size})
        num_batches = 7 // test_batch_size
        for i in range(num_batches):
          result = sess.run(next_element)
          for component, result_component in zip(components, result):
            for j in range(test_batch_size):
              self.assertAllEqual(component[(i * test_batch_size + j)],
                                  result_component[j])
        with self.assertRaises(errors.OutOfRangeError):
          sess.run(next_element)
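
For orientation, a minimal standalone sketch (assuming the same TF 1.x `dataset_ops` and `batching` modules imported by these tests) of the transformation under test:

  # batch_and_drop_remainder groups elements into fixed-size batches and drops
  # any final partial batch.
  dataset = dataset_ops.Dataset.range(10).apply(
      batching.batch_and_drop_remainder(3))
  # Yields [0, 1, 2], [3, 4, 5], [6, 7, 8]; the leftover element 9 is dropped.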
Example #2
  def testBatchAndDropRemainder(self):
    components = (np.arange(7),
                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                  np.array(37.0) * np.arange(7))

    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    iterator = (
        dataset_ops.Dataset.from_tensor_slices(components).apply(
            batching.batch_and_drop_remainder(batch_size))
        .make_initializable_iterator())

    next_element = iterator.get_next()

    with self.cached_session() as sess:
      for test_batch_size in [1, 3, 7, 10]:
        sess.run(iterator.initializer, feed_dict={batch_size: test_batch_size})
        num_batches = 7 // test_batch_size
        for i in range(num_batches):
          result = sess.run(next_element)
          for component, result_component in zip(components, result):
            for j in range(test_batch_size):
              self.assertAllEqual(component[(i * test_batch_size + j)],
                                  result_component[j])
        with self.assertRaises(errors.OutOfRangeError):
          sess.run(next_element)
Example #3
    def testBatchAndDropRemainderSparse(self):
        def _sparse(i):
            return sparse_tensor.SparseTensor(indices=[[0]],
                                              values=(i * [1]),
                                              dense_shape=[1])

        iterator = dataset_ops.Dataset.range(12).map(_sparse).apply(
            batching.batch_and_drop_remainder(
                5)).make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for i in range(2):
                actual = sess.run(get_next)
                expected = sparse_tensor.SparseTensor(
                    indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
                    values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
                    dense_shape=[5, 1])
                self.assertTrue(
                    isinstance(actual, sparse_tensor.SparseTensorValue))
                self.assertSparseValuesEqual(actual, expected.eval())
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
Example #4
  def testBatchAndDropRemainderSparseError(self):

    def _map_fn(i):
      return sparse_tensor.SparseTensor(
          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i

    with self.assertRaises(TypeError):
      _ = dataset_ops.Dataset.range(10).map(_map_fn).apply(
          batching.batch_and_drop_remainder(10))
Example #5
    def testBatchAndDropRemainderSparseError(self):
        def _map_fn(i):
            return sparse_tensor.SparseTensor(indices=[[0, 0]],
                                              values=(i * [1]),
                                              dense_shape=[1, 1]), i

        with self.assertRaises(TypeError):
            _ = dataset_ops.Dataset.range(10).map(_map_fn).apply(
                batching.batch_and_drop_remainder(10))
Example #6
  def testBatchAndDropRemainderShapeInference(self):
    components = (array_ops.placeholder(dtypes.int32),
                  (array_ops.placeholder(dtypes.int32, shape=[None]),
                   array_ops.placeholder(dtypes.int32, shape=[20, 30])))

    # Test with a statically known batch size.
    dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
        batching.batch_and_drop_remainder(128)))

    self.assertIs(None, dataset.output_shapes[0].ndims)
    self.assertEqual([128], dataset.output_shapes[1][0].as_list())
    self.assertEqual([128, 30], dataset.output_shapes[1][1].as_list())

    # Test with a dynamic batch size: the static shape will be unknown, because
    # `batch_size` is a placeholder.
    batch_size = array_ops.placeholder(dtypes.int64)
    dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
        batching.batch_and_drop_remainder(batch_size)))

    self.assertIs(None, dataset.output_shapes[0].ndims)
    self.assertEqual([None], dataset.output_shapes[1][0].as_list())
    self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
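
For contrast, a minimal sketch (same modules as above) of why the remainder is dropped: a plain `batch` cannot infer a static batch dimension, while `batch_and_drop_remainder` with a constant batch size can.

  plain = dataset_ops.Dataset.range(100).batch(128)
  # plain.output_shapes is roughly TensorShape([Dimension(None)]): the last
  # batch may be smaller, so the leading dimension stays unknown.
  dropped = dataset_ops.Dataset.range(100).apply(
      batching.batch_and_drop_remainder(128))
  # dropped.output_shapes is roughly TensorShape([Dimension(128)]).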
Example #7
    def testBatchAndDropRemainderShapeInference(self):
        components = (array_ops.placeholder(dtypes.int32),
                      (array_ops.placeholder(dtypes.int32, shape=[None]),
                       array_ops.placeholder(dtypes.int32, shape=[20, 30])))

        # Test with a statically known batch size.
        dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
            batching.batch_and_drop_remainder(128)))

        self.assertIs(None, dataset.output_shapes[0].ndims)
        self.assertEqual([128], dataset.output_shapes[1][0].as_list())
        self.assertEqual([128, 30], dataset.output_shapes[1][1].as_list())

        # Test with a dynamic batch size: the static shape will be unknown, because
        # `batch_size` is a placeholder.
        batch_size = array_ops.placeholder(dtypes.int64)
        dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
            batching.batch_and_drop_remainder(batch_size)))

        self.assertIs(None, dataset.output_shapes[0].ndims)
        self.assertEqual([None], dataset.output_shapes[1][0].as_list())
        self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
Example #8
    def _apply_fn(dataset):  # pylint: disable=missing-docstring
        # Build an endless stream of (seed, seed2) pairs; each pair drives one
        # reshuffled pass over `dataset` in `map_fn` below.
        random_ds = random_ops.RandomDataset(seed).apply(
            batching.batch_and_drop_remainder(2))
        if count is not None and count != -1:
            random_ds = random_ds.take(count)

        def map_fn(seeds):
            return dataset_ops.ShuffleDataset(input_dataset=dataset,
                                              buffer_size=buffer_size,
                                              seed=seeds[0],
                                              reshuffle_each_iteration=False,
                                              seed2=seeds[1])

        return random_ds.flat_map(map_fn)
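
A hedged sketch (hypothetical factory name; the real enclosing function is not shown in this snippet) of how an `_apply_fn` like the one above is normally returned by a transformation factory and attached with `Dataset.apply`:

  # Hypothetical outer function wrapping the _apply_fn above.
  def reshuffle_each_repetition(buffer_size, count=None, seed=None):
      def _apply_fn(dataset):
          ...  # body as in the snippet above
      return _apply_fn

  # shuffled = some_dataset.apply(reshuffle_each_repetition(buffer_size=1000))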
Example #9
  def __init__(self, dataset, devices, prefetch_on_device=None):
    self._devices = devices

    # Default to using prefetching in graph mode, unless specified.
    # TODO(priyag): Enable prefetching in eager mode.
    self._prefetch_on_device = prefetch_on_device
    if self._prefetch_on_device is None:
      self._prefetch_on_device = not context.executing_eagerly()
    assert not (self._prefetch_on_device and context.executing_eagerly()), (
        "Prefetching is only supported in graph mode currently")

    if self._prefetch_on_device:
      self._dataset = dataset
    else:
      # TODO(priyag): If dropping remainder is not appropriate, find another
      # approach to distributing the dataset when not possible to divide evenly.
      # Possibly not an issue when we start using PartitionedDataset.
      self._dataset = dataset.apply(
          batching.batch_and_drop_remainder(len(devices)))
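
For orientation, a minimal sketch (hypothetical device list; same `dataset_ops` and `batching` modules as elsewhere on this page) of the effect relied on above: batching by `len(devices)` guarantees each element splits evenly into one slice per device.

  # With two devices, every element of the re-batched dataset holds exactly two
  # records, so slice i of each element can be routed to devices[i].
  devices = ['/device:GPU:0', '/device:GPU:1']  # hypothetical
  per_device = dataset_ops.Dataset.range(5).apply(
      batching.batch_and_drop_remainder(len(devices)))
  # Elements: [0, 1], [2, 3]; the leftover 4 is dropped so the split stays even.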
Example #10
  def testBatchAndDropRemainderSparse(self):

    def _sparse(i):
      return sparse_tensor.SparseTensorValue(
          indices=[[0]], values=(i * [1]), dense_shape=[1])

    iterator = dataset_ops.Dataset.range(12).map(_sparse).apply(
        batching.batch_and_drop_remainder(5)).make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)
      for i in range(2):
        actual = sess.run(get_next)
        expected = sparse_tensor.SparseTensorValue(
            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
            dense_shape=[5, 1])
        self.assertTrue(sparse_tensor.is_sparse(actual))
        self.assertSparseValuesEqual(actual, expected)
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
Example #11
def StreamingFilesDataset(files,
                          filetype=None,
                          file_reader_job=None,
                          worker_job=None,
                          num_epochs=None,
                          filename_shuffle_buffer_size=None,
                          num_parallel_reads=None,
                          batch_transfer_size=None,
                          sloppy=None):
    """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).

  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
  files local to your GCE VM. In order to train using files stored on your local
  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
  helper to generate a dataset to feed your Cloud TPU with files from your GCE
  VM.

  The resulting dataset may raise an `OutOfRangeError` if no files are found as
  a result of the fileglob expansion.

  Note: StreamingFilesDataset assumes that the session is using a
  TPUClusterResolver and therefore has a worker and a coordinator job. File
  loading will be done on the coordinator job.

  Args:
    files: A string glob to match files, or a `tf.data.Dataset` generating file
      names.
    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
      TensorFlow function that when given a filename returns a dataset.
    file_reader_job: An optional string that corresponds to the job that should
      perform the file reads.
    worker_job: An optional string that corresponds to the job that should
      process the tensors (i.e. your GPU or TPU worker).
    num_epochs: The number of epochs through the training set that should be
      generated. By default, it will repeat infinitely.
    filename_shuffle_buffer_size: An optional integer whose value controls the
      shuffling of the file names. If you would like to read from the files in
      the same order, set to 0 or False.
    num_parallel_reads: An optional integer controlling the number of files to
      read from concurrently. (Set to 1 for no parallelism.)
    batch_transfer_size: An optional integer controlling the batching used to
      amortize the remote function invocation overhead. Set to a very large
      number to increase throughput. Set to a very small number to reduce memory
      consumption. Set to False to skip batching.
    sloppy: (Optional.) If `True`, read input data as fast as possible, without
      maintaining a deterministic order. Defaults to `False`.
  Returns:
    A `tf.data.Dataset` with an infinite stream of elements generated by a
    parallel interleaving of the set of files matched (or generated) by `files`,
    whose element type is that of the dataset specified by `filetype`.

  Raises:
    ValueError: if any argument is not of the expected type.
  """
    if filetype is None:
        filetype = 'tfrecord'

    if isinstance(filetype, str):
        if filetype not in _FILETYPE_MAP:
            raise ValueError('Unexpected filetype: %s' % filetype)
        reader_fn = _FILETYPE_MAP[filetype]
    elif callable(filetype):
        reader_fn = filetype
    else:
        raise ValueError('filetype should be a string or a callable')

    file_reader_job = file_reader_job or 'coordinator'

    worker_job = worker_job or 'worker'

    if filename_shuffle_buffer_size is None:
        filename_shuffle_buffer_size = 4096

    num_parallel_reads = num_parallel_reads or 8

    if batch_transfer_size is None:
        batch_transfer_size = 1024

    if sloppy is None:
        sloppy = False

    with ops.device('/job:%s' % file_reader_job):
        if isinstance(files, str):
            source_dataset = dataset_ops.Dataset.list_files(files)
        elif isinstance(files, dataset_ops.Dataset):
            source_dataset = files
        else:
            raise ValueError('files was not a string or a dataset: %s' % files)

        if filename_shuffle_buffer_size:
            source_dataset = source_dataset.shuffle(
                buffer_size=filename_shuffle_buffer_size)

        # NOTE: We perform the `repeat` on the source dataset, because the output
        # dataset does not currently have enough information to recreate an iterator
        # over the source dataset when it reaches the end.
        source_dataset = source_dataset.repeat(num_epochs)

        source_dataset = source_dataset.apply(
            interleave_ops.parallel_interleave(reader_fn,
                                               cycle_length=num_parallel_reads,
                                               sloppy=sloppy))

        if batch_transfer_size:
            # Note: we can safely call batch_and_drop_remainder because we have an
            # infinite stream of TFRecords.
            source_dataset = source_dataset.apply(
                batching.batch_and_drop_remainder(batch_transfer_size))

        source_dataset = source_dataset.prefetch(1)

        source_iterator = source_dataset.make_one_shot_iterator()
        source_handle = source_iterator.string_handle()

    @function.Defun(dtypes.string)
    def LoadingFunc(h):
        remote_iterator = iterator_ops.Iterator.from_string_handle(
            h, source_dataset.output_types, source_dataset.output_shapes)
        return remote_iterator.get_next()

    def MapFn(unused_input):
        return functional_ops.remote_call(
            args=[source_handle],
            Tout=[dtypes.string],
            f=LoadingFunc,
            target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)

    with ops.device('/job:%s' % worker_job):
        # TODO(saeta,mrry): Switch to using _GeneratorDataset.

        # identity = lambda x: x
        # dummy = constant_op.constant(0)
        # output_dataset = dataset_ops._GeneratorDataset(dummy, identity, MapFn,
        #                                                identity)

        output_dataset = dataset_ops.Dataset.range(2).repeat().map(MapFn)
        output_dataset = output_dataset.prefetch(1)

        if batch_transfer_size:
            # Undo the batching used during the transfer.
            output_dataset = output_dataset.apply(
                batching.unbatch()).prefetch(1)

    return output_dataset
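
A minimal usage sketch (hypothetical file glob; job names as defaulted or documented above) of the helper defined in this example:

  # Stream TFRecords that live on the coordinator VM's local disk to the worker
  # job. Paths here are hypothetical.
  dataset = StreamingFilesDataset(
      files='/mnt/localssd/train-*.tfrecord',
      filetype='tfrecord',
      file_reader_job='coordinator',
      worker_job='worker')
  # Each element is a serialized tf.Example record (a string tensor); parse it
  # downstream, e.g. with tf.parse_single_example.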
Example #12
 def _dataset_fn():
     dataset = dataset_ops.Dataset.range(1000).map(math_ops.to_float)
     # Want to produce a fixed, known shape, so drop remainder when batching.
     dataset = dataset.apply(batching.batch_and_drop_remainder(4))
     return dataset
Example #13
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=1,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
    """Returns a `Dataset` of feature dictionaries from `Example` protos.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"]
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of consecutive elements of this
      dataset to combine in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be
      called with a `filenames` tensor and (optional) `reader_args` and returns
      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to `None`.
    shuffle: A boolean, indicates whether the input should be shuffled. Defaults
      to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step (default is 1).
    reader_num_threads: Number of threads used to read `Example` records. If >1,
      the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements. Each `dict` maps feature keys to
    `Tensor` or `SparseTensor` objects.
  """
    # Create dataset of all matching filenames
    filenames = _get_file_names(file_pattern, False)
    dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
    if shuffle:
        dataset = dataset.shuffle(len(filenames), shuffle_seed)

    # Read `Example` records from files as tensor objects.
    if reader_args is None:
        reader_args = []

    # Read files sequentially (if reader_num_threads=1) or in parallel
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            lambda filename: reader(filename, *reader_args),
            cycle_length=reader_num_threads,
            sloppy=sloppy_ordering))

    # Extract values if the `Example` tensors are stored as key-value tuples.
    if dataset.output_types == (dtypes.string, dtypes.string):
        dataset = dataset.map(lambda _, v: v)

    # Apply dataset repeat and shuffle transformations.
    repeat_dataset = (num_epochs != 1)
    if repeat_dataset and shuffle:
        # Use the fused shuffle_and_repeat operation for better performance.
        dataset = dataset.apply(
            shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                           shuffle_seed))
    elif repeat_dataset:
        dataset = dataset.repeat(num_epochs)
    elif shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)

    if drop_final_batch:
        dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
    else:
        dataset = dataset.batch(batch_size)

    # Parse `Example` tensors to a dictionary of `Feature` tensors.
    dataset = dataset.map(lambda x: parsing_ops.parse_example(x, features),
                          num_parallel_calls=parser_num_threads)

    # TODO(rachelim): Add an optional label_name argument for extracting the label
    # from the features dictionary, to comply with the type expected by the
    # input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate` function.
    dataset = dataset.prefetch(prefetch_buffer_size)
    return dataset
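
A minimal usage sketch (hypothetical file pattern; feature spec taken from the docstring example above) of the reader defined in this example:

  # Hypothetical invocation mirroring the docstring example.
  features = {
      'age': parsing_ops.FixedLenFeature([], dtypes.int64, default_value=-1),
      'gender': parsing_ops.FixedLenFeature([], dtypes.string),
      'kws': parsing_ops.VarLenFeature(dtypes.string),
  }
  dataset = make_batched_features_dataset(
      file_pattern='/tmp/examples-*.tfrecord',
      batch_size=32,
      features=features,
      drop_final_batch=True)
  # Each element is a dict mapping feature keys to batched Tensor or
  # SparseTensor values.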
Example #14
def make_tf_record_dataset(
    file_pattern,
    batch_size,
    parser_fn=None,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=None,
    shuffle_seed=None,
    prefetch_buffer_size=None,
    num_parallel_reads=None,
    num_parallel_parser_calls=None,
    drop_final_batch=False):
  """Reads and optionally parses TFRecord files into a dataset.

  Provides common functionality such as batching, optional parsing, shuffling,
  and performant defaults.

  Args:
    file_pattern: List of files or patterns of TFRecord file paths.
      See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    parser_fn: (Optional.) A function accepting string input to parse
      and process the record contents. This function must map records
      to components of a fixed shape, so they may be batched. By
      default, uses the record contents unmodified.
    num_epochs: (Optional.) An int specifying the number of times this
      dataset is repeated.  If None (the default), cycles through the
      dataset forever.
    shuffle: (Optional.) A bool that indicates whether the input
      should be shuffled. Defaults to `True`.
    shuffle_buffer_size: (Optional.) Buffer size to use for
      shuffling. A large buffer size ensures better shuffling, but
      increases memory usage and startup time.
    shuffle_seed: (Optional.) Randomization seed to use for shuffling.
    prefetch_buffer_size: (Optional.) An int specifying the number of
      feature batches to prefetch for performance improvement.
      Defaults to auto-tune. Set to 0 to disable prefetching.
    num_parallel_reads: (Optional.) Number of threads used to read
      records from files. By default or if set to a value >1, the
      results will be interleaved.
    num_parallel_parser_calls: (Optional.) Number of records to parse in
      parallel. Defaults to an automatic selection.
    drop_final_batch: (Optional.) Whether the last batch should be
      dropped in case its size is smaller than `batch_size`; the
      default behavior is not to drop the smaller batch.

  Returns:
    A dataset, where each element matches the output of `parser_fn`
    except it will have an additional leading `batch_size` dimension,
    or a `batch_size`-length 1-D tensor of strings if `parser_fn` is
    unspecified.
  """
  files = dataset_ops.Dataset.list_files(
      file_pattern, shuffle=shuffle, seed=shuffle_seed)

  if num_parallel_reads is None:
    # Note: We considered auto-tuning this value, but there is a concern
    # that this affects the mixing of records from different files, which
    # could affect training convergence/accuracy, so we are defaulting to
    # a constant for now.
    num_parallel_reads = 24
  dataset = core_readers.TFRecordDataset(
      files, num_parallel_reads=num_parallel_reads)

  if shuffle_buffer_size is None:
    # TODO(josh11b): Auto-tune this value when not specified
    shuffle_buffer_size = 10000
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  if parser_fn is None:
    if drop_final_batch:
      dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
    else:
      dataset = dataset.batch(batch_size)
  else:
    # TODO(josh11b): if num_parallel_parser_calls is None, use some function
    # of num cores instead of map_and_batch's default behavior of one batch.
    dataset = dataset.apply(batching.map_and_batch(
        parser_fn, batch_size, num_parallel_calls=num_parallel_parser_calls,
        drop_remainder=drop_final_batch))

  if prefetch_buffer_size is None:
    prefetch_buffer_size = -1  # tf.config.data.AUTOTUNE
  if prefetch_buffer_size == 0:
    return dataset
  else:
    return dataset.prefetch(buffer_size=prefetch_buffer_size)
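
A minimal usage sketch (hypothetical file glob) of the reader above:

  # Fixed-size batches of raw TFRecord strings; the final partial batch is
  # dropped so downstream shapes stay static.
  dataset = make_tf_record_dataset(
      file_pattern='/tmp/data-*.tfrecord',
      batch_size=64,
      drop_final_batch=True)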
Example #15
def StreamingFilesDataset(files,
                          filetype=None,
                          file_reader_job=None,
                          worker_job=None,
                          num_epochs=None,
                          filename_shuffle_buffer_size=None,
                          num_parallel_reads=None,
                          batch_transfer_size=None,
                          sloppy=None):
  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).

  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
  files local to your GCE VM. In order to train using files stored on your local
  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
  helper to generate a dataset to feed your Cloud TPU with files from your GCE
  VM.

  The resulting dataset may raise an `OutOfRangeError` if no files are found as
  a result of the fileglob expansion.

  Note: StreamingFilesDataset assumes that the session is using a
  TPUClusterResolver and therefore has a worker and a coordinator job. File
  loading will be done on the coordinator job.

  Args:
    files: A string glob to match files, or a `tf.data.Dataset` generating file
      names.
    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
      TensorFlow function that when given a filename returns a dataset.
    file_reader_job: An optional string that corresponds to the job that should
      perform the file reads.
    worker_job: An optional string that corresponds to the job that should
      process the tensors (i.e. your GPU or TPU worker).
    num_epochs: The number of epochs through the training set that should be
      generated. By default, it will repeat infinitely.
    filename_shuffle_buffer_size: An optional integer whose value controls the
      shuffling of the file names. If you would like to read from the files in
      the same order, set to 0 or False.
    num_parallel_reads: An optional integer controlling the number of files to
      read from concurrently. (Set to 1 for no parallelism.)
    batch_transfer_size: An optional integer controlling the batching used to
      amortize the remote function invocation overhead. Set to a very large
      number to increase throughput. Set to a very small number to reduce memory
      consumption. Set to False to skip batching.
    sloppy: (Optional.) If `True`, read input data as fast as possible, without
      maintaining a deterministic order. Defaults to `False`.
  Returns:
    A `tf.data.Dataset` with an infinite stream of elements generated by a
    parallel interleaving of the set of files matched (or generated) by `files`,
    whose element type is that of the dataset specified by `filetype`.

  Raises:
    ValueError: if any argument is not of the expected type.
  """
  if filetype is None:
    filetype = 'tfrecord'

  if isinstance(filetype, str):
    if filetype not in _FILETYPE_MAP:
      raise ValueError('Unexpected filetype: %s' % filetype)
    reader_fn = _FILETYPE_MAP[filetype]
  elif callable(filetype):
    reader_fn = filetype
  else:
    raise ValueError('filetype should be a string or a callable')

  file_reader_job = file_reader_job or 'coordinator'

  worker_job = worker_job or 'tpu_worker'

  if filename_shuffle_buffer_size is None:
    filename_shuffle_buffer_size = 4096

  num_parallel_reads = num_parallel_reads or 8

  if batch_transfer_size is None:
    batch_transfer_size = 1024

  if sloppy is None:
    sloppy = False

  with ops.device('/job:%s' % file_reader_job):
    if isinstance(files, str):
      source_dataset = dataset_ops.Dataset.list_files(files)
    elif isinstance(files, dataset_ops.Dataset):
      source_dataset = files
    else:
      raise ValueError('files was not a string or a dataset: %s' % files)

    if filename_shuffle_buffer_size:
      source_dataset = source_dataset.shuffle(
          buffer_size=filename_shuffle_buffer_size)

    # NOTE: We perform the `repeat` on the source dataset, because the output
    # dataset does not currently have enough information to recreate an iterator
    # over the source dataset when it reaches the end.
    source_dataset = source_dataset.repeat(num_epochs)

    source_dataset = source_dataset.apply(
        interleave_ops.parallel_interleave(
            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))

    if batch_transfer_size:
      # Note: we can safely call batch_and_drop_remainder because we have an
      # infinite stream of TFRecords.
      source_dataset = source_dataset.apply(
          batching.batch_and_drop_remainder(batch_transfer_size))

    source_dataset = source_dataset.prefetch(1)

    source_iterator = source_dataset.make_one_shot_iterator()
    source_handle = source_iterator.string_handle()

  @function.Defun(dtypes.string)
  def LoadingFunc(h):
    remote_iterator = iterator_ops.Iterator.from_string_handle(
        h, source_dataset.output_types, source_dataset.output_shapes)
    return remote_iterator.get_next()

  def MapFn(unused_input):
    return functional_ops.remote_call(
        args=[source_handle],
        Tout=[dtypes.string],
        f=LoadingFunc,
        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)

  with ops.device('/job:%s' % worker_job):
    # TODO(saeta,mrry): Switch to using _GeneratorDataset.

    # identity = lambda x: x
    # dummy = constant_op.constant(0)
    # output_dataset = dataset_ops._GeneratorDataset(dummy, identity, MapFn,
    #                                                identity)

    output_dataset = dataset_ops.Dataset.range(2).repeat().map(MapFn)
    output_dataset = output_dataset.prefetch(1)

    if batch_transfer_size:
      # Undo the batching used during the transfer.
      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)

  return output_dataset
Example #17
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=1,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"]
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be
      called with a `filenames` tensor and (optional) `reader_args` and returns
      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to `None`.
    shuffle: A boolean, indicates whether the input should be shuffled. Defaults
      to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step (default is 1).
    reader_num_threads: Number of threads used to read `Example` records. If >1,
      the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements. Each `dict` maps feature keys to
    `Tensor` or `SparseTensor` objects.
  """
  # Create dataset of all matching filenames
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Read `Example` records from files as tensor objects.
  if reader_args is None:
    reader_args = []

  # Read files sequentially (if reader_num_threads=1) or in parallel
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          lambda filename: reader(filename, *reader_args),
          cycle_length=reader_num_threads,
          sloppy=sloppy_ordering))

  # Extract values if the `Example` tensors are stored as key-value tuples.
  if dataset.output_types == (dtypes.string, dtypes.string):
    dataset = dataset.map(lambda _, v: v)

  # Apply dataset repeat and shuffle transformations.
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  if drop_final_batch:
    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
  else:
    dataset = dataset.batch(batch_size)

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.map(
      lambda x: parsing_ops.parse_example(x, features),
      num_parallel_calls=parser_num_threads)

  # TODO(rachelim): Add an optional label_name argument for extracting the label
  # from the features dictionary, to comply with the type expected by the
  # input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate` function.
  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset