def dataset_fn(self, input_values, cycle_length, block_length, sloppy,
                 buffer_output_elements, prefetch_input_elements):

    def map_py_fn(x):
      self.write_coordination_events[x].wait()
      self.write_coordination_events[x].clear()
      self.read_coordination_events[x].release()
      if self.error:
        err = self.error
        self.error = None
        raise err  # pylint: disable=raising-bad-type
      return x * x

    def map_fn(x):
      return script_ops.py_func(map_py_fn, [x], x.dtype)

    def interleave_fn(x):
      dataset = dataset_ops.Dataset.from_tensors(x)
      dataset = dataset.repeat(x)
      return dataset.map(map_fn)

    return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
        self.repeat_count).apply(
            interleave_ops.parallel_interleave(
                interleave_fn, cycle_length, block_length, sloppy,
                buffer_output_elements, prefetch_input_elements))
  def testWorkersGreaterThanNumFiles(self):
    dataset = dataset_ops.Dataset.list_files(self.test_filenames)
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
    dataset = dataset.batch(5)
    dataset = distribute._AutoShardDataset(dataset, 500, 499)
    self.assertDatasetProduces(dataset, [])
Example 3
  def _make_parallel_scan_dataset(self, ds, num_parallel_scans,
                                  normalized_probability, normalized_columns):
    """Builds a parallel dataset from a given range.

    Args:
      ds: A `_BigtableSampleKeyPairsDataset` returning ranges of keys to use.
      num_parallel_scans: The number of concurrent parallel scans to use.
      normalized_probability: A number between 0 and 1 for the keep probability.
      normalized_columns: The column families and column qualifiers to retrieve.

    Returns:
      A `tf.data.Dataset` representing the result of the parallel scan.
    """
    if num_parallel_scans is None:
      num_parallel_scans = 50

    ds = ds.shuffle(buffer_size=10000)  # TODO(saeta): Make configurable.

    def _interleave_fn(start, end):
      return _BigtableScanDataset(
          self,
          prefix="",
          start=start,
          end=end,
          normalized=normalized_columns,
          probability=normalized_probability)

    # Note: `prefetch_input_elements` must be set in order to avoid RPC timeouts.
    ds = ds.apply(
        interleave_ops.parallel_interleave(
            _interleave_fn,
            cycle_length=num_parallel_scans,
            sloppy=True,
            prefetch_input_elements=1))
    return ds
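The same fan-out pattern can be reproduced with plain tf.data and no Bigtable dependency. The sketch below is illustrative only: `rows_for_range` is a hypothetical stand-in for `_BigtableScanDataset`, the key ranges are made up, and it assumes the deprecated `tf.data.experimental.parallel_interleave` is still available in the installed TensorFlow version.

import tensorflow as tf

# One input element per key range, mirroring the (start, end) pairs that
# _BigtableSampleKeyPairsDataset produces.
key_ranges = tf.data.Dataset.from_tensor_slices(
    (tf.constant([0, 10, 20], tf.int64), tf.constant([10, 20, 30], tf.int64)))

def rows_for_range(start, end):
  # Hypothetical stand-in for a per-range scan: just yields the keys in the range.
  return tf.data.Dataset.range(start, end)

rows = key_ranges.apply(
    tf.data.experimental.parallel_interleave(
        rows_for_range,
        cycle_length=3,              # number of concurrent "scans"
        sloppy=True,                 # tolerate out-of-order output from slow ranges
        prefetch_input_elements=1))  # open the next range early (see the RPC note above)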
  def dataset_fn(input_values, cycle_length, block_length, sloppy,
                 buffer_output_elements, prefetch_input_elements):
    return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
        self.repeat_count).apply(
            interleave_ops.parallel_interleave(
                interleave_fn, cycle_length, block_length, sloppy,
                buffer_output_elements, prefetch_input_elements))
  def testShutdownRace(self):
    dataset = dataset_ops.Dataset.range(20)
    map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1))
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            map_fn,
            cycle_length=3,
            sloppy=False,
            buffer_output_elements=1,
            prefetch_input_elements=0))
    dataset = dataset.batch(32)
    iterator = dataset.make_initializable_iterator()
    next_element = iterator.get_next()

    results = []
    with self.cached_session() as sess:
      for _ in range(2):
        elements = []
        sess.run(iterator.initializer)
        try:
          while True:
            elements.extend(sess.run(next_element))
        except errors.OutOfRangeError:
          pass
        results.append(elements)

    self.assertAllEqual(results[0], results[1])
    def dataset_fn(self, input_values, cycle_length, block_length, sloppy,
                   buffer_output_elements, prefetch_input_elements):
        def map_py_fn(x):
            self.write_coordination_events[x].wait()
            self.write_coordination_events[x].clear()
            self.read_coordination_events[x].release()
            if self.error:
                err = self.error
                self.error = None
                raise err  # pylint: disable=raising-bad-type
            return x * x

        def map_fn(x):
            return script_ops.py_func(map_py_fn, [x], x.dtype)

        def interleave_fn(x):
            dataset = dataset_ops.Dataset.from_tensors(x)
            dataset = dataset.repeat(x)
            return dataset.map(map_fn)

        return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
            self.repeat_count).apply(
                interleave_ops.parallel_interleave(interleave_fn, cycle_length,
                                                   block_length, sloppy,
                                                   buffer_output_elements,
                                                   prefetch_input_elements))
    def testSparse(self):
        def _map_fn(i):
            return sparse_tensor.SparseTensor(indices=[[0, 0], [1, 1]],
                                              values=(i * [1, -1]),
                                              dense_shape=[2, 2])

        def _interleave_fn(x):
            return dataset_ops.Dataset.from_tensor_slices(
                sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))

        dataset = dataset_ops.Dataset.range(10).map(_map_fn)
        iterator = dataset_ops.make_initializable_iterator(
            dataset.apply(
                interleave_ops.parallel_interleave(_interleave_fn,
                                                   cycle_length=1)))
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            self.evaluate(init_op)
            for i in range(10):
                for j in range(2):
                    expected = [i, 0] if j % 2 == 0 else [0, -i]
                    self.assertAllEqual(expected, self.evaluate(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                self.evaluate(get_next)
Example 9
    def parallel_read_rows(self,
                           cycle_length=None,
                           sloppy=False,
                           block_length=1):
        """Retrieves rows from the BigQuery service in parallel streams.

    ```
    bq_client = BigQueryClient()
    bq_read_session = bq_client.read_session(...)
    ds1 = bq_read_session.parallel_read_rows(...)
    ```
    Args:
      cycle_length: The number of threads to run in parallel. If not specified,
        it defaults to the number of streams in the read session.
      sloppy: If false, elements are produced in deterministic order. Otherwise,
        the implementation is allowed, for the sake of expediency, to produce
        elements in a non-deterministic order.
      block_length: The number of consecutive elements to pull from an input
        `Dataset` before advancing to the next input `Dataset`.

    Returns:
      A `tf.data.Dataset` returning the row keys and the cell contents.

    Raises:
      ValueError: If the configured probability is unexpected.

    """
        if cycle_length is None:
            cycle_length = self._requested_streams
        streams_ds = dataset_ops.Dataset.from_tensor_slices(self._streams)
        return streams_ds.apply(
            interleave_ops.parallel_interleave(self.read_rows,
                                               cycle_length=cycle_length,
                                               sloppy=sloppy,
                                               block_length=block_length))
    def test_no_stateful_ops_interleave(self, use_function,
                                        use_legacy_interleave):
        self._set_seed()
        with test_util.deterministic_ops():

            def interleave_fn(x):
                del x
                return dataset_ops.Dataset.range(2)

            if use_function:
                interleave_fn = def_function.function(interleave_fn)

            dataset = dataset_ops.Dataset.range(5)
            if use_legacy_interleave:
                dataset = dataset.apply(
                    testing.assert_next(["LegacyParallelInterleaveV2"]))
                dataset = dataset.apply(
                    interleave_ops.parallel_interleave(interleave_fn,
                                                       cycle_length=5))
            else:
                dataset = dataset.apply(
                    testing.assert_next(["ParallelInterleave"]))
                dataset = dataset.interleave(interleave_fn,
                                             cycle_length=5,
                                             num_parallel_calls=3)
            options = options_lib.Options()
            options.experimental_optimization.apply_default_optimizations = False
            dataset = dataset.with_options(options)
            self.evaluate(variables.global_variables_initializer())
            self.assertDatasetProduces(dataset,
                                       expected_output=[0] * 5 + [1] * 5)
  def testShutdownRace(self):
    dataset = dataset_ops.Dataset.range(20)
    map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1))
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            map_fn,
            cycle_length=3,
            sloppy=False,
            buffer_output_elements=1,
            prefetch_input_elements=0))
    dataset = dataset.batch(32)
    iterator = dataset.make_initializable_iterator()
    next_element = iterator.get_next()

    results = []
    with self.cached_session() as sess:
      for _ in range(2):
        elements = []
        self.evaluate(iterator.initializer)
        try:
          while True:
            elements.extend(sess.run(next_element))
        except errors.OutOfRangeError:
          pass
        results.append(elements)

    self.assertAllEqual(results[0], results[1])
    def test_stateful_ops_interleave(self, use_function,
                                     use_legacy_interleave):
        with test_util.deterministic_ops():

            v = variables.Variable(0.)

            def map_fn(x):
                v.assign_add(1.)
                return (x, v.read_value())

            def interleave_fn(x):
                del x
                return dataset_ops.Dataset.range(2).map(map_fn)

            if use_function:
                map_fn = def_function.function(map_fn)
                interleave_fn = def_function.function(interleave_fn)

            dataset = dataset_ops.Dataset.range(5)
            if use_legacy_interleave:
                dataset = dataset.apply(
                    interleave_ops.parallel_interleave(interleave_fn,
                                                       cycle_length=5))
            else:
                dataset = dataset.interleave(interleave_fn,
                                             cycle_length=5,
                                             num_parallel_calls=3)
            options = options_lib.Options()
            options.experimental_optimization.apply_default_optimizations = False
            dataset = dataset.with_options(options)
            self.evaluate(variables.global_variables_initializer())
            expected_output = list(zip([0] * 5 + [1] * 5, range(1, 11)))
            self.assertDatasetProduces(dataset,
                                       expected_output=expected_output,
                                       requires_initialization=True)
Example 13
    def _make_parallel_scan_dataset(self, ds, num_parallel_scans,
                                    normalized_probability,
                                    normalized_columns):
        """Builds a parallel dataset from a given range.

    Args:
      ds: A `_BigtableSampleKeyPairsDataset` returning ranges of keys to use.
      num_parallel_scans: The number of concurrent parallel scans to use.
      normalized_probability: A number between 0 and 1 for the keep probability.
      normalized_columns: The column families and column qualifiers to retrieve.

    Returns:
      A `tf.data.Dataset` representing the result of the parallel scan.
    """
        if num_parallel_scans is None:
            num_parallel_scans = 50

        ds = ds.shuffle(buffer_size=10000)  # TODO(saeta): Make configurable.

        def _interleave_fn(start, end):
            return _BigtableScanDataset(self,
                                        prefix="",
                                        start=start,
                                        end=end,
                                        normalized=normalized_columns,
                                        probability=normalized_probability)

        # Note: `prefetch_input_elements` must be set in order to avoid RPC timeouts.
        ds = ds.apply(
            interleave_ops.parallel_interleave(_interleave_fn,
                                               cycle_length=num_parallel_scans,
                                               sloppy=True,
                                               prefetch_input_elements=1))
        return ds
  def dataset_fn(input_values, cycle_length, block_length, sloppy,
                 buffer_output_elements, prefetch_input_elements):
    return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
        self.repeat_count).apply(
            interleave_ops.parallel_interleave(
                interleave_fn, cycle_length, block_length, sloppy,
                buffer_output_elements, prefetch_input_elements))
  def build_dataset():
    dataset = dataset_ops.Dataset.list_files(self._filenames, shuffle=False)
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
    dataset = distribute._AutoShardDataset(dataset, 5, 3)
    return dataset
Example 16
  def testZipReaderPipeline(self):
    dataset1 = dataset_ops.Dataset.list_files(
        self.test_filenames, shuffle=False)
    dataset1 = dataset1.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
    dataset2 = dataset_ops.Dataset.list_files(
        self.test_filenames, shuffle=False)
    dataset2 = dataset2.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))

    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
    dataset = distribute._AutoShardDataset(dataset, 5, 3)

    expected = [
        (b"Record %d of file %d" % (r, f), b"Record %d of file %d" % (r, f))  # pylint:disable=g-complex-comprehension
        for r in range(0, 10)
        for f in (3, 8)
    ]

    self.assertDatasetProduces(dataset, expected)
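For orientation: `distribute._AutoShardDataset(dataset, 5, 3)` in these tests asks the auto-shard rewrite to split the input files across 5 workers and keep the shard for worker index 3; with the 10-file fixture used here, that leaves files 3 and 8, which is why the expected records iterate over `f in (3, 8)`. A minimal sketch of the same index selection with the public `Dataset.shard` API (this only illustrates which indices survive, it is not the internal rewrite itself):

import tensorflow as tf

# Ten file indices sharded across 5 workers; worker 3 keeps indices 3 and 8.
file_indices = tf.data.Dataset.range(10).shard(num_shards=5, index=3)
print(list(file_indices.as_numpy_iterator()))  # [3, 8]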
Example 18
def parallel_interleave(map_func,
                        cycle_length,
                        block_length=1,
                        sloppy=False,
                        buffer_output_elements=None,
                        prefetch_input_elements=None):
    """A parallel version of the `Dataset.interleave()` transformation.

  `parallel_interleave()` maps `map_func` across its input to produce nested
  datasets, and outputs their elements interleaved. Unlike
  `tf.data.Dataset.interleave`, it gets elements from `cycle_length` nested
  datasets in parallel, which increases the throughput, especially in the
  presence of stragglers. Furthermore, the `sloppy` argument can be used to
  improve performance, by relaxing the requirement that the outputs are produced
  in a deterministic order, and allowing the implementation to skip over nested
  datasets whose elements are not readily available when requested.

  Example usage:

  ```python
  # Preprocess 4 files concurrently.
  filenames = tf.data.Dataset.list_files("/path/to/data/train*.tfrecords")
  dataset = filenames.apply(
      tf.contrib.data.parallel_interleave(
          lambda filename: tf.data.TFRecordDataset(filename),
          cycle_length=4))
  ```

  WARNING: If `sloppy` is `True`, the order of produced elements is not
  deterministic.

  Args:
    map_func: A function mapping a nested structure of tensors to a `Dataset`.
    cycle_length: The number of input `Dataset`s to interleave from in parallel.
    block_length: The number of consecutive elements to pull from an input
      `Dataset` before advancing to the next input `Dataset`.
    sloppy: If false, elements are produced in deterministic order. Otherwise,
      the implementation is allowed, for the sake of expediency, to produce
      elements in a non-deterministic order.
    buffer_output_elements: The number of elements each iterator being
      interleaved should buffer (similar to the `.prefetch()` transformation for
      each interleaved iterator).
    prefetch_input_elements: The number of input elements to transform to
      iterators before they are needed for interleaving.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """
    return interleave_ops.parallel_interleave(map_func, cycle_length,
                                              block_length, sloppy,
                                              buffer_output_elements,
                                              prefetch_input_elements)
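`parallel_interleave` has been deprecated for a long time; in current TensorFlow the core `tf.data.Dataset.interleave` covers the same ground, with `num_parallel_calls` providing the parallelism and `deterministic` playing the role of `sloppy` (the `buffer_output_elements`/`prefetch_input_elements` knobs have no direct counterpart there; extra buffering is usually added with `.prefetch()`). A minimal, self-contained sketch of the rewrite on a toy dataset, assuming a reasonably recent TF 2.x where `tf.data.AUTOTUNE` and the `deterministic` argument exist:

import tensorflow as tf

source = tf.data.Dataset.range(1, 5)  # stands in for a dataset of filenames

dataset = source.interleave(
    # Each input element expands into its own nested dataset.
    lambda x: tf.data.Dataset.from_tensors(x).repeat(3),
    cycle_length=4,                       # nested datasets consumed concurrently
    block_length=1,                       # elements pulled per dataset per turn
    num_parallel_calls=tf.data.AUTOTUNE,  # parallelism, like parallel_interleave
    deterministic=False)                  # counterpart of sloppy=True

print(sorted(dataset.as_numpy_iterator()))  # the multiset of elements is fixed,
                                            # only their order may vary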
Example 19
def sloppy_interleave(map_func, cycle_length, block_length=1):
    """A non-deterministic version of the `Dataset.interleave()` transformation.

  `sloppy_interleave()` maps `map_func` across `dataset`, and
  non-deterministically interleaves the results.

  The resulting dataset is almost identical to `interleave`. The key
  difference is that if retrieving a value from a given output iterator would
  cause `get_next` to block, that iterator will be skipped, and consumed
  when next available. If consuming from all iterators would cause the
  `get_next` call to block, the `get_next` call blocks until the first value is
  available.

  If the underlying datasets produce elements as fast as they are consumed, the
  `sloppy_interleave` transformation behaves identically to `interleave`.
  However, if an underlying dataset would block the consumer,
  `sloppy_interleave` can violate the round-robin order (that `interleave`
  strictly obeys), producing an element from a different underlying
  dataset instead.

  Example usage:

  ```python
  # Preprocess 4 files concurrently.
  filenames = tf.data.Dataset.list_files("/path/to/data/train*.tfrecords")
  dataset = filenames.apply(
      tf.contrib.data.sloppy_interleave(
          lambda filename: tf.data.TFRecordDataset(filename),
          cycle_length=4))
  ```

  WARNING: The order of elements in the resulting dataset is not
  deterministic. Use `Dataset.interleave()` if you want the elements to have a
  deterministic order.

  Args:
    map_func: A function mapping a nested structure of tensors (having shapes
      and types defined by `self.output_shapes` and `self.output_types`) to a
      `Dataset`.
    cycle_length: The number of input `Dataset`s to interleave from in parallel.
    block_length: The number of consecutive elements to pull from an input
      `Dataset` before advancing to the next input `Dataset`. Note:
      `sloppy_interleave` will skip the remainder of elements in the
      `block_length` in order to avoid blocking.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """
    return interleave_ops.parallel_interleave(map_func,
                                              cycle_length,
                                              block_length,
                                              sloppy=True)
Example 20
def parallel_interleave(map_func,
                        cycle_length,
                        block_length=1,
                        sloppy=False,
                        buffer_output_elements=None,
                        prefetch_input_elements=None):
  """A parallel version of the `Dataset.interleave()` transformation.

  `parallel_interleave()` maps `map_func` across its input to produce nested
  datasets, and outputs their elements interleaved. Unlike
  `tf.data.Dataset.interleave`, it gets elements from `cycle_length` nested
  datasets in parallel, which increases the throughput, especially in the
  presence of stragglers. Furthermore, the `sloppy` argument can be used to
  improve performance, by relaxing the requirement that the outputs are produced
  in a deterministic order, and allowing the implementation to skip over nested
  datasets whose elements are not readily available when requested.

  Example usage:

  ```python
  # Preprocess 4 files concurrently.
  filenames = tf.data.Dataset.list_files("/path/to/data/train*.tfrecords")
  dataset = filenames.apply(
      tf.data.experimental.parallel_interleave(
          lambda filename: tf.data.TFRecordDataset(filename),
          cycle_length=4))
  ```

  WARNING: If `sloppy` is `True`, the order of produced elements is not
  deterministic.

  Args:
    map_func: A function mapping a nested structure of tensors to a `Dataset`.
    cycle_length: The number of input `Dataset`s to interleave from in parallel.
    block_length: The number of consecutive elements to pull from an input
      `Dataset` before advancing to the next input `Dataset`.
    sloppy: If false, elements are produced in deterministic order. Otherwise,
      the implementation is allowed, for the sake of expediency, to produce
      elements in a non-deterministic order.
    buffer_output_elements: The number of elements each iterator being
      interleaved should buffer (similar to the `.prefetch()` transformation for
      each interleaved iterator).
    prefetch_input_elements: The number of input elements to transform to
      iterators before they are needed for interleaving.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """
  return interleave_ops.parallel_interleave(
      map_func, cycle_length, block_length, sloppy, buffer_output_elements,
      prefetch_input_elements)
Example 21
  def testConcatenateReaderPipeline(self, shuffle):
    dataset1 = dataset_ops.Dataset.list_files(
        self.test_filenames, shuffle=shuffle)
    dataset1 = dataset1.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
    dataset1 = dataset1.batch(5)
    dataset2 = dataset_ops.Dataset.list_files(
        self.test_filenames, shuffle=shuffle)
    dataset2 = dataset2.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
    dataset2 = dataset2.batch(5)

    dataset = dataset1.concatenate(dataset2)
    dataset = distribute._AutoShardDataset(dataset, 5, 3)

    expected = [
        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for r in range(0, 10)
        for f in (3, 8)
    ]
    expected += expected
    self.assertDatasetProducesWithShuffle(dataset, expected, 5, 8, shuffle)
    def setUp(self):

        self.input_values = array_ops.placeholder(dtypes.int64, shape=[None])
        self.cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
        self.block_length = array_ops.placeholder(dtypes.int64, shape=[])
        self.sloppy = array_ops.placeholder(dtypes.bool, shape=[])
        self.buffer_output_elements = array_ops.placeholder(dtypes.int64,
                                                            shape=[])
        self.prefetch_input_elements = array_ops.placeholder(dtypes.int64,
                                                             shape=[])

        self.error = None
        self.repeat_count = 2

        # Set up threading events used to sequence when items are produced that
        # are subsequently interleaved. These events allow us to deterministically
        # simulate slowdowns and force sloppiness.
        self.read_coordination_events = {}
        self.write_coordination_events = {}
        # input values [4, 5, 6] are the common case for the tests; set defaults
        for i in range(4, 7):
            self.read_coordination_events[i] = threading.Semaphore(0)
            self.write_coordination_events[i] = threading.Event()

        def map_py_fn(x):
            self.write_coordination_events[x].wait()
            self.write_coordination_events[x].clear()
            self.read_coordination_events[x].release()
            if self.error:
                err = self.error
                self.error = None
                raise err  # pylint: disable=raising-bad-type
            return x * x

        def map_fn(x):
            return script_ops.py_func(map_py_fn, [x], x.dtype)

        def interleave_fn(x):
            dataset = dataset_ops.Dataset.from_tensors(x)
            dataset = dataset.repeat(x)
            return dataset.map(map_fn)

        self.dataset = (dataset_ops.Dataset.from_tensor_slices(
            self.input_values).repeat(self.repeat_count).apply(
                interleave_ops.parallel_interleave(
                    interleave_fn, self.cycle_length, self.block_length,
                    self.sloppy, self.buffer_output_elements,
                    self.prefetch_input_elements)))
        self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
        self.init_op = self.iterator.initializer
        self.next_element = self.iterator.get_next()
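The comment in `setUp` describes the coordination protocol the tests rely on: `map_py_fn` blocks on a per-value `threading.Event` until the test allows it to produce, then signals a `threading.Semaphore` so the test can tell that the element was actually generated. A tiny pure-Python sketch of that handshake, with no TensorFlow involved; the values `[4, 5, 6]` mirror the defaults above, everything else is illustrative:

import threading

write_event = threading.Event()           # test -> producer: "emit one element now"
read_semaphore = threading.Semaphore(0)   # producer -> test: "one element emitted"
produced = []

def producer():
  for value in [4, 5, 6]:
    write_event.wait()        # like map_py_fn waiting on write_coordination_events[x]
    write_event.clear()
    produced.append(value * value)
    read_semaphore.release()  # like releasing read_coordination_events[x]

worker = threading.Thread(target=producer)
worker.start()
for _ in range(3):
  write_event.set()           # allow exactly one element through
  read_semaphore.acquire()    # block until it has actually been produced
worker.join()
print(produced)               # [16, 25, 36]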
Example 24
def sloppy_interleave(map_func, cycle_length, block_length=1):
  """A non-deterministic version of the `Dataset.interleave()` transformation.

  `sloppy_interleave()` maps `map_func` across `dataset`, and
  non-deterministically interleaves the results.

  The resulting dataset is almost identical to `interleave`. The key
  difference is that if retrieving a value from a given output iterator would
  cause `get_next` to block, that iterator will be skipped, and consumed
  when next available. If consuming from all iterators would cause the
  `get_next` call to block, the `get_next` call blocks until the first value is
  available.

  If the underlying datasets produce elements as fast as they are consumed, the
  `sloppy_interleave` transformation behaves identically to `interleave`.
  However, if an underlying dataset would block the consumer,
  `sloppy_interleave` can violate the round-robin order (that `interleave`
  strictly obeys), producing an element from a different underlying
  dataset instead.

  Example usage:

  ```python
  # Preprocess 4 files concurrently.
  filenames = tf.data.Dataset.list_files("/path/to/data/train*.tfrecords")
  dataset = filenames.apply(
      tf.contrib.data.sloppy_interleave(
          lambda filename: tf.data.TFRecordDataset(filename),
          cycle_length=4))
  ```

  WARNING: The order of elements in the resulting dataset is not
  deterministic. Use `Dataset.interleave()` if you want the elements to have a
  deterministic order.

  Args:
    map_func: A function mapping a nested structure of tensors (having shapes
      and types defined by `self.output_shapes` and `self.output_types`) to a
      `Dataset`.
    cycle_length: The number of input `Dataset`s to interleave from in parallel.
    block_length: The number of consecutive elements to pull from an input
      `Dataset` before advancing to the next input `Dataset`. Note:
      `sloppy_interleave` will skip the remainder of elements in the
      `block_length` in order to avoid blocking.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """
  return interleave_ops.parallel_interleave(
      map_func, cycle_length, block_length, sloppy=True)
  def testErrorsInInputFn(self):

    def map_py_fn(x):
      if x == 5:
        raise ValueError()
      return x

    def map_fn(x):
      return script_ops.py_func(map_py_fn, [x], x.dtype)

    def interleave_fn(x):
      dataset = dataset_ops.Dataset.from_tensors(x)
      dataset = dataset.repeat(x)
      return dataset

    self.dataset = (
        dataset_ops.Dataset.from_tensor_slices(self.input_values).map(map_fn)
        .repeat(self.repeat_count).apply(
            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
                                               self.block_length, self.sloppy,
                                               self.buffer_output_elements,
                                               self.prefetch_input_elements)))

    self.iterator = self.dataset.make_initializable_iterator()
    self.init_op = self.iterator.initializer
    self.next_element = self.iterator.get_next()

    with self.cached_session() as sess:
      sess.run(
          self.init_op,
          feed_dict={
              self.input_values: [4, 5, 6],
              self.cycle_length: 2,
              self.block_length: 1,
              self.sloppy: False,
              self.buffer_output_elements: 1,
              self.prefetch_input_elements: 0,
          })
      for i, expected_element in enumerate(
          self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
        if expected_element == 5:
          with self.assertRaises(errors.InvalidArgumentError):
            sess.run(self.next_element)
        else:
          actual_element = sess.run(self.next_element)
          self.assertEqual(expected_element, actual_element,
                           "At index %s: %s expected, got: %s" %
                           (i, expected_element, actual_element))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(self.next_element)
  def setUp(self):

    self.input_values = array_ops.placeholder(dtypes.int64, shape=[None])
    self.cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
    self.block_length = array_ops.placeholder(dtypes.int64, shape=[])
    self.sloppy = array_ops.placeholder(dtypes.bool, shape=[])
    self.buffer_output_elements = array_ops.placeholder(dtypes.int64, shape=[])
    self.prefetch_input_elements = array_ops.placeholder(dtypes.int64, shape=[])

    self.error = None
    self.repeat_count = 2

    # Set up threading events used to sequence when items are produced that
    # are subsequently interleaved. These events allow us to deterministically
    # simulate slowdowns and force sloppiness.
    self.read_coordination_events = {}
    self.write_coordination_events = {}
    # input values [4, 5, 6] are the common case for the tests; set defaults
    for i in range(4, 7):
      self.read_coordination_events[i] = threading.Semaphore(0)
      self.write_coordination_events[i] = threading.Event()

    def map_py_fn(x):
      self.write_coordination_events[x].wait()
      self.write_coordination_events[x].clear()
      self.read_coordination_events[x].release()
      if self.error:
        err = self.error
        self.error = None
        raise err  # pylint: disable=raising-bad-type
      return x * x

    def map_fn(x):
      return script_ops.py_func(map_py_fn, [x], x.dtype)

    def interleave_fn(x):
      dataset = dataset_ops.Dataset.from_tensors(x)
      dataset = dataset.repeat(x)
      return dataset.map(map_fn)

    self.dataset = (
        dataset_ops.Dataset.from_tensor_slices(self.input_values)
        .repeat(self.repeat_count).apply(
            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
                                               self.block_length, self.sloppy,
                                               self.buffer_output_elements,
                                               self.prefetch_input_elements)))
    self.iterator = self.dataset.make_initializable_iterator()
    self.init_op = self.iterator.initializer
    self.next_element = self.iterator.get_next()
Example 28
  def testPipelineWithMap(self, shuffle):
    dataset = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=False)
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
    dataset = dataset.map(lambda x: string_ops.substr_v2(x, 2, 1000))
    dataset = dataset.batch(5)
    dataset = distribute._AutoShardDataset(dataset, 5, 3)

    expected = [
        b"cord %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for r in range(0, 10)
        for f in (3, 8)
    ]
    self.assertDatasetProducesWithShuffle(dataset, expected, 5, 4, shuffle)
    def testSampleResNetPipeline(self):
        dataset = dataset_ops.Dataset.list_files(self.test_filenames,
                                                 shuffle=True)
        dataset = dataset.apply(
            interleave_ops.parallel_interleave(core_readers.TFRecordDataset,
                                               10))
        dataset = dataset.batch(5)
        dataset = distribute._AutoShardDataset(dataset, 5, 3)

        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for r in range(0, 10) for f in (3, 8)
        ]
        self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
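`chunk` is not defined in this snippet. A plausible helper consistent with its use here, where `list(chunk(expected, 5))` turns 20 expected records into batches of 5, might look like the sketch below; this is an assumption about the test utility, not its actual source:

def chunk(lst, n):
  # Split `lst` into consecutive pieces of length `n`; the last piece may be shorter.
  for i in range(0, len(lst), n):
    yield lst[i:i + n]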
Example 31
    def apply_interleave(self, interleave_version, dataset, interleave_fn,
                         cycle_length, num_parallel_calls):
        if interleave_version == NON_PARALLEL:
            return dataset.interleave(interleave_fn, cycle_length=cycle_length)
        elif interleave_version == EXPERIMENTAL_PARALLEL:
            return dataset.apply(
                interleave_ops.parallel_interleave(interleave_fn,
                                                   cycle_length=cycle_length))
        elif interleave_version == CORE_PARALLEL:
            if not num_parallel_calls:
                num_parallel_calls = cycle_length
            return dataset.interleave(interleave_fn,
                                      cycle_length=cycle_length,
                                      num_parallel_calls=num_parallel_calls)
        else:
            raise ValueError("Unknown version: " + interleave_version)
  def testValidPipelineWithRangeDataset(self, shuffle):
    dataset = dataset_ops.Dataset.range(self._num_files)
    dataset = dataset.map(lambda n: string_ops.string_join(  # pylint:disable=g-long-lambda
        [self.get_temp_dir(),
         string_ops.string_format("/tf_record.{}.txt", [n])]))
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
    dataset = dataset.map(lambda x: string_ops.substr_v2(x, 2, 1000))
    dataset = dataset.batch(5)
    dataset = distribute._AutoShardDataset(dataset, 5, 3)

    expected = [
        b"cord %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for r in range(0, 10)
        for f in (3, 8)
    ]
    self.assertDatasetProducesWithShuffle(dataset, expected, 5, 4, shuffle)
Example 34
def manual_old_parallel_inteleave(filenames):
    filenames_list = gfile.Glob(filenames)
    files_dataset = dataset_ops.Dataset.from_tensor_slices(
        filenames_list).shuffle(len(filenames_list))

    dataset = files_dataset.apply(
        interleave_ops.parallel_interleave(
          lambda filename: tf.data.TFRecordDataset(filename, compression_type="GZIP"),
          cycle_length=ARGS.reader_num_threads,
          sloppy=ARGS.sloppy)) \
      .shuffle(10000) \
      .repeat(ARGS.num_epochs) \
      .batch(ARGS.batch_size) \
      .map(parse_and_transform, num_parallel_calls=ARGS.parser_num_threads)

    if ARGS.cache:
        dataset = dataset.cache()
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)
  def testSparse(self):
    def _map_fn(i):
      return sparse_tensor.SparseTensor(
          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])

    def _interleave_fn(x):
      return dataset_ops.Dataset.from_tensor_slices(
          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))

    dataset = dataset_ops.Dataset.range(10).map(_map_fn).apply(
        interleave_ops.parallel_interleave(_interleave_fn, cycle_length=1))
    get_next = self.getNext(dataset)

    for i in range(10):
      for j in range(2):
        expected = [i, 0] if j % 2 == 0 else [0, -i]
        self.assertAllEqual(expected, self.evaluate(get_next()))
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(get_next())
        def dataset_fn(delay_ms):
            def interleave_fn(x):
                ds = dataset_ops.Dataset.from_tensors(x)
                if math_ops.equal(x, 0):
                    ds = ds.apply(testing.sleep(delay_ms * 1000))
                else:
                    ds = ds.apply(testing.sleep(0))
                return ds

            dataset = dataset_ops.Dataset.from_tensor_slices(elements)
            dataset = dataset.apply(
                interleave_ops.parallel_interleave(interleave_fn,
                                                   cycle_length=10,
                                                   sloppy=sloppy))

            opts = options_lib.Options()
            opts.deterministic = global_determinism
            dataset = dataset.with_options(opts)
            return dataset
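The fragment above passes a per-transformation `sloppy` value and also sets `options.deterministic`, exercising the interplay between the local argument and the global option. When the local argument is left unspecified, `tf.data.Options.deterministic` decides the ordering on its own; a minimal standalone sketch of that, assuming a TF 2.x release where `Options.deterministic` is available:

import tensorflow as tf

dataset = tf.data.Dataset.range(5).interleave(
    lambda x: tf.data.Dataset.from_tensors(x).repeat(2),
    cycle_length=5,
    num_parallel_calls=tf.data.AUTOTUNE)  # `deterministic` deliberately left unset

options = tf.data.Options()
options.deterministic = False   # request sloppy ordering via the global option
dataset = dataset.with_options(options)

for element in dataset:         # order may now deviate from strict round-robin
  print(int(element))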
  def _testTooManyReaders(self, sloppy=False):

    def interleave_fn(x):
      dataset = dataset_ops.Dataset.from_tensors(x)
      dataset = dataset.repeat(math_ops.cast(x, dtype=dtypes.int64))
      return dataset

    dataset = dataset_ops.Dataset.from_tensor_slices([4, 5, 6])
    dataset = dataset.repeat(self.repeat_count)
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            interleave_fn, cycle_length=16, block_length=2, sloppy=sloppy))
    get_next = self.getNext(dataset)
    output_values = []
    for _ in range(30):
      output_values.append(self.evaluate(get_next()))

    expected_values = self._interleave(
        [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2)
    self.assertItemsEqual(output_values, expected_values)
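The expected values in these tests come from `self._interleave(...)`, a reference implementation of round-robin interleaving that is not shown in the snippets. A plausible pure-Python version is sketched below; it is an assumption about the helper's behavior (open at most `cycle_length` lists, pull `block_length` consecutive elements per turn, refill an exhausted slot from the remaining inputs), not a copy of the real test utility:

def reference_interleave(lists, cycle_length, block_length):
  """Yields elements of `lists` in plain round-robin interleave order.

  At most `cycle_length` lists are open at once; each open list contributes up
  to `block_length` consecutive elements per turn; a list that runs out frees
  its slot, which is refilled from the remaining inputs on the next pass.
  """
  pending = [iter(l) for l in lists]   # inputs not yet opened
  slots = [None] * cycle_length        # currently open iterators (None = free)
  num_open = 0
  while num_open or pending:
    for i in range(cycle_length):
      if slots[i] is None:
        if not pending:
          continue
        slots[i] = pending.pop(0)
        num_open += 1
      for _ in range(block_length):
        try:
          yield next(slots[i])
        except StopIteration:
          slots[i] = None
          num_open -= 1
          break

# With cycle_length=2 and block_length=1:
# list(reference_interleave([[4] * 4, [5], [6] * 6], 2, 1))
#   -> [4, 5, 4, 4, 6, 4, 6, 6, 6, 6, 6]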
Example 38
  def testTooManyReaders(self, sloppy=False):

    def interleave_fn(x):
      dataset = dataset_ops.Dataset.from_tensors(x)
      dataset = dataset.repeat(math_ops.cast(x, dtype=dtypes.int64))
      return dataset

    dataset = dataset_ops.Dataset.from_tensor_slices([4, 5, 6])
    dataset = dataset.repeat(self.repeat_count)
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            interleave_fn, cycle_length=16, block_length=2, sloppy=sloppy))
    get_next = self.getNext(dataset)
    output_values = []
    for _ in range(30):
      output_values.append(self.evaluate(get_next()))

    expected_values = self._interleave(
        [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2)
    self.assertCountEqual(output_values, expected_values)
  def _testTooManyReaders(self, sloppy=False):

    def interleave_fn(x):
      dataset = dataset_ops.Dataset.from_tensors(x)
      dataset = dataset.repeat(math_ops.cast(x, dtype=dtypes.int64))
      return dataset

    dataset = dataset_ops.Dataset.from_tensor_slices([4, 5, 6])
    dataset = dataset.repeat(self.repeat_count)
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            interleave_fn, cycle_length=16, block_length=2, sloppy=sloppy))
    iterator = dataset.make_one_shot_iterator()

    with self.cached_session() as sess:
      output_values = []
      for _ in range(30):
        output_values.append(sess.run(iterator.get_next()))

    expected_values = self._interleave(
        [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2)
    self.assertItemsEqual(output_values, expected_values)
    def testShutdownRace(self):
        dataset = dataset_ops.Dataset.range(20)
        map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1))
        dataset = dataset.apply(
            interleave_ops.parallel_interleave(map_fn,
                                               cycle_length=3,
                                               sloppy=False,
                                               buffer_output_elements=1,
                                               prefetch_input_elements=0))
        dataset = dataset.batch(32)

        results = []
        for _ in range(2):
            elements = []
            next_element = self.getNext(dataset)
            try:
                while True:
                    elements.extend(self.evaluate(next_element()))
            except errors.OutOfRangeError:
                pass
            results.append(elements)
        self.assertAllEqual(results[0], results[1])
  def testShutdownRace(self):
    dataset = dataset_ops.Dataset.range(20)
    map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1))
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            map_fn,
            cycle_length=3,
            sloppy=False,
            buffer_output_elements=1,
            prefetch_input_elements=0))
    dataset = dataset.batch(32)

    results = []
    for _ in range(2):
      elements = []
      next_element = self.getNext(dataset)
      try:
        while True:
          elements.extend(self.evaluate(next_element()))
      except errors.OutOfRangeError:
        pass
      results.append(elements)
    self.assertAllEqual(results[0], results[1])
  def testSparse(self):
    def _map_fn(i):
      return sparse_tensor.SparseTensor(
          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])

    def _interleave_fn(x):
      return dataset_ops.Dataset.from_tensor_slices(
          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))

    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
    iterator = dataset.apply(
        interleave_ops.parallel_interleave(
            _interleave_fn, cycle_length=1)).make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)
      for i in range(10):
        for j in range(2):
          expected = [i, 0] if j % 2 == 0 else [0, -i]
          self.assertAllEqual(expected, sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
Example 43
  def _build_ds(self, cycle_length, block_length, sloppy=False):
    return (dataset_ops.Dataset.from_tensor_slices(
        self.input_values).repeat(self.num_repeats).apply(
            interleave_ops.parallel_interleave(
                lambda x: dataset_ops.Dataset.range(10 * x, 11 * x),
                cycle_length, block_length, sloppy)))
Example 44
def make_batched_features_dataset_v2(file_pattern,
                                     batch_size,
                                     features,
                                     reader=core_readers.TFRecordDataset,
                                     label_key=None,
                                     reader_args=None,
                                     num_epochs=None,
                                     shuffle=True,
                                     shuffle_buffer_size=10000,
                                     shuffle_seed=None,
                                     prefetch_buffer_size=optimization.AUTOTUNE,
                                     reader_num_threads=1,
                                     parser_num_threads=2,
                                     sloppy_ordering=False,
                                     drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  If the `label_key` argument is provided, returns a `Dataset` of tuples, each
  comprising a feature dictionary and a label.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"]
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.io.gfile.glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.io.parse_example`.
    reader: A function or class that can be
      called with a `filenames` tensor and (optional) `reader_args` and returns
      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
    label_key: (Optional) A string corresponding to the key under which labels
      are stored in the `tf.Example` protos. If provided, it must be one of the
      `features` keys; otherwise a `ValueError` is raised.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to `None`.
    shuffle: A boolean, indicates whether the input should be shuffled. Defaults
      to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but increases memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step. Defaults to auto-tune.
    reader_num_threads: Number of threads used to read `Example` records. If >1,
      the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements, (or a tuple of `dict` elements and label).
    Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.

  Raises:
    TypeError: If `reader` is a `tf.compat.v1.ReaderBase` subclass.
    ValueError: If `label_key` is not one of the `features` keys.
  """
  # Create dataset of all matching filenames
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  if isinstance(reader, type) and issubclass(reader, io_ops.ReaderBase):
    raise TypeError("The `reader` argument must return a `Dataset` object. "
                    "`tf.ReaderBase` subclasses are not supported. For "
                    "example, pass `tf.data.TFRecordDataset` instead of "
                    "`tf.TFRecordReader`.")

  # Read `Example` records from files as tensor objects.
  if reader_args is None:
    reader_args = []

  # Read files sequentially (if reader_num_threads=1) or in parallel
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          lambda filename: reader(filename, *reader_args),
          cycle_length=reader_num_threads,
          sloppy=sloppy_ordering))

  # Extract values if the `Example` tensors are stored as key-value tuples.
  if dataset_ops.get_legacy_output_types(dataset) == (
      dtypes.string, dtypes.string):
    dataset = dataset_ops.MapDataset(
        dataset, lambda _, v: v, use_inter_op_parallelism=False)

  # Apply dataset repeat and shuffle transformations.
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
  # improve the shape inference, because it makes the batch dimension static.
  # It is safe to do this because in that case we are repeating the input
  # indefinitely, and all batches will be full-sized.
  dataset = dataset.batch(
      batch_size, drop_remainder=drop_final_batch or num_epochs is None)

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.apply(
      parsing_ops.parse_example_dataset(
          features, num_parallel_calls=parser_num_threads))

  if label_key:
    if label_key not in features:
      raise ValueError(
          "The `label_key` provided (%r) must be one of the `features` keys." %
          label_key)
    dataset = dataset.map(lambda x: (x, x.pop(label_key)))

  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset
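A small end-to-end usage sketch of the public wrapper, `tf.data.experimental.make_batched_features_dataset`, is shown below. It writes a two-record TFRecord file first so it runs standalone; the file path and feature names are illustrative, and it assumes TF 2.x eager execution:

import os
import tempfile
import tensorflow as tf

# Write two tiny Example protos so the pipeline has something to read.
path = os.path.join(tempfile.mkdtemp(), "toy.tfrecord")
with tf.io.TFRecordWriter(path) as writer:
  for age in (0, 1):
    example = tf.train.Example(features=tf.train.Features(feature={
        "age": tf.train.Feature(int64_list=tf.train.Int64List(value=[age])),
        "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[age % 2])),
    }))
    writer.write(example.SerializeToString())

dataset = tf.data.experimental.make_batched_features_dataset(
    file_pattern=path,
    batch_size=2,
    features={
        "age": tf.io.FixedLenFeature([], tf.int64, default_value=-1),
        "label": tf.io.FixedLenFeature([], tf.int64),
    },
    label_key="label",   # yields (features, label) tuples
    num_epochs=1,
    shuffle=False)

for features, label in dataset:
  print(features["age"].numpy(), label.numpy())  # [0 1] [0 1]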
Example 45
def make_csv_dataset_v2(
    file_pattern,
    batch_size,
    column_names=None,
    column_defaults=None,
    label_name=None,
    select_columns=None,
    field_delim=",",
    use_quote_delim=True,
    na_value="",
    header=True,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=None,
    prefetch_buffer_size=optimization.AUTOTUNE,
    num_parallel_reads=1,
    sloppy=False,
    num_rows_for_inference=100,
    compression_type=None,
    ignore_errors=False,
):
  """Reads CSV files into a dataset.

  Reads CSV files into a dataset, where each element is a (features, labels)
  tuple that corresponds to a batch of CSV rows. The features dictionary
  maps feature column names to `Tensor`s containing the corresponding
  feature data, and labels is a `Tensor` containing the batch's label data.

  Args:
    file_pattern: List of files or patterns of file paths containing CSV
      records. See `tf.io.gfile.glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    column_names: An optional list of strings that corresponds to the CSV
      columns, in order. One per column of the input record. If this is not
      provided, infers the column names from the first row of the records.
      These names will be the keys of the features dict of each dataset element.
    column_defaults: An optional list of default values for the CSV fields. One
      item per selected column of the input record. Each item in the list is
      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
      `Tensor` with one of the aforementioned types. The tensor can either be
      a scalar default value (if the column is optional), or an empty tensor (if
      the column is required). If a dtype is provided instead of a tensor, the
      column is also treated as required. If this list is not provided, tries
      to infer types based on reading the first num_rows_for_inference rows of
      files specified, and assumes all columns are optional, defaulting to `0`
      for numeric values and `""` for string values. If both this and
      `select_columns` are specified, these must have the same lengths, and
      `column_defaults` is assumed to be sorted in order of increasing column
      index.
    label_name: An optional string corresponding to the label column. If
      provided, the data for this column is returned as a separate `Tensor` from
      the features dictionary, so that the dataset complies with the format
      expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
      function.
    select_columns: An optional list of integer indices or string column
      names, that specifies a subset of columns of CSV data to select. If
      column names are provided, these must correspond to names provided in
      `column_names` or inferred from the file header lines. When this argument
      is specified, only a subset of CSV columns will be parsed and returned,
      corresponding to the columns specified. Using this results in faster
      parsing and lower memory usage. If both this and `column_defaults` are
      specified, these must have the same lengths, and `column_defaults` is
      assumed to be sorted in order of increasing column index.
    field_delim: An optional `string`. Defaults to `","`. Char delimiter to
      separate fields in a record.
    use_quote_delim: An optional bool. Defaults to `True`. If false, treats
      double quotation marks as regular characters inside of the string fields.
    na_value: Additional string to recognize as NA/NaN.
    header: A bool that indicates whether the first rows of provided CSV files
      correspond to header lines with column names, and should not be included
      in the data.
    num_epochs: An int specifying the number of times this dataset is repeated.
      If None, cycles through the dataset forever.
    shuffle: A bool that indicates whether the input should be shuffled.
    shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size
      ensures better shuffling, but increases memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: An int specifying the number of feature
      batches to prefetch for performance improvement. Recommended value is the
      number of batches consumed per training step. Defaults to auto-tune.

    num_parallel_reads: Number of threads used to read CSV records from files.
      If >1, the results will be interleaved.
    sloppy: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    num_rows_for_inference: Number of rows of a file to use for type inference
      if record_defaults is not provided. If None, reads all the rows of all
      the files. Defaults to 100.
    compression_type: (Optional.) A `tf.string` scalar evaluating to one of
      `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no compression.
    ignore_errors: (Optional.) If `True`, ignores errors with CSV file parsing,
      such as malformed data or empty lines, and moves on to the next valid
      CSV record. Otherwise, the dataset raises an error and stops processing
      when encountering any invalid records. Defaults to `False`.

  Returns:
    A dataset, where each element is a (features, labels) tuple that corresponds
    to a batch of `batch_size` CSV rows. The features dictionary maps feature
    column names to `Tensor`s containing the corresponding column data, and
    labels is a `Tensor` containing the column data for the label column
    specified by `label_name`.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  # Create dataset of all matching filenames
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Clean arguments; figure out column names and defaults

  if column_names is None:
    if not header:
      raise ValueError("Cannot infer column names without a header line.")
    # If column names are not provided, infer from the header lines
    column_names = _infer_column_names(filenames, field_delim, use_quote_delim)
  if len(column_names) != len(set(column_names)):
    raise ValueError("Cannot have duplicate column names.")

  if select_columns is not None:
    select_columns = _get_sorted_col_indices(select_columns, column_names)

  if column_defaults is not None:
    column_defaults = [
        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
        for x in column_defaults
    ]
  else:
    # If column defaults are not provided, infer from records at graph
    # construction time
    column_defaults = _infer_column_defaults(
        filenames, len(column_names), field_delim, use_quote_delim, na_value,
        header, num_rows_for_inference, select_columns)

  if select_columns is not None and len(column_defaults) != len(select_columns):
    raise ValueError(
        "If specified, column_defaults and select_columns must have same "
        "length."
    )
  if select_columns is not None and len(column_names) > len(select_columns):
    # Pick the relevant subset of column names
    column_names = [column_names[i] for i in select_columns]

  if label_name is not None and label_name not in column_names:
    raise ValueError("`label_name` provided must be one of the columns.")

  def filename_to_dataset(filename):
    dataset = CsvDataset(
        filename,
        record_defaults=column_defaults,
        field_delim=field_delim,
        use_quote_delim=use_quote_delim,
        na_value=na_value,
        select_cols=select_columns,
        header=header,
        compression_type=compression_type
    )
    if ignore_errors:
      dataset = dataset.apply(error_ops.ignore_errors())
    return dataset

  def map_fn(*columns):
    """Organizes columns into a features dictionary.

    Args:
      *columns: list of `Tensor`s corresponding to one csv record.
    Returns:
      An OrderedDict of feature names to values for that particular record. If
      label_name is provided, extracts the label feature to be returned as the
      second element of the tuple.
    """
    features = collections.OrderedDict(zip(column_names, columns))
    if label_name is not None:
      label = features.pop(label_name)
      return features, label
    return features

  # Read files sequentially (if num_parallel_reads=1) or in parallel
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))

  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  # Apply batch before map for perf, because map has high overhead relative
  # to the size of the computation in each map.
  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
  # improve the shape inference, because it makes the batch dimension static.
  # It is safe to do this because in that case we are repeating the input
  # indefinitely, and all batches will be full-sized.
  dataset = dataset.batch(batch_size=batch_size,
                          drop_remainder=num_epochs is None)
  dataset = dataset_ops.MapDataset(
      dataset, map_fn, use_inter_op_parallelism=False)
  dataset = dataset.prefetch(prefetch_buffer_size)

  return dataset
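For orientation, here is a minimal usage sketch of the function above; the file glob, column name, and eager-style iteration are illustrative assumptions, not taken from the surrounding source:

```python
# Hypothetical CSV files with a "label" column; the glob below is assumed.
dataset = make_csv_dataset_v2(
    file_pattern="/tmp/train-*.csv",
    batch_size=32,
    label_name="label",   # returned as the second element of each tuple
    num_epochs=1,         # single pass; the default (None) repeats forever
    shuffle=True)

# Assuming eager execution, each element is a (features, labels) pair where
# `features` maps column names to batched Tensors.
for features, labels in dataset.take(1):
  print(sorted(features.keys()), labels.shape)
```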
Example 46
def StreamingFilesDataset(files,
                          filetype=None,
                          file_reader_job=None,
                          worker_job=None,
                          num_epochs=None,
                          filename_shuffle_buffer_size=None,
                          num_parallel_reads=None,
                          batch_transfer_size=None,
                          sloppy=None):
  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).

  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
  files local to your GCE VM. In order to train using files stored on your local
  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
  helper to generate a dataset to feed your Cloud TPU with files from your GCE
  VM.

  The resulting dataset may raise an `OutOfRangeError` if no files are found as
  a result of the file glob expansion.

  Note: StreamingFilesDataset assumes that the session is using a
  TPUClusterResolver and therefore has a worker and a coordinator job. File
  loading will be done on the coordinator job.

  Args:
    files: A string glob to match files, or a `tf.data.Dataset` generating file
      names.
    filetype: A string (one of 'tfrecord' or 'textline') or a single-argument
      TensorFlow function that, when given a filename, returns a dataset.
    file_reader_job: An optional string that corresponds to the job that should
      perform the file reads.
    worker_job: An optional string that corresponds to the job that should
      process the tensors (i.e. your GPU or TPU worker).
    num_epochs: The number of epochs through the training set that should be
      generated. By default, it will repeat infinitely.
    filename_shuffle_buffer_size: An optional integer whose value controls the
      shuffling of the file names. If you would like to read from the files in
      the same order, set to 0 or False.
    num_parallel_reads: An optional integer controlling the number of files to
      read from concurrently. (Set to 1 for no parallelism.)
    batch_transfer_size: An optional integer controlling the batching used to
      amortize the remote function invocation overhead. Set to a very large
      number to increase throughput. Set to a very small number to reduce memory
      consumption. Set to False to skip batching.
    sloppy: (Optional.) If `False`, read input data while maintaining a
      deterministic order, which may have a significant performance impact.
      Defaults to `True`.
  Returns:
    A `tf.data.Dataset` with an infinite stream of elements generated by a
    parallel interleaving of the set of files matched (or generated) by `files`,
    whose element type is that of the dataset specified by `filetype`.

  Raises:
    ValueError: if any argument is not of the expected type.
  """
  if filetype is None:
    filetype = 'tfrecord'

  if isinstance(filetype, str):
    if filetype not in _FILETYPE_MAP:
      raise ValueError('Unexpected filetype: %s' % filetype)
    reader_fn = _FILETYPE_MAP[filetype]
  elif callable(filetype):
    reader_fn = filetype
  else:
    raise ValueError('filetype should be a string or a callable')

  file_reader_job = file_reader_job or 'coordinator'

  worker_job = worker_job or 'worker'

  if filename_shuffle_buffer_size is None:
    filename_shuffle_buffer_size = 4096

  num_parallel_reads = num_parallel_reads or 8

  if batch_transfer_size is None:
    batch_transfer_size = 256

  if sloppy is None:
    sloppy = True

  if file_reader_job == 'coordinator':
    file_reader_device = '/job:coordinator/task:0'
  else:
    file_reader_device = '/job:%s' % file_reader_job

  with ops.device(file_reader_device):
    if isinstance(files, str):
      source_dataset = dataset_ops.Dataset.list_files(files)
    elif isinstance(files, dataset_ops.DatasetV2):
      source_dataset = files
    else:
      raise ValueError('files was not a string or a dataset: %s' % files)

    if filename_shuffle_buffer_size:
      source_dataset = source_dataset.shuffle(
          buffer_size=filename_shuffle_buffer_size)

    source_dataset = source_dataset.apply(
        interleave_ops.parallel_interleave(
            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))

    source_dataset = source_dataset.repeat(num_epochs)

    if batch_transfer_size:
      source_dataset = source_dataset.batch(batch_transfer_size)

    source_dataset = source_dataset.prefetch(1)

    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
    source_handle = source_iterator.string_handle()

  @function.Defun(dtypes.string)
  def LoadingFunc(h):
    remote_iterator = iterator_ops.Iterator.from_string_handle(
        h, dataset_ops.get_legacy_output_types(source_dataset),
        dataset_ops.get_legacy_output_shapes(source_dataset))
    return remote_iterator.get_next()

  def MapFn(unused_input):
    source_dataset_output_types = dataset_ops.get_legacy_output_types(
        source_dataset)
    if isinstance(source_dataset_output_types, dtypes.DType):
      output_types = [source_dataset_output_types]
    elif isinstance(source_dataset_output_types, (list, tuple)):
      output_types = source_dataset_output_types
    else:
      raise ValueError('source dataset has invalid output types')
    remote_calls = functional_ops.remote_call(
        args=[source_handle],
        Tout=output_types,
        f=LoadingFunc,
        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
    if len(remote_calls) == 1:
      return remote_calls[0]
    else:
      return remote_calls

  with ops.device('/job:%s' % worker_job):
    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
        MapFn, num_parallel_calls=4 if sloppy else None)
    output_dataset = output_dataset.prefetch(1)

    if batch_transfer_size:
      # Undo the batching used during the transfer.
      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)

  return output_dataset
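A hedged sketch of how this helper might be invoked; the file glob and job names are illustrative and assume a standard TPUClusterResolver setup:

```python
# Stream TFRecord files that live on the coordinator VM to the TPU worker.
dataset = StreamingFilesDataset(
    files='/data/train-*.tfrecord',   # hypothetical glob on the GCE VM
    filetype='tfrecord',              # built-in TFRecord reader
    file_reader_job='coordinator',    # file reads happen on this job
    worker_job='worker',              # tensors are consumed on this job
    batch_transfer_size=256,          # amortizes remote_call overhead
    sloppy=True)

# Elements are serialized `Example` protos; parsing (e.g. with
# tf.io.parse_example) is expected to happen downstream on the worker.
```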
Example 47
def make_csv_dataset(
    file_pattern,
    batch_size,
    column_names=None,
    column_defaults=None,
    label_name=None,
    select_columns=None,
    field_delim=",",
    use_quote_delim=True,
    na_value="",
    header=True,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=None,
    prefetch_buffer_size=optimization.AUTOTUNE,
    num_parallel_reads=1,
    sloppy=False,
    num_rows_for_inference=100,
    compression_type=None,
):
  """Reads CSV files into a dataset.

  Reads CSV files into a dataset, where each element is a (features, labels)
  tuple that corresponds to a batch of CSV rows. The features dictionary
  maps feature column names to `Tensor`s containing the corresponding
  feature data, and labels is a `Tensor` containing the batch's label data.

  Args:
    file_pattern: List of files or patterns of file paths containing CSV
      records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    column_names: An optional list of strings that corresponds to the CSV
      columns, in order. One per column of the input record. If this is not
      provided, infers the column names from the first row of the records.
      These names will be the keys of the features dict of each dataset element.
    column_defaults: An optional list of default values for the CSV fields. One
      item per selected column of the input record. Each item in the list is
      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
      `Tensor` with one of the aforementioned types. The tensor can either be
      a scalar default value (if the column is optional), or an empty tensor (if
      the column is required). If a dtype is provided instead of a tensor, the
      column is also treated as required. If this list is not provided, tries
      to infer types based on reading the first num_rows_for_inference rows of
      files specified, and assumes all columns are optional, defaulting to `0`
      for numeric values and `""` for string values. If both this and
      `select_columns` are specified, these must have the same lengths, and
      `column_defaults` is assumed to be sorted in order of increasing column
      index.
    label_name: An optional string corresponding to the label column. If
      provided, the data for this column is returned as a separate `Tensor` from
      the features dictionary, so that the dataset complies with the format
      expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
      function.
    select_columns: An optional list of integer indices or string column
      names, that specifies a subset of columns of CSV data to select. If
      column names are provided, these must correspond to names provided in
      `column_names` or inferred from the file header lines. When this argument
      is specified, only a subset of CSV columns will be parsed and returned,
      corresponding to the columns specified. Using this results in faster
      parsing and lower memory usage. If both this and `column_defaults` are
      specified, these must have the same lengths, and `column_defaults` is
      assumed to be sorted in order of increasing column index.
    field_delim: An optional `string`. Defaults to `","`. Char delimiter to
      separate fields in a record.
    use_quote_delim: An optional bool. Defaults to `True`. If false, treats
      double quotation marks as regular characters inside of the string fields.
    na_value: Additional string to recognize as NA/NaN.
    header: A bool that indicates whether the first rows of provided CSV files
      correspond to header lines with column names, and should not be included
      in the data.
    num_epochs: An int specifying the number of times this dataset is repeated.
      If None, cycles through the dataset forever.
    shuffle: A bool that indicates whether the input should be shuffled.
    shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size
      ensures better shuffling, but increases memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: An int specifying the number of feature
      batches to prefetch for performance improvement. Recommended value is the
      number of batches consumed per training step. Defaults to auto-tune.

    num_parallel_reads: Number of threads used to read CSV records from files.
      If >1, the results will be interleaved.
    sloppy: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    num_rows_for_inference: Number of rows of a file to use for type inference
      if record_defaults is not provided. If None, reads all the rows of all
      the files. Defaults to 100.
    compression_type: (Optional.) A `tf.string` scalar evaluating to one of
      `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no compression.

  Returns:
    A dataset, where each element is a (features, labels) tuple that corresponds
    to a batch of `batch_size` CSV rows. The features dictionary maps feature
    column names to `Tensor`s containing the corresponding column data, and
    labels is a `Tensor` containing the column data for the label column
    specified by `label_name`.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  # Create dataset of all matching filenames
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Clean arguments; figure out column names and defaults

  if column_names is None:
    if not header:
      raise ValueError("Cannot infer column names without a header line.")
    # If column names are not provided, infer from the header lines
    column_names = _infer_column_names(filenames, field_delim, use_quote_delim)
  if len(column_names) != len(set(column_names)):
    raise ValueError("Cannot have duplicate column names.")

  if select_columns is not None:
    select_columns = _get_sorted_col_indices(select_columns, column_names)

  if column_defaults is not None:
    column_defaults = [
        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
        for x in column_defaults
    ]
  else:
    # If column defaults are not provided, infer from records at graph
    # construction time
    column_defaults = _infer_column_defaults(
        filenames, len(column_names), field_delim, use_quote_delim, na_value,
        header, num_rows_for_inference, select_columns)

  if select_columns is not None and len(column_defaults) != len(select_columns):
    raise ValueError(
        "If specified, column_defaults and select_columns must have same "
        "length."
    )
  if select_columns is not None and len(column_names) > len(select_columns):
    # Pick the relevant subset of column names
    column_names = [column_names[i] for i in select_columns]

  if label_name is not None and label_name not in column_names:
    raise ValueError("`label_name` provided must be one of the columns.")

  def filename_to_dataset(filename):
    return CsvDataset(
        filename,
        record_defaults=column_defaults,
        field_delim=field_delim,
        use_quote_delim=use_quote_delim,
        na_value=na_value,
        select_cols=select_columns,
        header=header,
        compression_type=compression_type,
    )

  def map_fn(*columns):
    """Organizes columns into a features dictionary.

    Args:
      *columns: list of `Tensor`s corresponding to one csv record.
    Returns:
      An OrderedDict of feature names to values for that particular record. If
      label_name is provided, extracts the label feature to be returned as the
      second element of the tuple.
    """
    features = collections.OrderedDict(zip(column_names, columns))
    if label_name is not None:
      label = features.pop(label_name)
      return features, label
    return features

  # Read files sequentially (if num_parallel_reads=1) or in parallel
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))

  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  # Apply batch before map for perf, because map has high overhead relative
  # to the size of the computation in each map.
  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
  # improve the shape inference, because it makes the batch dimension static.
  # It is safe to do this because in that case we are repeating the input
  # indefinitely, and all batches will be full-sized.
  dataset = dataset.batch(batch_size=batch_size,
                          drop_remainder=num_epochs is None)
  dataset = dataset_ops.MapDataset(
      dataset, map_fn, use_inter_op_parallelism=False)
  dataset = dataset.prefetch(prefetch_buffer_size)

  return dataset
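As a hedged illustration of `select_columns` and `column_defaults`, the sketch below parses only two of three assumed columns; the file names and schema are hypothetical and not part of the snippet above:

```python
# Assumed schema: "id", "price", "label". Only the last two are parsed.
dataset = make_csv_dataset(
    file_pattern=["a.csv", "b.csv"],
    batch_size=8,
    column_names=["id", "price", "label"],
    select_columns=["price", "label"],  # subset to parse: faster, less memory
    column_defaults=[0.0, 0],           # one default per *selected* column,
                                        # in increasing column-index order
    label_name="label",
    num_epochs=1)
```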
 def _build_dataset():
   return dataset_ops.Dataset.range(10).map(_map_fn).apply(
       interleave_ops.parallel_interleave(_interleave_fn, 1))
Example 49
 def _build_dataset():
   return dataset_ops.Dataset.range(10).map(_map_fn).apply(
       interleave_ops.parallel_interleave(_interleave_fn, 1))
 def _build_ds(self, cycle_length, block_length, sloppy=False):
   return (dataset_ops.Dataset.from_tensor_slices(
       self.input_values).repeat(self.num_repeats).apply(
           interleave_ops.parallel_interleave(
               lambda x: dataset_ops.Dataset.range(10 * x, 11 * x),
               cycle_length, block_length, sloppy)))
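For intuition about the `cycle_length` and `block_length` arguments used in the snippet above, here is a small hedged example of the deterministic (`sloppy=False`) output order; the input values are illustrative:

```python
# Two inner datasets are open at a time (cycle_length=2) and one element is
# drawn from each per turn (block_length=1).
ds = dataset_ops.Dataset.from_tensor_slices([1, 2, 3]).apply(
    interleave_ops.parallel_interleave(
        lambda x: dataset_ops.Dataset.from_tensors(x).repeat(3),
        cycle_length=2, block_length=1, sloppy=False))
# Expected deterministic order of elements: 1, 2, 1, 2, 1, 2, 3, 3, 3
```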
 def dataset_fn():
   return dataset_ops.Dataset.range(1).repeat().apply(
       interleave_ops.parallel_interleave(
           _make_fake_dataset_fn(), cycle_length=10))
Example 52
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  label_key=None,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=optimization.AUTOTUNE,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  If the `label_key` argument is provided, returns a `Dataset` of tuples
  comprising a feature dictionary and a label.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"],
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be
      called with a `filenames` tensor and (optional) `reader_args` and returns
      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
    label_key: (Optional) A string corresponding to the key under which labels
      are stored in the `tf.Example`s. If provided, it must be one of the
      `features` keys; otherwise a `ValueError` is raised.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to `None`.
    shuffle: A boolean, indicates whether the input should be shuffled. Defaults
      to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step. Defaults to auto-tune.
    reader_num_threads: Number of threads used to read `Example` records. If >1,
      the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements, (or a tuple of `dict` elements and label).
    Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.

  Raises:
    ValueError: If `label_key` is not one of the `features` keys.
  """
  # Create dataset of all matching filenames
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Read `Example` records from files as tensor objects.
  if reader_args is None:
    reader_args = []

  # Read files sequentially (if reader_num_threads=1) or in parallel
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          lambda filename: reader(filename, *reader_args),
          cycle_length=reader_num_threads,
          sloppy=sloppy_ordering))

  # Extract values if the `Example` tensors are stored as key-value tuples.
  if dataset.output_types == (dtypes.string, dtypes.string):
    dataset = dataset_ops.MapDataset(
        dataset, lambda _, v: v, use_inter_op_parallelism=False)

  # Apply dataset repeat and shuffle transformations.
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
  # improve the shape inference, because it makes the batch dimension static.
  # It is safe to do this because in that case we are repeating the input
  # indefinitely, and all batches will be full-sized.
  dataset = dataset.batch(
      batch_size, drop_remainder=drop_final_batch or num_epochs is None)

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.apply(
      parsing_ops.parse_example_dataset(
          features, num_parallel_calls=parser_num_threads))

  if label_key:
    if label_key not in features:
      raise ValueError(
          "The `label_key` provided (%r) must be one of the `features` keys." %
          label_key)
    dataset = dataset.map(lambda x: (x, x.pop(label_key)))

  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset
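A minimal hedged usage sketch of the function above; the TFRecord glob is hypothetical and the feature spec mirrors the docstring example (it assumes `tf.io` is available for the feature classes):

```python
import tensorflow as tf  # assumed available for the feature spec classes

features = {
    "age": tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": tf.io.FixedLenFeature([], dtype=tf.string),
    "kws": tf.io.VarLenFeature(dtype=tf.string),
}
dataset = make_batched_features_dataset(
    file_pattern="/tmp/examples-*.tfrecord",  # hypothetical glob
    batch_size=2,
    features=features,
    label_key="age",   # optional: each element becomes a (features, label) pair
    num_epochs=1)
```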
Example 53
def StreamingFilesDataset(files,
                          filetype=None,
                          file_reader_job=None,
                          worker_job=None,
                          num_epochs=None,
                          filename_shuffle_buffer_size=None,
                          num_parallel_reads=None,
                          batch_transfer_size=None,
                          sloppy=None):
  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).

  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
  files local to your GCE VM. In order to train using files stored on your local
  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
  helper to generate a dataset to feed your Cloud TPU with files from your GCE
  VM.

  The resulting dataset may raise an `OutOfRangeError` if no files are found as
  a result of the file glob expansion.

  Note: StreamingFilesDataset assumes that the session is using a
  TPUClusterResolver and therefore has a worker and a coordinator job. File
  loading will be done on the coordinator job.

  Args:
    files: A string glob to match files, or a `tf.data.Dataset` generating file
      names.
    filetype: A string (one of 'tfrecord' or 'textline') or a single-argument
      TensorFlow function that, when given a filename, returns a dataset.
    file_reader_job: An optional string that corresponds to the job that should
      perform the file reads.
    worker_job: An optional string that corresponds to the job that should
      process the tensors (i.e. your GPU or TPU worker).
    num_epochs: The number of epochs through the training set that should be
      generated. By default, it will repeat infinitely.
    filename_shuffle_buffer_size: An optional integer whose value controls the
      shuffling of the file names. If you would like to read from the files in
      the same order, set to 0 or False.
    num_parallel_reads: An optional integer controlling the number of files to
      read from concurrently. (Set to 1 for no parallelism.)
    batch_transfer_size: An optional integer controlling the batching used to
      amortize the remote function invocation overhead. Set to a very large
      number to increase throughput. Set to a very small number to reduce memory
      consumption. Set to False to skip batching.
    sloppy: (Optional.) If `False`, read input data while maintaining a
      deterministic order, which may have a significant performance impact.
      Defaults to `True`.
  Returns:
    A `tf.data.Dataset` with an infinite stream of elements generated by a
    parallel interleaving of the set of files matched (or generated) by `files`,
    whose element type is that of the dataset specified by `filetype`.

  Raises:
    ValueError: if any argument is not of the expected type.
  """
  if filetype is None:
    filetype = 'tfrecord'

  if isinstance(filetype, str):
    if filetype not in _FILETYPE_MAP:
      raise ValueError('Unexpected filetype: %s' % filetype)
    reader_fn = _FILETYPE_MAP[filetype]
  elif callable(filetype):
    reader_fn = filetype
  else:
    raise ValueError('filetype should be a string or a callable')

  file_reader_job = file_reader_job or 'coordinator'

  worker_job = worker_job or 'worker'

  if filename_shuffle_buffer_size is None:
    filename_shuffle_buffer_size = 4096

  num_parallel_reads = num_parallel_reads or 8

  if batch_transfer_size is None:
    batch_transfer_size = 256

  if sloppy is None:
    sloppy = True

  with ops.device('/job:%s' % file_reader_job):
    if isinstance(files, str):
      source_dataset = dataset_ops.Dataset.list_files(files)
    elif isinstance(files, dataset_ops.DatasetV2):
      source_dataset = files
    else:
      raise ValueError('files was not a string or a dataset: %s' % files)

    if filename_shuffle_buffer_size:
      source_dataset = source_dataset.shuffle(
          buffer_size=filename_shuffle_buffer_size)

    source_dataset = source_dataset.apply(
        interleave_ops.parallel_interleave(
            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))

    source_dataset = source_dataset.repeat(num_epochs)

    if batch_transfer_size:
      source_dataset = source_dataset.batch(batch_transfer_size)

    source_dataset = source_dataset.prefetch(1)

    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
    source_handle = source_iterator.string_handle()

  @function.Defun(dtypes.string)
  def LoadingFunc(h):
    remote_iterator = iterator_ops.Iterator.from_string_handle(
        h, source_dataset.output_types, source_dataset.output_shapes)
    return remote_iterator.get_next()

  def MapFn(unused_input):
    if isinstance(source_dataset.output_types, dtypes.DType):
      output_types = [source_dataset.output_types]
    elif isinstance(source_dataset.output_types, (list, tuple)):
      output_types = source_dataset.output_types
    else:
      raise ValueError('source dataset has invalid output types')
    remote_calls = functional_ops.remote_call(
        args=[source_handle],
        Tout=output_types,
        f=LoadingFunc,
        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
    if len(remote_calls) == 1:
      return remote_calls[0]
    else:
      return remote_calls

  with ops.device('/job:%s' % worker_job):
    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
        MapFn, num_parallel_calls=4 if sloppy else None)
    output_dataset = output_dataset.prefetch(1)

    if batch_transfer_size:
      # Undo the batching used during the transfer.
      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)

  return output_dataset
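As a hedged sketch, `filetype` can also be a single-argument callable, which allows plugging in a custom reader; the reader and glob below are illustrative:

```python
def _zlib_tfrecord_reader(filename):
  # Hypothetical reader: treat each file as ZLIB-compressed TFRecords.
  return core_readers.TFRecordDataset(filename, compression_type='ZLIB')

dataset = StreamingFilesDataset(
    files='/data/shard-*.tfrecord.z',   # hypothetical glob on the coordinator VM
    filetype=_zlib_tfrecord_reader,     # callable returning a dataset per file
    num_parallel_reads=16,
    sloppy=False)                       # keep file interleaving deterministic
```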