def testNonSequenceNestedStructure(self):
    components = np.array([1, 2, 3], dtype=np.int64)

    dataset = dataset_ops.Dataset.from_tensors(components)
    self.assertEqual(dtypes.int64,
                     dataset_ops.get_legacy_output_types(dataset))
    self.assertEqual([3], dataset_ops.get_legacy_output_shapes(dataset))

    dataset = dataset.filter(
        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
    self.assertEqual(dtypes.int64,
                     dataset_ops.get_legacy_output_types(dataset))
    self.assertEqual([3], dataset_ops.get_legacy_output_shapes(dataset))

    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
    self.assertEqual(dtypes.int64,
                     dataset_ops.get_legacy_output_types(dataset))
    self.assertEqual([2, 3], dataset_ops.get_legacy_output_shapes(dataset))

    dataset = dataset.flat_map(
        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
    self.assertEqual(dtypes.int64,
                     dataset_ops.get_legacy_output_types(dataset))
    self.assertEqual([3], dataset_ops.get_legacy_output_shapes(dataset))

    get_next = self.getNext(dataset)
    self.assertEqual(dtypes.int64, get_next().dtype)
    self.assertEqual([3], get_next().shape)
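
For orientation, here is a minimal standalone sketch of the same checks using only the public tf.data API (assuming TF 2.x). tf.compat.v1.data.get_output_types and get_output_shapes are the public counterparts of the internal get_legacy_* helpers used throughout these examples, and element_spec is the non-legacy way to inspect the same information.

import numpy as np
import tensorflow as tf

components = np.array([1, 2, 3], dtype=np.int64)
ds = tf.data.Dataset.from_tensors(components).map(lambda x: tf.stack([x, x]))

print(tf.compat.v1.data.get_output_types(ds))   # tf.int64
print(tf.compat.v1.data.get_output_shapes(ds))  # (2, 3)
print(ds.element_spec)                          # TensorSpec(shape=(2, 3), dtype=tf.int64, name=None)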
Example #2
  def testUnbatchScalarDataset(self):
    data = tuple([math_ops.range(10) for _ in range(3)])
    data = dataset_ops.Dataset.from_tensor_slices(data)
    expected_types = (dtypes.int32,) * 3
    data = data.batch(2)
    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
    data = data.apply(batching.unbatch())
    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))

    self.assertDatasetProduces(data, [(i,) * 3 for i in range(10)])
Example #3
  def testUnbatchDatasetWithStrings(self):
    data = tuple([math_ops.range(10) for _ in range(3)])
    data = dataset_ops.Dataset.from_tensor_slices(data)
    data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
    expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
    data = data.batch(2)
    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
    data = data.apply(batching.unbatch())
    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))

    self.assertDatasetProduces(
        data, [(i, compat.as_bytes(str(i)), i) for i in range(10)])
 def testNestedDict(self):
   components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
   dataset = dataset_ops.Dataset.from_tensors(components)
   self.assertEqual(dtypes.int32,
                    dataset_ops.get_legacy_output_types(dataset)["a"]["aa"])
   self.assertEqual(dtypes.float32,
                    dataset_ops.get_legacy_output_types(dataset)["a"]["ab"])
   self.assertEqual(dtypes.int32,
                    dataset_ops.get_legacy_output_types(dataset)["b"])
   self.assertEqual([],
                    dataset_ops.get_legacy_output_shapes(dataset)["a"]["aa"])
   self.assertEqual([2],
                    dataset_ops.get_legacy_output_shapes(dataset)["a"]["ab"])
   self.assertEqual([3],
                    dataset_ops.get_legacy_output_shapes(dataset)["b"])
Example #5
  def testUnbatchMultiElementTupleDataset(self):
    data = tuple([(math_ops.range(10 * i, 10 * i + 10),
                   array_ops.fill([10], "hi")) for i in range(3)])
    data = dataset_ops.Dataset.from_tensor_slices(data)
    expected_types = ((dtypes.int32, dtypes.string),) * 3
    data = data.batch(2)
    self.assertAllEqual(expected_types,
                        dataset_ops.get_legacy_output_types(data))
    data = data.apply(batching.unbatch())
    self.assertAllEqual(expected_types,
                        dataset_ops.get_legacy_output_types(data))

    self.assertDatasetProduces(
        data,
        [((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")) for i in range(10)])
Example #6
    def _apply_fn(dataset):
        """Function from `Dataset` to `Dataset` that applies the transformation."""

        # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
        # are normalized to the rank-1 dense representation, so that the
        # sparse-oblivious unbatching logic will slice them
        # appropriately. This leads to a somewhat inefficient re-encoding step
        # for all SparseTensor components.
        # TODO(mrry): Consider optimizing this in future if it turns out to be
        # a bottleneck.
        def normalize(arg, *rest):
            # pylint: disable=protected-access
            if rest:
                return dataset._element_structure._to_batched_tensor_list(
                    (arg, ) + rest)
            else:
                return dataset._element_structure._to_batched_tensor_list(arg)

        normalized_dataset = dataset.map(normalize)

        # NOTE(mrry): Our `map()` has lost information about the sparseness
        # of any SparseTensor components, so re-apply the structure of the
        # original dataset.
        restructured_dataset = _RestructuredDataset(
            normalized_dataset,
            dataset_ops.get_legacy_output_types(dataset),
            dataset_ops.get_legacy_output_shapes(dataset),
            dataset_ops.get_legacy_output_classes(dataset),
            allow_unsafe_cast=True)
        return _UnbatchDataset(restructured_dataset)
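
A hedged usage sketch of the effect this transformation implements: in current TF the same behavior is exposed publicly as Dataset.unbatch() (older code used dataset.apply(tf.data.experimental.unbatch())).

import tensorflow as tf

ds = tf.data.Dataset.range(10).batch(4)   # elements of shape [<=4]
flat = ds.unbatch()                       # splits each batch back into scalar elements
print(list(flat.as_numpy_iterator()))     # [0, 1, ..., 9]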
Example #7
    def testIteratorStringHandleReuseTensorObject(self):
        dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
        one_shot_iterator = dataset_ops.make_one_shot_iterator(dataset)
        initializable_iterator = dataset_ops.make_initializable_iterator(
            dataset)
        structure_iterator = iterator_ops.Iterator.from_structure(
            dataset_ops.get_legacy_output_types(dataset))

        created_ops = len(ops.get_default_graph().get_operations())

        self.assertIs(one_shot_iterator.string_handle(),
                      one_shot_iterator.string_handle())
        self.assertIs(initializable_iterator.string_handle(),
                      initializable_iterator.string_handle())
        self.assertIs(structure_iterator.string_handle(),
                      structure_iterator.string_handle())

        # Assert that getting the (default) string handle creates no ops.
        self.assertEqual(created_ops,
                         len(ops.get_default_graph().get_operations()))

        # Specifying an explicit name will create a new op.
        handle_with_name = one_shot_iterator.string_handle(name="foo")
        self.assertEqual("foo", handle_with_name.op.name)
        self.assertIsNot(one_shot_iterator.string_handle(), handle_with_name)

        handle_with_same_name = one_shot_iterator.string_handle(name="foo")
        self.assertEqual("foo_1", handle_with_same_name.op.name)
        self.assertIsNot(handle_with_name, handle_with_same_name)
    def testUnbatchMultiElementTupleDataset(self):
        data = tuple([(math_ops.range(10 * i,
                                      10 * i + 10), array_ops.fill([10], "hi"))
                      for i in range(3)])
        data = dataset_ops.Dataset.from_tensor_slices(data)
        expected_types = ((dtypes.int32, dtypes.string), ) * 3
        data = data.batch(2)
        self.assertAllEqual(expected_types,
                            dataset_ops.get_legacy_output_types(data))
        data = data.unbatch()
        self.assertAllEqual(expected_types,
                            dataset_ops.get_legacy_output_types(data))

        self.assertDatasetProduces(data, [((i, b"hi"), (10 + i, b"hi"),
                                           (20 + i, b"hi"))
                                          for i in range(10)])
Example #9
    def assertDatasetsEqual(self, dataset1, dataset2):
        """Checks that datasets are equal. Supports both graph and eager mode."""
        self.assertTrue(
            structure.are_compatible(dataset_ops.get_structure(dataset1),
                                     dataset_ops.get_structure(dataset2)))

        flattened_types = nest.flatten(
            dataset_ops.get_legacy_output_types(dataset1))

        next1 = self.getNext(dataset1)
        next2 = self.getNext(dataset2)

        while True:
            try:
                op1 = self.evaluate(next1())
            except errors.OutOfRangeError:
                with self.assertRaises(errors.OutOfRangeError):
                    self.evaluate(next2())
                break
            op2 = self.evaluate(next2())

            op1 = nest.flatten(op1)
            op2 = nest.flatten(op2)
            assert len(op1) == len(op2)
            for i in range(len(op1)):
                if sparse_tensor.is_sparse(op1[i]):
                    self.assertSparseValuesEqual(op1[i], op2[i])
                elif ragged_tensor.is_ragged(op1[i]):
                    self.assertAllEqual(op1[i], op2[i])
                elif flattened_types[i] == dtypes.string:
                    self.assertAllEqual(op1[i], op2[i])
                else:
                    self.assertAllClose(op1[i], op2[i])
Example #10
def _create_or_validate_filenames_dataset(filenames, name=None):
    """Creates (or validates) a dataset of filenames.

  Args:
    filenames: Either a list or dataset of filenames. If it is a list, it is
      converted to a dataset. If it is a dataset, its type and shape are validated.
    name: (Optional.) A name for the tf.data operation.

  Returns:
    A dataset of filenames.
  """
    if isinstance(filenames, dataset_ops.DatasetV2):
        element_type = dataset_ops.get_legacy_output_types(filenames)
        if element_type != dtypes.string:
            raise TypeError(
                "The `filenames` argument must contain `tf.string` elements. Got a "
                f"dataset of `{element_type!r}` elements.")
        element_shape = dataset_ops.get_legacy_output_shapes(filenames)
        if not element_shape.is_compatible_with(tensor_shape.TensorShape([])):
            raise TypeError(
                "The `filenames` argument must contain `tf.string` elements of shape "
                "[] (i.e. scalars). Got a dataset of element shape "
                f"{element_shape!r}.")
    else:
        filenames = nest.map_structure(_normalise_fspath, filenames)
        filenames = ops.convert_to_tensor(filenames, dtype_hint=dtypes.string)
        if filenames.dtype != dtypes.string:
            raise TypeError(
                "The `filenames` argument must contain `tf.string` elements. Got "
                f"`{filenames.dtype!r}` elements.")
        filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
        filenames = dataset_ops.TensorSliceDataset(filenames,
                                                   is_files=True,
                                                   name=name)
    return filenames
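
A minimal sketch of the same two-branch validation using only public APIs; the helper above is internal, so this is an approximation under stated assumptions, not the library's implementation.

import tensorflow as tf

def as_filenames_dataset(filenames):
    """Accepts a list/tensor of filenames or a dataset of scalar tf.string elements."""
    if isinstance(filenames, tf.data.Dataset):
        # Dataset input: validate scalar tf.string elements, mirroring the branch above.
        spec = filenames.element_spec
        if (not isinstance(spec, tf.TensorSpec) or spec.dtype != tf.string
                or spec.shape.rank != 0):
            raise TypeError("expected a dataset of scalar `tf.string` elements")
        return filenames
    # List/tensor input: convert and flatten to a rank-1 string tensor, then slice.
    filenames = tf.convert_to_tensor(filenames, dtype=tf.string)
    return tf.data.Dataset.from_tensor_slices(tf.reshape(filenames, [-1]))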
Example #11
def _create_or_validate_filenames_dataset(filenames):
  """Creates (or validates) a dataset of filenames.

  Args:
    filenames: Either a list or dataset of filenames. If it is a list, it is
      converted to a dataset. If it is a dataset, its type and shape are validated.

  Returns:
    A dataset of filenames.
  """
  if isinstance(filenames, dataset_ops.DatasetV2):
    if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
    if not dataset_ops.get_legacy_output_shapes(filenames).is_compatible_with(
        tensor_shape.scalar()):
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
          "elements.")
  else:
    filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
    filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
    filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)

  return filenames
Example #12
  def assertDatasetsEqual(self, dataset1, dataset2):
    """Checks that datasets are equal. Supports both graph and eager mode."""
    self.assertTrue(dataset_ops.get_structure(dataset1).is_compatible_with(
        dataset_ops.get_structure(dataset2)))
    self.assertTrue(dataset_ops.get_structure(dataset2).is_compatible_with(
        dataset_ops.get_structure(dataset1)))
    flattened_types = nest.flatten(
        dataset_ops.get_legacy_output_types(dataset1))

    next1 = self.getNext(dataset1)
    next2 = self.getNext(dataset2)

    while True:
      try:
        op1 = self.evaluate(next1())
      except errors.OutOfRangeError:
        with self.assertRaises(errors.OutOfRangeError):
          self.evaluate(next2())
        break
      op2 = self.evaluate(next2())

      op1 = nest.flatten(op1)
      op2 = nest.flatten(op2)
      assert len(op1) == len(op2)
      for i in range(len(op1)):
        if sparse_tensor.is_sparse(op1[i]):
          self.assertSparseValuesEqual(op1[i], op2[i])
        elif flattened_types[i] == dtypes.string:
          self.assertAllEqual(op1[i], op2[i])
        else:
          self.assertAllClose(op1[i], op2[i])
Example #13
    def __init__(self, input_dataset):
        """See `unbatch()` for more details."""
        input_shapes = dataset_ops.get_legacy_output_shapes(input_dataset)
        flat_shapes = nest.flatten(input_shapes)
        if any(s.ndims == 0 for s in flat_shapes):
            raise ValueError("Cannot unbatch an input with scalar components.")
        known_batch_dim = tensor_shape.Dimension(None)
        for s in flat_shapes:
            try:
                known_batch_dim = known_batch_dim.merge_with(s[0])
            except ValueError:
                raise ValueError(
                    "Cannot unbatch an input whose components have "
                    "different batch sizes.")
        self._input_dataset = input_dataset

        self._structure = structure.convert_legacy_structure(
            dataset_ops.get_legacy_output_types(input_dataset),
            nest.map_structure(lambda s: s[1:], input_shapes),
            dataset_ops.get_legacy_output_classes(input_dataset))

        variant_tensor = ged_ops.experimental_unbatch_dataset(
            self._input_dataset._variant_tensor,  # pylint: disable=protected-access
            **dataset_ops.flat_structure(self))
        super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
Example #14
  def _apply_fn(dataset):
    """Function from `Dataset` to `Dataset` that applies the transformation."""
    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
    # are normalized to the rank-1 dense representation, so that the
    # sparse-oblivious unbatching logic will slice them
    # appropriately. This leads to a somewhat inefficient re-encoding step
    # for all SparseTensor components.
    # TODO(mrry): Consider optimizing this in future if it turns out to be
    # a bottleneck.
    def normalize(arg, *rest):
      # pylint: disable=protected-access
      if rest:
        return dataset._element_structure._to_batched_tensor_list((arg,) + rest)
      else:
        return dataset._element_structure._to_batched_tensor_list(arg)

    normalized_dataset = dataset.map(normalize)

    # NOTE(mrry): Our `map()` has lost information about the sparseness
    # of any SparseTensor components, so re-apply the structure of the
    # original dataset.
    restructured_dataset = _RestructuredDataset(
        normalized_dataset,
        dataset_ops.get_legacy_output_types(dataset),
        dataset_ops.get_legacy_output_shapes(dataset),
        dataset_ops.get_legacy_output_classes(dataset),
        allow_unsafe_cast=True)
    return _UnbatchDataset(restructured_dataset)
Example #15
  def __init__(self, input_dataset, num_workers):
    self._input_dataset = input_dataset

    def recalculate_output_shapes(output_shapes):
      """Recalculates the output_shapes after dividing it by num_workers."""
      if len(output_shapes) < 1:
        raise ValueError("Input shape should have at least one dimension.")
      if (tensor_shape.dimension_value(output_shapes[0]) and
          tensor_shape.dimension_value(output_shapes[0]) % num_workers != 0):
        raise errors.InvalidArgumentError(
            None, None,
            "First dim of input shape: %d is not divisible by num_workers: %d" %
            (output_shapes[0], num_workers))
      output_dims = [d for d in output_shapes.dims]
      output_dims[0] = output_dims[0] // num_workers
      return tensor_shape.TensorShape(output_dims)

    input_types = dataset_ops.get_legacy_output_types(self._input_dataset)
    input_shapes = dataset_ops.get_legacy_output_shapes(self._input_dataset)
    input_classes = dataset_ops.get_legacy_output_classes(self._input_dataset)
    output_shapes = nest.map_structure(recalculate_output_shapes, input_shapes)

    self._structure = structure.convert_legacy_structure(
        input_types, output_shapes, input_classes)
    variant_tensor = ged_ops.experimental_rebatch_dataset(
        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
        num_workers=num_workers,
        **dataset_ops.flat_structure(self))
    super(_RebatchDataset, self).__init__(input_dataset, variant_tensor)
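
A small self-contained sketch of the shape arithmetic performed above: rebatching across num_workers divides the leading (batch) dimension and leaves the rest of the shape untouched. The function name here is illustrative, not the internal implementation.

import tensorflow as tf

def rebatched_shape(shape, num_workers):
    dims = shape.as_list()
    if not dims:
        raise ValueError("Input shape should have at least one dimension.")
    if dims[0] is not None and dims[0] % num_workers != 0:
        raise ValueError("Batch dimension %d is not divisible by num_workers %d"
                         % (dims[0], num_workers))
    # Unknown batch dimensions stay unknown; known ones are divided evenly.
    dims[0] = None if dims[0] is None else dims[0] // num_workers
    return tf.TensorShape(dims)

print(rebatched_shape(tf.TensorShape([8, 28, 28]), 4))  # (2, 28, 28)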
Example #16
def _create_or_validate_filenames_dataset(filenames):
  """Creates (or validates) a dataset of filenames.

  Args:
    filenames: Either a list or dataset of filenames. If it is a list, it is
      converted to a dataset. If it is a dataset, its type and shape are validated.

  Returns:
    A dataset of filenames.
  """
  if isinstance(filenames, dataset_ops.DatasetV2):
    if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
    if not dataset_ops.get_legacy_output_shapes(filenames).is_compatible_with(
        tensor_shape.TensorShape([])):
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
          "elements.")
  else:
    filenames = ops.convert_to_tensor(filenames, dtype_hint=dtypes.string)
    if filenames.dtype != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.Tensor` of dtype `tf.string` dtype."
          " Got {}".format(filenames.dtype))
    filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
    filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)

  return filenames
Example #17
  def write(self, dataset, column_families, columns, timestamp=None):
    """Writes a dataset to the table.

    Args:
      dataset: A `tf.data.Dataset` to be written to this table. It must produce
        a list of number-of-columns+1 elements, all of which must be strings.
        The first value will be used as the row key, and subsequent values will
        be used as cell values for the corresponding columns from the
        corresponding column_families and columns entries.
      column_families: A `tf.Tensor` of `tf.string`s corresponding to the
        column names to store the dataset's elements into.
      columns: A `tf.Tensor` of `tf.string`s corresponding to the column names
        to store the dataset's elements into.
      timestamp: (Optional.) An int64 timestamp to write all the values at.
        Leave as None to use server-provided timestamps.

    Returns:
      A `tf.Operation` that can be run to perform the write.

    Raises:
      ValueError: If there are unexpected or incompatible types, or if the
        number of columns and column_families does not match the output of
        `dataset`.
    """
    if timestamp is None:
      timestamp = -1  # Bigtable server provided timestamp.
    for tensor_type in nest.flatten(
        dataset_ops.get_legacy_output_types(dataset)):
      if tensor_type != dtypes.string:
        raise ValueError("Not all elements of the dataset were `tf.string`")
    for shape in nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)):
      if not shape.is_compatible_with(tensor_shape.scalar()):
        raise ValueError("Not all elements of the dataset were scalars")
    if len(column_families) != len(columns):
      raise ValueError("len(column_families) != len(columns)")
    if len(nest.flatten(
        dataset_ops.get_legacy_output_types(dataset))) != len(columns) + 1:
      raise ValueError("A column name must be specified for every component of "
                       "the dataset elements. (e.g.: len(columns) != "
                       "len(dataset.output_types))")
    return gen_bigtable_ops.dataset_to_bigtable(
        self._resource,
        dataset._variant_tensor,  # pylint: disable=protected-access
        column_families,
        columns,
        timestamp)
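
For context, a sketch of a dataset that satisfies the contract documented above: every component is a scalar tf.string, and there is exactly one more component (the row key) than there are columns. The table object and column/family names below are placeholders.

import tensorflow as tf

rows = tf.data.Dataset.from_tensor_slices((
    ["row-0", "row-1"],   # row keys
    ["alice", "bob"],     # values for column_families[0] / columns[0]
    ["42", "17"],         # values for column_families[1] / columns[1]
))
# write_op = table.write(rows, column_families=["cf1", "cf1"],
#                        columns=["name", "score"])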
Example #18
 def testNegativeStep(self, output_type):
   start, stop, step = 2, 10, -1
   dataset = dataset_ops.Dataset.range(
       start, stop, step, output_type=output_type)
   expected_output = np.arange(
       start, stop, step, dtype=output_type.as_numpy_dtype)
   self.assertDatasetProduces(dataset, expected_output=expected_output)
   self.assertEqual(output_type, dataset_ops.get_legacy_output_types(dataset))
Example #19
  def testUnbatchDatasetWithUintDtypes(self):
    components = (
        np.tile(np.array([[0], [1], [2], [3]], dtype=np.uint8), 2),
        np.tile(np.array([[1], [2], [3], [256]], dtype=np.uint16), 2),
        np.tile(np.array([[2], [3], [4], [65536]], dtype=np.uint32), 2),
        np.tile(np.array([[3], [4], [5], [4294967296]], dtype=np.uint64), 2),
    )
    expected_types = (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
    expected_output = [tuple([c[i] for c in components]) for i in range(4)]

    data = dataset_ops.Dataset.from_tensor_slices(components)
    data = data.batch(2)
    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))

    data = data.unbatch()
    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
    self.assertDatasetProduces(data, expected_output)
Example #20
 def testStopLessThanStartWithPositiveStep(self, output_type):
   start, stop, step = 10, 2, 2
   dataset = dataset_ops.Dataset.range(
       start, stop, step, output_type=output_type)
   expected_output = np.arange(
       start, stop, step, dtype=output_type.as_numpy_dtype)
   self.assertDatasetProduces(dataset, expected_output=expected_output)
   self.assertEqual(output_type, dataset_ops.get_legacy_output_types(dataset))
Example #22
  def testKinesisDatasetTwoShards(self):
    client = boto3.client('kinesis', region_name='us-east-1')

    # Set up the Kinesis stream with 2 shards.
    stream_name = "tf_kinesis_test_2"
    client.create_stream(StreamName=stream_name, ShardCount=2)
    # Wait until stream exists, default is 10 * 18 seconds.
    client.get_waiter('stream_exists').wait(StreamName=stream_name)

    for i in range(10):
      data = "D" + str(i)
      client.put_record(
          StreamName=stream_name, Data=data, PartitionKey="TensorFlow" + str(i))
    response = client.describe_stream(StreamName=stream_name)
    shard_id_0 = response["StreamDescription"]["Shards"][0]["ShardId"]
    shard_id_1 = response["StreamDescription"]["Shards"][1]["ShardId"]

    stream = array_ops.placeholder(dtypes.string, shape=[])
    shard = array_ops.placeholder(dtypes.string, shape=[])
    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    repeat_dataset = kinesis_dataset_ops.KinesisDataset(
        stream, shard, read_indefinitely=False).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = iterator_ops.Iterator.from_structure(
        dataset_ops.get_legacy_output_types(batch_dataset))
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    data = list()
    with self.cached_session() as sess:
      # Basic test: read from shard 0 of stream 2.
      sess.run(
          init_op, feed_dict={
              stream: stream_name, shard: shard_id_0, num_epochs: 1})
      with self.assertRaises(errors.OutOfRangeError):
        # Use range(11) to guarantee the OutOfRangeError.
        for i in range(11):
          data.append(sess.run(get_next))

      # Basic test: read from shard 1 of stream 2.
      sess.run(
          init_op, feed_dict={
              stream: stream_name, shard: shard_id_1, num_epochs: 1})
      with self.assertRaises(errors.OutOfRangeError):
        # Use range(11) to guarantee the OutOfRangeError.
        for i in range(11):
          data.append(sess.run(get_next))

    data.sort()
    self.assertEqual(data, ["D" + str(i) for i in range(10)])

    client.delete_stream(StreamName=stream_name)
    # Wait until stream deleted, default is 10 * 18 seconds.
    client.get_waiter('stream_not_exists').wait(StreamName=stream_name)
Example #23
def create_tf_dataset(file_pattern,
                      spec,
                      num_epochs=1,
                      shuffle=False,
                      shuffle_seed=None,
                      shuffle_buffer_size=None,
                      reader_num_threads=None,
                      parser_num_threads=None,
                      prefetch_buffer_size=None):
    reader = _gzip_reader_fn

    if reader_num_threads is None:
        reader_num_threads = 1
    if parser_num_threads is None:
        parser_num_threads = 2
    if prefetch_buffer_size is None:
        prefetch_buffer_size = dataset_ops.AUTOTUNE

    # Create dataset of all matching filenames
    dataset = dataset_ops.Dataset.list_files(file_pattern=file_pattern,
                                             shuffle=shuffle,
                                             seed=shuffle_seed)

    if reader_num_threads == dataset_ops.AUTOTUNE:
        dataset = dataset.interleave(lambda filename: reader(filename),
                                     num_parallel_calls=reader_num_threads)
        options = dataset_ops.Options()
        options.experimental_deterministic = True
        dataset = dataset.with_options(options)
    else:

        def apply_fn(dataset):
            return core_readers.ParallelInterleaveDataset(
                dataset,
                lambda filename: reader(filename),
                cycle_length=reader_num_threads,
                block_length=1,
                sloppy=True,
                buffer_output_elements=None,
                prefetch_input_elements=None)

        dataset = dataset.apply(apply_fn)

    if dataset_ops.get_legacy_output_types(dataset) == (dtypes.string,
                                                        dtypes.string):
        dataset = dataset_ops.MapDataset(dataset,
                                         lambda _, v: v,
                                         use_inter_op_parallelism=False)

    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
    if num_epochs != 1:
        dataset = dataset.repeat(num_epochs)

    dataset = dataset.map(lambda x: tf.io.parse_example(x, spec))
    dataset = dataset.map(_split_inputs_labels)
    dataset = dataset.prefetch(prefetch_buffer_size)
    return dataset
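
A hypothetical call to the helper above. The file pattern and feature spec are placeholders, and the private helpers it relies on (such as _gzip_reader_fn and _split_inputs_labels) are assumed to be defined in the same module.

import tensorflow as tf

feature_spec = {
    "feature": tf.io.FixedLenFeature([], tf.float32),
    "label": tf.io.FixedLenFeature([], tf.int64),
}
train_ds = create_tf_dataset("train-*.tfrecord.gz",
                             spec=feature_spec,
                             num_epochs=1,
                             shuffle=True,
                             shuffle_buffer_size=1024)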
Example #24
  def testKinesisDatasetTwoShards(self):
    client = boto3.client('kinesis', region_name='us-east-1')

    # Set up the Kinesis stream with 2 shards.
    stream_name = "tf_kinesis_test_2"
    client.create_stream(StreamName=stream_name, ShardCount=2)
    # Wait until stream exists, default is 10 * 18 seconds.
    client.get_waiter('stream_exists').wait(StreamName=stream_name)

    for i in range(10):
      data = "D" + str(i)
      client.put_record(
          StreamName=stream_name, Data=data, PartitionKey="TensorFlow" + str(i))
    response = client.describe_stream(StreamName=stream_name)
    shard_id_0 = response["StreamDescription"]["Shards"][0]["ShardId"]
    shard_id_1 = response["StreamDescription"]["Shards"][1]["ShardId"]

    stream = array_ops.placeholder(dtypes.string, shape=[])
    shard = array_ops.placeholder(dtypes.string, shape=[])
    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    repeat_dataset = kinesis_dataset_ops.KinesisDataset(
        stream, shard, read_indefinitely=False).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = iterator_ops.Iterator.from_structure(
        dataset_ops.get_legacy_output_types(batch_dataset))
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    data = []
    with self.cached_session() as sess:
      # Basic test: read from shard 0 of stream 2.
      sess.run(
          init_op, feed_dict={
              stream: stream_name, shard: shard_id_0, num_epochs: 1})
      with self.assertRaises(errors.OutOfRangeError):
        # Use range(11) to guarantee the OutOfRangeError.
        for i in range(11):
          data.append(sess.run(get_next))

      # Basic test: read from shard 1 of stream 2.
      sess.run(
          init_op, feed_dict={
              stream: stream_name, shard: shard_id_1, num_epochs: 1})
      with self.assertRaises(errors.OutOfRangeError):
        # Use range(11) to guarantee the OutOfRangeError.
        for i in range(11):
          data.append(sess.run(get_next))

    data.sort()
    self.assertEqual(data, ["D" + str(i) for i in range(10)])

    client.delete_stream(StreamName=stream_name)
    # Wait until stream deleted, default is 10 * 18 seconds.
    client.get_waiter('stream_not_exists').wait(StreamName=stream_name)
Example #25
    def testIteratorStringHandle(self):
        dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
        dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])

        iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
        iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)

        handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
        feedable_iterator = iterator_ops.Iterator.from_string_handle(
            handle_placeholder, dataset_ops.get_legacy_output_types(dataset_3),
            dataset_ops.get_legacy_output_shapes(dataset_3))
        next_element = feedable_iterator.get_next()

        self.assertTrue(
            dataset_ops.get_structure(dataset_3).is_compatible_with(
                dataset_ops.get_structure(feedable_iterator)))
        self.assertTrue(
            dataset_ops.get_structure(dataset_4).is_compatible_with(
                dataset_ops.get_structure(feedable_iterator)))

        with self.cached_session() as sess:
            iterator_3_handle = sess.run(iterator_3.string_handle())
            iterator_4_handle = sess.run(iterator_4.string_handle())

            self.assertEqual(
                10,
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_4_handle}))
            self.assertEqual(
                1,
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_3_handle}))
            self.assertEqual(
                20,
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_4_handle}))
            self.assertEqual(
                2,
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_3_handle}))
            self.assertEqual(
                30,
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_4_handle}))
            self.assertEqual(
                3,
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_3_handle}))
            self.assertEqual(
                40,
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_4_handle}))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_3_handle})
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(next_element,
                         feed_dict={handle_placeholder: iterator_4_handle})
Example #26
    def __init__(self,
                 filenames,
                 compression_type=None,
                 buffer_size=None,
                 num_parallel_reads=None):
        """Creates a `TFRecordDataset` to read one or more TFRecord files.

    NOTE: The `num_parallel_reads` argument can be used to improve performance
    when reading from a remote filesystem.

    Args:
      filenames: A `tf.string` tensor or `tf.data.Dataset` containing one or
        more filenames.
      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
        bytes in the read buffer. 0 means no buffering.
      num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
        number of files to read in parallel. Defaults to reading files
        sequentially.

    Raises:
      TypeError: If any argument does not have the expected type.
      ValueError: If any argument does not have the expected shape.
    """
        if isinstance(filenames, dataset_ops.DatasetV2):
            if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
                raise TypeError(
                    "`filenames` must be a `tf.data.Dataset` of `tf.string` elements."
                )
            if not dataset_ops.get_legacy_output_shapes(
                    filenames).is_compatible_with(tensor_shape.scalar()):
                raise ValueError(
                    "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
                    "elements.")
        else:
            filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
            filenames = array_ops.reshape(filenames, [-1],
                                          name="flat_filenames")
            filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)

        self._filenames = filenames
        self._compression_type = compression_type
        self._buffer_size = buffer_size
        self._num_parallel_reads = num_parallel_reads

        def read_one_file(filename):
            return _TFRecordDataset(filename, compression_type, buffer_size)

        if num_parallel_reads is None:
            self._impl = filenames.flat_map(read_one_file)
        else:
            self._impl = filenames.interleave(
                read_one_file,
                cycle_length=num_parallel_reads,
                num_parallel_calls=num_parallel_reads)
        variant_tensor = self._impl._variant_tensor  # pylint: disable=protected-access
        super(TFRecordDatasetV2, self).__init__(variant_tensor)
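
This class backs the public tf.data.TFRecordDataset reader; a typical call through the public API looks like this (filenames are placeholders):

import tensorflow as tf

dataset = tf.data.TFRecordDataset(
    ["shard-00000.tfrecord", "shard-00001.tfrecord"],
    compression_type="GZIP",
    num_parallel_reads=2)  # records from both files are interleaved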
Example #27
 def batch_init_fn(_):
   indices_shape = array_ops.concat([[0], [array_ops.size(padded_shape) + 1]],
                                    0)
   return sparse_tensor.SparseTensor(
       indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
       values=constant_op.constant(
           [], shape=[0], dtype=dataset_ops.get_legacy_output_types(dataset)),
       dense_shape=array_ops.concat(
           [np.array([0], dtype=np.int64), padded_shape], 0))
  def testFromTensorSlicesWithDict(self):
    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
    dataset = dataset_ops.Dataset.from_tensor_slices(components)
    get_next = self.getNext(dataset)

    self.assertEqual(dtypes.int32,
                     dataset_ops.get_legacy_output_types(dataset)["foo"])
    self.assertEqual(dtypes.float32,
                     dataset_ops.get_legacy_output_types(dataset)["bar"])
    self.assertEqual((), dataset_ops.get_legacy_output_shapes(dataset)["foo"])
    self.assertEqual((1,), dataset_ops.get_legacy_output_shapes(dataset)["bar"])

    for i in range(3):
      results = self.evaluate(get_next())
      self.assertEqual(components["foo"][i], results["foo"])
      self.assertEqual(components["bar"][i], results["bar"])
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(get_next())
Example #30
 def testDictInputs(self):
     elements = [{
         "foo": [1, 2, 3],
         "bar": [[4.0], [5.0], [6.0]]
     }, {
         "foo": [4, 5, 6],
         "bar": [[7.0], [8.0], [9.0]]
     }]
     dataset = from_list.from_list(elements)
     self.assertEqual(dtypes.int32,
                      dataset_ops.get_legacy_output_types(dataset)["foo"])
     self.assertEqual(dtypes.float32,
                      dataset_ops.get_legacy_output_types(dataset)["bar"])
     self.assertEqual((3, ),
                      dataset_ops.get_legacy_output_shapes(dataset)["foo"])
     self.assertEqual((3, 1),
                      dataset_ops.get_legacy_output_shapes(dataset)["bar"])
     self.assertDatasetProduces(dataset, expected_output=elements)
Example #31
 def _apply_fn(dataset):
   output_shapes = _merge_output_shapes(
       dataset_ops.get_legacy_output_shapes(dataset), expected_shapes)
   # pylint: disable=protected-access
   return batching._RestructuredDataset(
       dataset.map(_check_shape),
       dataset_ops.get_legacy_output_types(dataset),
       output_shapes=output_shapes,
       output_classes=dataset_ops.get_legacy_output_classes(dataset))
 def testCounter(self, start, step, expected_output):
   """Test dataset construction using `count`."""
   dataset = counter.Counter(start, step)
   self.assertEqual(
       [], dataset_ops.get_legacy_output_shapes(dataset).as_list())
   self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(dataset))
   get_next = self.getNext(dataset)
   for expected in expected_output:
     self.assertEqual(expected, self.evaluate(get_next()))
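
A quick public-API sketch of the Counter dataset exercised above (TF 2.x eager execution assumed):

import tensorflow as tf

counter = tf.data.experimental.Counter(start=5, step=2)  # 5, 7, 9, ...
print([x.numpy() for x in counter.take(3)])              # [5, 7, 9]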
Example #34
 def _apply_fn(dataset):
     output_shapes = _merge_output_shapes(
         dataset_ops.get_legacy_output_shapes(dataset), expected_shapes)
     # pylint: disable=protected-access
     return batching._RestructuredDataset(
         dataset.map(_check_shape),
         dataset_ops.get_legacy_output_types(dataset),
         output_shapes=output_shapes,
         output_classes=dataset_ops.get_legacy_output_classes(dataset))
Example #35
  def testIteratorStringHandle(self):
    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
    dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])

    iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
    iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)

    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
    feedable_iterator = iterator_ops.Iterator.from_string_handle(
        handle_placeholder, dataset_ops.get_legacy_output_types(dataset_3),
        dataset_ops.get_legacy_output_shapes(dataset_3))
    next_element = feedable_iterator.get_next()

    self.assertTrue(dataset_ops.get_structure(dataset_3).is_compatible_with(
        dataset_ops.get_structure(feedable_iterator)))
    self.assertTrue(dataset_ops.get_structure(dataset_4).is_compatible_with(
        dataset_ops.get_structure(feedable_iterator)))

    with self.cached_session() as sess:
      iterator_3_handle = sess.run(iterator_3.string_handle())
      iterator_4_handle = sess.run(iterator_4.string_handle())

      self.assertEqual(10,
                       sess.run(
                           next_element,
                           feed_dict={handle_placeholder: iterator_4_handle}))
      self.assertEqual(1,
                       sess.run(
                           next_element,
                           feed_dict={handle_placeholder: iterator_3_handle}))
      self.assertEqual(20,
                       sess.run(
                           next_element,
                           feed_dict={handle_placeholder: iterator_4_handle}))
      self.assertEqual(2,
                       sess.run(
                           next_element,
                           feed_dict={handle_placeholder: iterator_3_handle}))
      self.assertEqual(30,
                       sess.run(
                           next_element,
                           feed_dict={handle_placeholder: iterator_4_handle}))
      self.assertEqual(3,
                       sess.run(
                           next_element,
                           feed_dict={handle_placeholder: iterator_3_handle}))
      self.assertEqual(40,
                       sess.run(
                           next_element,
                           feed_dict={handle_placeholder: iterator_4_handle}))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(
            next_element, feed_dict={handle_placeholder: iterator_3_handle})
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(
            next_element, feed_dict={handle_placeholder: iterator_4_handle})
Example #36
  def testReinitializableIterator(self):
    dataset_3 = dataset_ops.Dataset.from_tensors(
        constant_op.constant([1, 2, 3]))
    dataset_4 = dataset_ops.Dataset.from_tensors(
        constant_op.constant([4, 5, 6, 7]))
    iterator = iterator_ops.Iterator.from_structure(
        dataset_ops.get_legacy_output_types(dataset_3), [None])

    dataset_3_init_op = iterator.make_initializer(dataset_3)
    dataset_4_init_op = iterator.make_initializer(dataset_4)
    get_next = iterator.get_next()

    self.assertEqual(
        dataset_ops.get_legacy_output_types(dataset_3),
        dataset_ops.get_legacy_output_types(iterator))
    self.assertEqual(
        dataset_ops.get_legacy_output_types(dataset_4),
        dataset_ops.get_legacy_output_types(iterator))
    self.assertEqual(
        [None], dataset_ops.get_legacy_output_shapes(iterator).as_list())

    with self.cached_session() as sess:
      # The iterator is initially uninitialized.
      with self.assertRaises(errors.FailedPreconditionError):
        sess.run(get_next)

      # Initialize with one dataset.
      sess.run(dataset_3_init_op)
      self.assertAllEqual([1, 2, 3], sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)

      # Initialize with a different dataset.
      sess.run(dataset_4_init_op)
      self.assertAllEqual([4, 5, 6, 7], sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)

      # Reinitialize with the first dataset.
      sess.run(dataset_3_init_op)
      self.assertAllEqual([1, 2, 3], sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
 def testNestedDict(self):
     components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
     dataset = dataset_ops.Dataset.from_tensors(components)
     self.assertEqual(
         dtypes.int32,
         dataset_ops.get_legacy_output_types(dataset)["a"]["aa"])
     self.assertEqual(
         dtypes.float32,
         dataset_ops.get_legacy_output_types(dataset)["a"]["ab"])
     self.assertEqual(dtypes.int32,
                      dataset_ops.get_legacy_output_types(dataset)["b"])
     self.assertEqual(
         [],
         dataset_ops.get_legacy_output_shapes(dataset)["a"]["aa"])
     self.assertEqual(
         [2],
         dataset_ops.get_legacy_output_shapes(dataset)["a"]["ab"])
     self.assertEqual([3],
                      dataset_ops.get_legacy_output_shapes(dataset)["b"])
Example #38
    def testReinitializableIteratorEmptyDataset(self):
        dataset = dataset_ops.Dataset.range(0)
        iterator = iterator_ops.Iterator.from_structure(
            dataset_ops.get_legacy_output_types(dataset), [])
        init_op = iterator.make_initializer(dataset)

        with self.cached_session() as sess:
            sess.run(init_op)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(iterator.get_next())
Example #39
  def __init__(self, filenames, compression_type=None, buffer_size=None,
               num_parallel_reads=None):
    """Creates a `TFRecordDataset` to read for one or more TFRecord files.

    Args:
      filenames: A `tf.string` tensor or `tf.data.Dataset` containing one or
        more filenames.
      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
        bytes in the read buffer. If your input pipeline is I/O bottlenecked,
        consider setting this parameter to a value of 1-100 MB. If `None`, a
        sensible default for both local and remote file systems is used.
      num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
        number of files to read in parallel. If greater than one, the records of
        files read in parallel are output in an interleaved order. If your
        input pipeline is I/O bottlenecked, consider setting this parameter to a
        value greater than one to parallelize the I/O. If `None`, files will be
        read sequentially.

    Raises:
      TypeError: If any argument does not have the expected type.
      ValueError: If any argument does not have the expected shape.
    """
    if isinstance(filenames, dataset_ops.DatasetV2):
      if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
        raise TypeError(
            "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
      if not dataset_ops.get_legacy_output_shapes(filenames).is_compatible_with(
          tensor_shape.scalar()):
        raise ValueError(
            "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
            "elements.")
    else:
      filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
      filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
      filenames = dataset_ops.Dataset.from_tensor_slices(filenames)

    self._filenames = filenames
    self._compression_type = compression_type
    self._buffer_size = buffer_size
    self._num_parallel_reads = num_parallel_reads

    def read_one_file(filename):
      return _TFRecordDataset(filename, compression_type, buffer_size)

    if num_parallel_reads is None:
      self._impl = filenames.flat_map(read_one_file)
    else:
      self._impl = ParallelInterleaveDataset(
          filenames, read_one_file, cycle_length=num_parallel_reads,
          block_length=1, sloppy=False, buffer_output_elements=None,
          prefetch_input_elements=None)
    variant_tensor = self._impl._variant_tensor  # pylint: disable=protected-access
    super(TFRecordDatasetV2, self).__init__(variant_tensor)
Example #40
  def __init__(self, selector_input, data_inputs, stop_on_empty_dataset=False):
    self._selector_input = selector_input
    self._data_inputs = list(data_inputs)
    self._stop_on_empty_dataset = stop_on_empty_dataset

    first_output_types = dataset_ops.get_legacy_output_types(data_inputs[0])
    first_output_classes = dataset_ops.get_legacy_output_classes(data_inputs[0])

    for i, data_input in enumerate(data_inputs[1:]):
      if (dataset_ops.get_legacy_output_types(data_input) != first_output_types
          or dataset_ops.get_legacy_output_classes(data_input) !=
          first_output_classes):
        raise TypeError("All datasets must have the same type and class.\n"
                        "dataset 0 vs dataset %s types: %s ; %s\n"
                        "classes: %s ; %s" %
                        (i + 1, first_output_types,
                         dataset_ops.get_legacy_output_types(data_input),
                         first_output_classes,
                         dataset_ops.get_legacy_output_classes(data_input)))

    output_shapes = dataset_ops.get_legacy_output_shapes(self._data_inputs[0])
    for data_input in self._data_inputs[1:]:
      output_shapes = nest.pack_sequence_as(output_shapes, [
          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
              nest.flatten(output_shapes),
              nest.flatten(dataset_ops.get_legacy_output_shapes(data_input)))
      ])
    self._element_spec = structure.convert_legacy_structure(
        first_output_types, output_shapes, first_output_classes)

    compat_kwargs = {}
    if compat.forward_compatible(2021, 5, 14) or self._stop_on_empty_dataset:
      compat_kwargs["stop_on_empty_dataset"] = self._stop_on_empty_dataset

    # pylint: disable=protected-access
    variant_tensor = (
        gen_experimental_dataset_ops.directed_interleave_dataset(
            self._selector_input._variant_tensor,
            [data_input._variant_tensor for data_input in self._data_inputs],
            **compat_kwargs, **self._flat_structure))

    super(_DirectedInterleaveDataset, self).__init__(variant_tensor)
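
This class is the backing implementation for the public sampling/choosing helpers; a minimal sketch of the choose_from_datasets form (public API, TF 2.x assumed):

import tensorflow as tf

datasets = [tf.data.Dataset.from_tensors("a").repeat(),
            tf.data.Dataset.from_tensors("b").repeat()]
selector = tf.data.Dataset.range(2).repeat(3)  # emits 0, 1, 0, 1, 0, 1
chosen = tf.data.experimental.choose_from_datasets(datasets, selector)
print([x.numpy() for x in chosen])             # [b'a', b'b', b'a', b'b', b'a', b'b']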
Example #41
 def testUintInputs(self):
     elements = [(np.tile(np.array([[0], [1]], dtype=np.uint8), 2),
                  np.tile(np.array([[2], [256]], dtype=np.uint16), 2),
                  np.tile(np.array([[4], [65536]], dtype=np.uint32), 2),
                  np.tile(np.array([[8], [4294967296]], dtype=np.uint64),
                          2))]
     dataset = from_list.from_list(elements)
     self.assertEqual(
         (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64),
         dataset_ops.get_legacy_output_types(dataset))
     self.assertDatasetProduces(dataset, elements)
Example #42
def print_info_data(dataset, print_example=True, n_example=3):
    # Prints structure, shape, and type information for a GLUE TensorFlow dataset.
    print('# Structure of the data:\n\n   {}'.format(dataset))
    print('\n# Output shape of one entry:\n   {}'.format(dataset_ops.get_legacy_output_shapes(dataset)))
    print('\n# Output types of one entry:\n   {}'.format(dataset_ops.get_legacy_output_types(dataset)))
    print('\n# Output classes of one entry:\n   {}'.format(dataset_ops.get_legacy_output_classes(dataset)))
    print(' \n')
    np_array = np.array(list(dataset.as_numpy_iterator()))
    print('# Shape of the data:\n\n   {}'.format(np.shape(np_array)))
    if len(np_array) > 0:
        if type(np_array[0]) is dict:
            structure = list(np_array[0].keys())
            print('   ---> {} entries'.format(np.shape(np_array)[0]))
            print('   ---> {} dim'.format(np_array.ndim))
            print('        dict structure')
            print('           dim: {}'.format(len(structure)))
            print('           [{:9} / {:9} / {:9}]'.format(structure[0], structure[1], structure[2]))

            print('           [{:9} / {:9} / {:9}]'.format(str(np.shape(np_array[0].get(structure[0]))),
                                                           str(np.shape(np_array[0].get(structure[1]))),
                                                           str(np.shape(np_array[0].get(structure[2])))))
            print('           [{:9} / {:9} / {:9}]'.format(type(np_array[0].get(structure[0])).__name__,
                                                           type(np_array[0].get(structure[1])).__name__,
                                                           type(np_array[0].get(structure[2])).__name__))

        if type(np_array[0]) is np.ndarray:
            if type(np_array[0][0]) is dict:
                structure = list(np_array[0][0].keys())
                print('   ---> {} batches'.format(np.shape(np_array)[0]))
                print('   ---> {} dim'.format(np_array.ndim))
                print('        label')
                print('           shape: {}'.format(np_array[0][1].shape))
                print('        dict structure')
                print('           dim: {}'.format(len(structure)))
                print('           [{:15} / {:15} / {:15}]'.format(structure[0], structure[1], structure[2]))
                print('           [{:15} / {:15} / {:15}]'.format(str(np_array[0][0].get(structure[0]).shape),
                                                                  str(np_array[0][0].get(structure[1]).shape),
                                                                  str(np_array[0][0].get(structure[2]).shape)))
                print('           [{:15} / {:15} / {:15}]'.format(type(np_array[0][0].get(structure[0])).__name__,
                                                                  type(np_array[0][0].get(structure[1])).__name__,
                                                                  type(np_array[0][0].get(structure[2])).__name__))
            else:
                print('   ---> {} entries'.format(np.shape(np_array)[0]))
                print('   ---> {} dim'.format(np_array.ndim))
                print('           [{:15} / {:15} ]'.format('text', 'label'))
                print('           [{:15} / {:15} ]'.format(str(np_array[0][0].shape), str(np_array[0][1].shape)))
                print('           [{:15} / {:15} ]'.format(str(np_array[0][0].dtype), str(np_array[0][1].dtype)))

    if print_example:
        print('\n\n# Examples of data:')
        for i, ex in enumerate(np_array):
            print('{}'.format(pprint.pformat(ex)))
            if i + 1 >= n_example:
                break
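
A hypothetical invocation of the helper above on a small in-memory dataset; note that the dict branch indexes exactly three keys, so a three-key element structure is used here.

import tensorflow as tf

demo = tf.data.Dataset.from_tensor_slices({
    "idx": [0, 1, 2],
    "sentence": ["a", "b", "c"],
    "label": [1, 0, 1],
})
print_info_data(demo, print_example=True, n_example=2)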
Example #43
    def testUnbatchScalarDataset(self):
        data = tuple([math_ops.range(10) for _ in range(3)])
        data = dataset_ops.Dataset.from_tensor_slices(data)
        expected_types = (dtypes.int32, ) * 3
        data = data.batch(2)
        self.assertEqual(expected_types,
                         dataset_ops.get_legacy_output_types(data))
        data = data.apply(batching.unbatch())
        self.assertEqual(expected_types,
                         dataset_ops.get_legacy_output_types(data))

        iterator = data.make_one_shot_iterator()
        op = iterator.get_next()

        with self.cached_session() as sess:
            for i in range(10):
                self.assertEqual((i, ) * 3, sess.run(op))

            with self.assertRaises(errors.OutOfRangeError):
                sess.run(op)
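For comparison, a minimal eager-mode sketch of the same batch/unbatch round trip using the public TF 2.x API; this rewrite is an assumption and not part of the original graph-mode test.

import tensorflow as tf

# Three parallel int32 components, batched and then unbatched again.
data = tf.data.Dataset.from_tensor_slices(
    (tf.range(10), tf.range(10), tf.range(10)))
data = data.batch(2).unbatch()

for i, element in enumerate(data):
  # Each element is a tuple of three scalar tensors, all equal to i.
  assert tuple(int(t) for t in element) == (i, i, i)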
Example #44
    def __init__(self, selector_input, data_inputs):
        self._selector_input = selector_input
        self._data_inputs = list(data_inputs)

        first_output_types = dataset_ops.get_legacy_output_types(
            data_inputs[0])
        first_output_classes = dataset_ops.get_legacy_output_classes(
            data_inputs[0])

        for i, data_input in enumerate(data_inputs[1:]):
            if (dataset_ops.get_legacy_output_types(data_input) !=
                    first_output_types
                    or dataset_ops.get_legacy_output_classes(data_input) !=
                    first_output_classes):
                raise TypeError(
                    f"All datasets must have the same type and class.\n"
                    f"dataset 0 types vs dataset {i+1} types: "
                    f"{first_output_types}; "
                    f"{dataset_ops.get_legacy_output_types(data_input)}\n"
                    f"dataset 0 classes vs dataset {i+1} classes: "
                    f"{first_output_classes}; "
                    f"{dataset_ops.get_legacy_output_classes(data_input)}")

        output_shapes = dataset_ops.get_legacy_output_shapes(
            self._data_inputs[0])
        for data_input in self._data_inputs[1:]:
            output_shapes = nest.pack_sequence_as(output_shapes, [
                ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
                    nest.flatten(output_shapes),
                    nest.flatten(
                        dataset_ops.get_legacy_output_shapes(data_input)))
            ])

        self._element_spec = structure.convert_legacy_structure(
            first_output_types, output_shapes, first_output_classes)
        # pylint: disable=protected-access
        variant_tensor = gen_experimental_dataset_ops.directed_interleave_dataset(
            self._selector_input._variant_tensor,
            [data_input._variant_tensor for data_input in self._data_inputs],
            **self._flat_structure)
        super(_DirectedInterleaveDataset, self).__init__(variant_tensor)
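`_DirectedInterleaveDataset` is private plumbing; the public transformation that drives it by reading an index from a selector dataset is `tf.data.experimental.choose_from_datasets`. A hedged sketch, with illustrative dataset contents:

import tensorflow as tf

# Two inputs with identical types and classes, as the constructor above requires.
datasets = [
    tf.data.Dataset.from_tensors("foo").repeat(),
    tf.data.Dataset.from_tensors("bar").repeat(),
]
# The selector chooses which input supplies the next element: 0, 1, 0, 1, 0, 1.
choice = tf.data.Dataset.range(2).repeat(3)

result = tf.data.experimental.choose_from_datasets(datasets, choice)
print([elem.numpy() for elem in result])  # [b'foo', b'bar', b'foo', b'bar', b'foo', b'bar']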
Example #45
  def __init__(self, filenames, compression_type=None, buffer_size=None,
               num_parallel_reads=None):
    """Creates a `TFRecordDataset` to read one or more TFRecord files.

    NOTE: The `num_parallel_reads` argument can be used to improve performance
    when reading from a remote filesystem.

    Args:
      filenames: A `tf.string` tensor or `tf.data.Dataset` containing one or
        more filenames.
      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
        bytes in the read buffer. 0 means no buffering.
      num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
        number of files to read in parallel. Defaults to reading files
        sequentially.

    Raises:
      TypeError: If any argument does not have the expected type.
      ValueError: If any argument does not have the expected shape.
    """
    if isinstance(filenames, dataset_ops.DatasetV2):
      if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
        raise TypeError(
            "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
      if not dataset_ops.get_legacy_output_shapes(filenames).is_compatible_with(
          tensor_shape.scalar()):
        raise ValueError(
            "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
            "elements.")
    else:
      filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
      filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
      filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)

    self._filenames = filenames
    self._compression_type = compression_type
    self._buffer_size = buffer_size
    self._num_parallel_reads = num_parallel_reads

    def read_one_file(filename):
      return _TFRecordDataset(filename, compression_type, buffer_size)

    if num_parallel_reads is None:
      self._impl = filenames.flat_map(read_one_file)
    else:
      self._impl = ParallelInterleaveDataset(
          filenames, read_one_file, cycle_length=num_parallel_reads,
          block_length=1, sloppy=False, buffer_output_elements=None,
          prefetch_input_elements=None)
    variant_tensor = self._impl._variant_tensor  # pylint: disable=protected-access
    super(TFRecordDatasetV2, self).__init__(variant_tensor)
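A short usage sketch of the reader documented above; the file path, feature name, and schema are placeholders.

import tensorflow as tf

filenames = ["/tmp/example.tfrecord"]  # placeholder path
feature_spec = {"x": tf.io.FixedLenFeature([], tf.int64)}  # placeholder schema

dataset = tf.data.TFRecordDataset(
    filenames, compression_type="", num_parallel_reads=2)
# Parse each serialized tf.train.Example into a dict of tensors.
dataset = dataset.map(
    lambda record: tf.io.parse_single_example(record, feature_spec))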
Example #46
  def __init__(self, input_dataset):
    """See `unique()` for details."""
    self._input_dataset = input_dataset
    if dataset_ops.get_legacy_output_types(input_dataset) not in (
        dtypes.int32, dtypes.int64, dtypes.string):
      raise TypeError(
          "`tf.data.experimental.unique()` only supports inputs with a single "
          "`tf.int32`, `tf.int64`, or `tf.string` component.")
    variant_tensor = gen_experimental_dataset_ops.experimental_unique_dataset(
        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
        **dataset_ops.flat_structure(self))
    super(_UniqueDataset, self).__init__(input_dataset, variant_tensor)
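`_UniqueDataset` backs the public `tf.data.experimental.unique()` transformation; a minimal sketch with a single `tf.int32` component, as the type check above requires.

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([1, 2, 2, 3, 3, 3])
unique = dataset.apply(tf.data.experimental.unique())
print([int(x) for x in unique])  # [1, 2, 3]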
Example #47
  def __init__(self, input_dataset, batch_size, row_shape):
    """See `Dataset.dense_to_sparse_batch()` for more details."""
    if not isinstance(
        dataset_ops.get_legacy_output_types(input_dataset), dtypes.DType):
      raise TypeError("DenseToSparseDataset requires an input whose elements "
                      "have a single component, whereas the input has %r." %
                      dataset_ops.get_legacy_output_types(input_dataset))
    self._input_dataset = input_dataset
    self._batch_size = batch_size
    self._row_shape = row_shape
    self._structure = structure.SparseTensorStructure(
        dataset_ops.get_legacy_output_types(input_dataset),
        tensor_shape.vector(None).concatenate(self._row_shape))

    variant_tensor = ged_ops.experimental_dense_to_sparse_batch_dataset(
        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
        self._batch_size,
        row_shape=convert.partial_shape_to_tensor(self._row_shape),
        **dataset_ops.flat_structure(self))
    super(_DenseToSparseBatchDataset, self).__init__(input_dataset,
                                                     variant_tensor)
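A hedged sketch of the public wrapper, `tf.data.experimental.dense_to_sparse_batch`, which batches variable-length rows into one `SparseTensor` per batch; the toy data follows the pattern used in the TF documentation.

import tensorflow as tf

# Rows of length 0, 1, 2, 3; row_shape=[4] must be able to hold the longest row.
dataset = tf.data.Dataset.range(4).map(lambda i: tf.fill([i], i))
dataset = dataset.apply(
    tf.data.experimental.dense_to_sparse_batch(batch_size=2, row_shape=[4]))

for sparse_batch in dataset:
  print(sparse_batch.indices.numpy(), sparse_batch.values.numpy())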
Example #48
    def _next_func(string_handle):
      """Calls get_next for created iterator.

      Args:
        string_handle: An iterator string handle created by _init_func
      Returns:
        The elements generated from `input_dataset`
      """
      with ops.device(self._source_device_string):
        iterator = iterator_ops.Iterator.from_string_handle(
            string_handle,
            dataset_ops.get_legacy_output_types(self),
            dataset_ops.get_legacy_output_shapes(self),
            dataset_ops.get_legacy_output_classes(self))
      return self._element_structure._to_tensor_list(iterator.get_next())  # pylint: disable=protected-access
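This `_next_func` appears to be part of the host-side plumbing behind `tf.data.experimental.copy_to_device` (an assumption based on the surrounding names); a brief sketch of that public transformation.

import tensorflow as tf

dataset = tf.data.Dataset.range(10)
# Copy elements to another device before iterating; "/cpu:0" is illustrative,
# in practice the target is typically a GPU such as "/gpu:0".
dataset = dataset.apply(tf.data.experimental.copy_to_device("/cpu:0"))
dataset = dataset.prefetch(1)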
Example #49
  def __init__(self, selector_input, data_inputs):
    self._selector_input = selector_input
    self._data_inputs = list(data_inputs)

    first_output_types = dataset_ops.get_legacy_output_types(data_inputs[0])
    first_output_classes = dataset_ops.get_legacy_output_classes(data_inputs[0])

    for data_input in data_inputs[1:]:
      if (dataset_ops.get_legacy_output_types(data_input) != first_output_types
          or dataset_ops.get_legacy_output_classes(data_input)
          != first_output_classes):
        raise TypeError("All datasets must have the same type and class.")

    output_shapes = dataset_ops.get_legacy_output_shapes(self._data_inputs[0])
    for data_input in self._data_inputs[1:]:
      output_shapes = nest.pack_sequence_as(output_shapes, [
          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
              nest.flatten(output_shapes),
              nest.flatten(dataset_ops.get_legacy_output_shapes(data_input)))
      ])

    self._structure = structure.convert_legacy_structure(
        first_output_types, output_shapes, first_output_classes)
    super(_DirectedInterleaveDataset, self).__init__()
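The same private class also underlies `tf.data.experimental.sample_from_datasets`, which picks the next input at random instead of from an explicit selector; a minimal sketch with illustrative weights.

import tensorflow as tf

datasets = [
    tf.data.Dataset.from_tensors(0).repeat(),
    tf.data.Dataset.from_tensors(1).repeat(),
]
# Randomly interleave the two inputs, favoring the first roughly 3:1.
mixed = tf.data.experimental.sample_from_datasets(datasets, weights=[0.75, 0.25])
print([int(x) for x in mixed.take(8)])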
Example #50
  def testEnumerate(self):
    components = (["a", "b"], [1, 2], [37.0, 38])
    start = constant_op.constant(20, dtype=dtypes.int64)

    dataset = dataset_ops.Dataset.from_tensor_slices(components).enumerate(
        start)

    self.assertEqual(dtypes.int64,
                     dataset_ops.get_legacy_output_types(dataset)[0])
    dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
    self.assertEqual((), dataset_output_shapes[0])
    self.assertEqual([tensor_shape.TensorShape([])] * 3,
                     [shape for shape in dataset_output_shapes[1]])

    self.assertDatasetProduces(dataset, [(20, (b"a", 1, 37.0)),
                                         (21, (b"b", 2, 38.0))])
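For reference, an eager-mode sketch of the same `enumerate(start)` behaviour; not part of the original test.

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(["a", "b"]).enumerate(start=20)
for index, value in dataset:
  print(int(index), value.numpy())  # 20 b'a', then 21 b'b'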
Example #51
  def testCounter(self):
    """Test dataset construction using `count`."""
    dataset = counter.Counter(start=3, step=4)
    self.assertEqual(
        [], dataset_ops.get_legacy_output_shapes(dataset).as_list())
    self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(dataset))
    get_next = self.getNext(dataset)

    negative_dataset = counter.Counter(start=0, step=-1)
    negative_get_next = self.getNext(negative_dataset)

    self.assertEqual(3, self.evaluate(get_next()))
    self.assertEqual(3 + 4, self.evaluate(get_next()))
    self.assertEqual(3 + 2 * 4, self.evaluate(get_next()))

    self.assertEqual(0, self.evaluate(negative_get_next()))
    self.assertEqual(-1, self.evaluate(negative_get_next()))
    self.assertEqual(-2, self.evaluate(negative_get_next()))
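`counter.Counter` is exposed publicly as `tf.data.experimental.Counter`; a minimal sketch of the same start/step behaviour.

import tensorflow as tf

counter = tf.data.experimental.Counter(start=3, step=4)
print([int(x) for x in counter.take(3)])  # [3, 7, 11]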
Example #52
  def testIteratorStructure(self, tf_value_fn, expected_element_structure,
                            expected_output_classes, expected_output_types,
                            expected_output_shapes):
    tf_value = tf_value_fn()
    iterator = dataset_ops.make_one_shot_iterator(
        dataset_ops.Dataset.from_tensors(tf_value))

    self.assertTrue(expected_element_structure.is_compatible_with(
        iterator._element_structure))
    self.assertTrue(iterator._element_structure.is_compatible_with(
        expected_element_structure))

    self.assertEqual(expected_output_classes,
                     dataset_ops.get_legacy_output_classes(iterator))
    self.assertEqual(expected_output_types,
                     dataset_ops.get_legacy_output_types(iterator))
    self.assertEqual(expected_output_shapes,
                     dataset_ops.get_legacy_output_shapes(iterator))
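In current TF 2.x the structure checked above is exposed publicly as `element_spec` on both datasets and iterators; a brief sketch, with the sample element chosen arbitrarily.

import tensorflow as tf

dataset = tf.data.Dataset.from_tensors((tf.constant(37.0), tf.constant("Foo")))
print(dataset.element_spec)        # tuple of TensorSpec objects
print(iter(dataset).element_spec)  # the iterator reports the same spec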
Example #53
  def MapFn(unused_input):
    source_dataset_output_types = dataset_ops.get_legacy_output_types(
        source_dataset)
    if isinstance(source_dataset_output_types, dtypes.DType):
      output_types = [source_dataset_output_types]
    elif isinstance(source_dataset_output_types, (list, tuple)):
      output_types = source_dataset_output_types
    else:
      raise ValueError('source dataset has invalid output types')
    remote_calls = functional_ops.remote_call(
        args=[source_handle],
        Tout=output_types,
        f=LoadingFunc,
        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
    if len(remote_calls) == 1:
      return remote_calls[0]
    else:
      return remote_calls
Example #54
  def testRemoteIteratorWithoutRemoteCallFail(self):
    worker_config = config_pb2.ConfigProto()
    worker_config.device_count["CPU"] = 2
    worker, _ = test_util.create_local_cluster(
        1, 1, worker_config=worker_config)

    with ops.device("/job:worker/replica:0/task:0/cpu:1"):
      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
      iterator_3_handle = iterator_3.string_handle()

    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
      remote_it = iterator_ops.Iterator.from_string_handle(
          iterator_3_handle, dataset_ops.get_legacy_output_types(dataset_3),
          dataset_ops.get_legacy_output_shapes(dataset_3))
      get_next_op = remote_it.get_next()

    with session.Session(worker[0].target) as sess:
      with self.assertRaises(errors.InvalidArgumentError):
        sess.run(get_next_op)
Example #55
  def write(self, dataset):
    """Returns a `tf.Operation` to write a dataset to a file.

    Args:
      dataset: a `tf.data.Dataset` whose elements are to be written to a file

    Returns:
      A `tf.Operation` that, when run, writes contents of `dataset` to a file.
    """
    if not isinstance(dataset, dataset_ops.DatasetV2):
      raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
    if not dataset_ops.get_structure(dataset).is_compatible_with(
        structure.TensorStructure(dtypes.string, [])):
      raise TypeError(
          "`dataset` must produce scalar `DT_STRING` tensors whereas it "
          "produces shape {0} and types {1}".format(
              dataset_ops.get_legacy_output_shapes(dataset),
              dataset_ops.get_legacy_output_types(dataset)))
    return gen_experimental_dataset_ops.experimental_dataset_to_tf_record(
        dataset._variant_tensor, self._filename, self._compression_type)  # pylint: disable=protected-access
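A hedged usage sketch of the writer documented above via its public entry point, `tf.data.experimental.TFRecordWriter`; the output path is a placeholder.

import tensorflow as tf

# A dataset of scalar DT_STRING tensors, as write() requires.
dataset = tf.data.Dataset.from_tensor_slices([b"record-0", b"record-1"])
writer = tf.data.experimental.TFRecordWriter("/tmp/out.tfrecord")
writer.write(dataset)  # executes eagerly; in graph mode it returns a tf.Operation

# Round trip through TFRecordDataset to check the contents.
print([r.numpy() for r in tf.data.TFRecordDataset("/tmp/out.tfrecord")])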
Example #56
  def testKinesisDatasetOneShard(self):
    client = boto3.client('kinesis', region_name='us-east-1')

    # Setup the Kinesis with 1 shard.
    stream_name = "tf_kinesis_test_1"
    client.create_stream(StreamName=stream_name, ShardCount=1)
    # Wait until stream exists, default is 10 * 18 seconds.
    client.get_waiter('stream_exists').wait(StreamName=stream_name)
    for i in range(10):
      data = "D" + str(i)
      client.put_record(
          StreamName=stream_name, Data=data, PartitionKey="TensorFlow" + str(i))

    stream = array_ops.placeholder(dtypes.string, shape=[])
    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    repeat_dataset = kinesis_dataset_ops.KinesisDataset(
        stream, read_indefinitely=False).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = iterator_ops.Iterator.from_structure(
        dataset_ops.get_legacy_output_types(batch_dataset))
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      # Basic test: read from shard 0 of stream 1.
      sess.run(init_op, feed_dict={stream: stream_name, num_epochs: 1})
      for i in range(10):
        self.assertEqual("D" + str(i), sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)

    client.delete_stream(StreamName=stream_name)
    # Wait until stream deleted, default is 10 * 18 seconds.
    client.get_waiter('stream_not_exists').wait(StreamName=stream_name)
Example #57
  def testRestructureDataset(self):
    components = (array_ops.placeholder(dtypes.int32),
                  (array_ops.placeholder(dtypes.int32, shape=[None]),
                   array_ops.placeholder(dtypes.int32, shape=[20, 30])))
    dataset = dataset_ops.Dataset.from_tensors(components)

    i32 = dtypes.int32

    test_cases = [((i32, i32, i32), None),
                  (((i32, i32), i32), None),
                  ((i32, i32, i32), (None, None, None)),
                  ((i32, i32, i32), ([17], [17], [20, 30]))]

    for new_types, new_shape_lists in test_cases:
      # pylint: disable=protected-access
      new = batching._RestructuredDataset(dataset, new_types, new_shape_lists)
      # pylint: enable=protected-access
      self.assertEqual(new_types, dataset_ops.get_legacy_output_types(new))
      if new_shape_lists is not None:
        for expected_shape_list, shape in zip(
            nest.flatten(new_shape_lists),
            nest.flatten(dataset_ops.get_legacy_output_shapes(new))):
          if expected_shape_list is None:
            self.assertIs(None, shape.ndims)
          else:
            self.assertEqual(expected_shape_list, shape.as_list())

    fail_cases = [((i32, dtypes.int64, i32), None),
                  ((i32, i32, i32, i32), None),
                  ((i32, i32, i32), ((None, None), None)),
                  ((i32, i32, i32), (None, None, None, None)),
                  ((i32, i32, i32), (None, [None], [21, 30]))]

    for new_types, new_shape_lists in fail_cases:
      with self.assertRaises(ValueError):
        # pylint: disable=protected-access
        new = batching._RestructuredDataset(dataset, new_types, new_shape_lists)
Example #58
  def __init__(self, input_dataset):
    """See `unbatch()` for more details."""
    input_shapes = dataset_ops.get_legacy_output_shapes(input_dataset)
    flat_shapes = nest.flatten(input_shapes)
    if any(s.ndims == 0 for s in flat_shapes):
      raise ValueError("Cannot unbatch an input with scalar components.")
    known_batch_dim = tensor_shape.Dimension(None)
    for s in flat_shapes:
      try:
        known_batch_dim = known_batch_dim.merge_with(s[0])
      except ValueError:
        raise ValueError("Cannot unbatch an input whose components have "
                         "different batch sizes.")
    self._input_dataset = input_dataset

    self._structure = structure.convert_legacy_structure(
        dataset_ops.get_legacy_output_types(input_dataset),
        nest.map_structure(lambda s: s[1:], input_shapes),
        dataset_ops.get_legacy_output_classes(input_dataset))

    variant_tensor = ged_ops.experimental_unbatch_dataset(
        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
        **dataset_ops.flat_structure(self))
    super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
Example #59
  def _remote_fn(h):
    handle = script_ops.py_func(_encode_raw, [h], dtypes.string)
    remote_iterator = iterator_ops.Iterator.from_string_handle(
        handle, dataset_ops.get_legacy_output_types(dataset_3),
        dataset_ops.get_legacy_output_shapes(dataset_3))
    return remote_iterator.get_next()
Example #60
  def loading_func(h):
    remote_itr = iterator_ops.Iterator.from_string_handle(
        h, dataset_ops.get_legacy_output_types(itr),
        dataset_ops.get_legacy_output_shapes(itr))
    return remote_itr.get_next()