def testNonSequenceNestedStructure(self):
  """Tracks legacy types/shapes of a single-tensor dataset across transforms."""
  components = np.array([1, 2, 3], dtype=np.int64)

  def check_int64_with_shape(ds, expected_shape):
    # Every stage of the pipeline is expected to stay int64; only the
    # static shape changes.
    self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(ds))
    self.assertEqual(expected_shape,
                     dataset_ops.get_legacy_output_shapes(ds))

  dataset = dataset_ops.Dataset.from_tensors(components)
  check_int64_with_shape(dataset, [3])

  dataset = dataset.filter(
      lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
  check_int64_with_shape(dataset, [3])

  dataset = dataset.map(lambda x: array_ops.stack([x, x]))
  check_int64_with_shape(dataset, [2, 3])

  dataset = dataset.flat_map(
      lambda x: dataset_ops.Dataset.from_tensor_slices(x))
  check_int64_with_shape(dataset, [3])

  # `get_next` is invoked once per assertion, exactly as many times as the
  # checks below need.
  get_next = self.getNext(dataset)
  self.assertEqual(dtypes.int64, get_next().dtype)
  self.assertEqual([3], get_next().shape)
def testUnbatchScalarDataset(self):
  """Batching then unbatching a tuple of scalars keeps types and values."""
  columns = tuple(math_ops.range(10) for _ in range(3))
  dataset = dataset_ops.Dataset.from_tensor_slices(columns)
  expected_types = (dtypes.int32,) * 3

  batched = dataset.batch(2)
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(batched))

  unbatched = batched.apply(batching.unbatch())
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(unbatched))

  expected_output = [(i,) * 3 for i in range(10)]
  self.assertDatasetProduces(unbatched, expected_output)
def testUnbatchDatasetWithStrings(self):
  """Unbatching keeps a string component between two int32 components."""
  columns = tuple(math_ops.range(10) for _ in range(3))
  dataset = dataset_ops.Dataset.from_tensor_slices(columns)
  # Turn the middle component into its string representation.
  dataset = dataset.map(lambda x, y, z: (x, string_ops.as_string(y), z))
  expected_types = (dtypes.int32, dtypes.string, dtypes.int32)

  batched = dataset.batch(2)
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(batched))

  unbatched = batched.apply(batching.unbatch())
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(unbatched))

  expected_output = [(i, compat.as_bytes(str(i)), i) for i in range(10)]
  self.assertDatasetProduces(unbatched, expected_output)
def testNestedDict(self):
  """Checks legacy types and shapes reported for a nested-dict element."""
  components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
  dataset = dataset_ops.Dataset.from_tensors(components)

  output_types = dataset_ops.get_legacy_output_types(dataset)
  self.assertEqual(dtypes.int32, output_types["a"]["aa"])
  self.assertEqual(dtypes.float32, output_types["a"]["ab"])
  self.assertEqual(dtypes.int32, output_types["b"])

  output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  self.assertEqual([], output_shapes["a"]["aa"])
  self.assertEqual([2], output_shapes["a"]["ab"])
  self.assertEqual([3], output_shapes["b"])
def testUnbatchMultiElementTupleDataset(self):
  """Unbatching (via `batching.unbatch()`) keeps a tuple-of-pairs structure."""
  pairs = tuple(
      (math_ops.range(10 * i, 10 * i + 10), array_ops.fill([10], "hi"))
      for i in range(3))
  dataset = dataset_ops.Dataset.from_tensor_slices(pairs)
  expected_types = ((dtypes.int32, dtypes.string),) * 3

  batched = dataset.batch(2)
  self.assertAllEqual(expected_types,
                      dataset_ops.get_legacy_output_types(batched))

  unbatched = batched.apply(batching.unbatch())
  self.assertAllEqual(expected_types,
                      dataset_ops.get_legacy_output_types(unbatched))

  expected_output = [((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi"))
                     for i in range(10)]
  self.assertDatasetProduces(unbatched, expected_output)
def _apply_fn(dataset):
  """Function from `Dataset` to `Dataset` that applies the transformation.

  Args:
    dataset: The input dataset whose elements are to be unbatched.

  Returns:
    An `_UnbatchDataset` built on a re-encoded copy of `dataset`.
  """

  # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
  # are normalized to the rank-1 dense representation, so that the
  # sparse-oblivious unbatching logic will slice them
  # appropriately. This leads to a somewhat inefficient re-encoding step
  # for all SparseTensor components.
  # TODO(mrry): Consider optimizing this in future if it turns out to be
  # a bottleneck.
  def normalize(arg, *rest):
    # Multi-component elements arrive as separate positional arguments;
    # repack them into one tuple before encoding.
    element = (arg,) + rest if rest else arg
    # pylint: disable=protected-access
    return dataset._element_structure._to_batched_tensor_list(element)

  normalized_dataset = dataset.map(normalize)

  # NOTE(mrry): Our `map()` has lost information about the sparseness
  # of any SparseTensor components, so re-apply the structure of the
  # original dataset.
  original_types = dataset_ops.get_legacy_output_types(dataset)
  original_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  original_classes = dataset_ops.get_legacy_output_classes(dataset)
  restructured_dataset = _RestructuredDataset(
      normalized_dataset,
      original_types,
      original_shapes,
      original_classes,
      allow_unsafe_cast=True)
  return _UnbatchDataset(restructured_dataset)
def testIteratorStringHandleReuseTensorObject(self):
  """The default string handle is cached; a named handle creates a new op."""
  dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
  one_shot_iterator = dataset_ops.make_one_shot_iterator(dataset)
  initializable_iterator = dataset_ops.make_initializable_iterator(dataset)
  structure_iterator = iterator_ops.Iterator.from_structure(
      dataset_ops.get_legacy_output_types(dataset))

  created_ops = len(ops.get_default_graph().get_operations())

  all_iterators = (one_shot_iterator, initializable_iterator,
                   structure_iterator)
  for iterator in all_iterators:
    self.assertIs(iterator.string_handle(), iterator.string_handle())

  # Assert that getting the (default) string handle creates no ops.
  self.assertEqual(created_ops,
                   len(ops.get_default_graph().get_operations()))

  # Specifying an explicit name will create a new op.
  handle_with_name = one_shot_iterator.string_handle(name="foo")
  self.assertEqual("foo", handle_with_name.op.name)
  self.assertIsNot(one_shot_iterator.string_handle(), handle_with_name)

  # Asking for the same name again yields a second, uniquified op.
  handle_with_same_name = one_shot_iterator.string_handle(name="foo")
  self.assertEqual("foo_1", handle_with_same_name.op.name)
  self.assertIsNot(handle_with_name, handle_with_same_name)
def testUnbatchMultiElementTupleDataset(self):
  """`Dataset.unbatch()` keeps a tuple-of-pairs structure and its values."""
  pairs = tuple(
      (math_ops.range(10 * i, 10 * i + 10), array_ops.fill([10], "hi"))
      for i in range(3))
  dataset = dataset_ops.Dataset.from_tensor_slices(pairs)
  expected_types = ((dtypes.int32, dtypes.string),) * 3

  batched = dataset.batch(2)
  self.assertAllEqual(expected_types,
                      dataset_ops.get_legacy_output_types(batched))

  unbatched = batched.unbatch()
  self.assertAllEqual(expected_types,
                      dataset_ops.get_legacy_output_types(unbatched))

  expected_output = [((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi"))
                     for i in range(10)]
  self.assertDatasetProduces(unbatched, expected_output)
def assertDatasetsEqual(self, dataset1, dataset2):
  """Checks that datasets are equal. Supports both graph and eager mode.

  The element structures must be compatible, both datasets must be
  exhausted after the same number of elements, and every flattened
  component must match: sparse values via `assertSparseValuesEqual`,
  ragged and string components via `assertAllEqual`, everything else via
  `assertAllClose`.

  Args:
    dataset1: The first dataset; its legacy output types decide which
      components are compared as strings.
    dataset2: The dataset compared against `dataset1`.
  """
  self.assertTrue(
      structure.are_compatible(
          dataset_ops.get_structure(dataset1),
          dataset_ops.get_structure(dataset2)))

  flattened_types = nest.flatten(
      dataset_ops.get_legacy_output_types(dataset1))

  next1 = self.getNext(dataset1)
  next2 = self.getNext(dataset2)

  while True:
    try:
      op1 = self.evaluate(next1())
    except errors.OutOfRangeError:
      # `dataset1` is exhausted, so `dataset2` must be exhausted too.
      with self.assertRaises(errors.OutOfRangeError):
        self.evaluate(next2())
      break
    op2 = self.evaluate(next2())

    op1 = nest.flatten(op1)
    op2 = nest.flatten(op2)
    # Use a unittest assertion rather than a bare `assert`: it is not
    # stripped under `python -O` and reports both lengths on failure.
    self.assertEqual(len(op1), len(op2))
    for i, (value1, value2) in enumerate(zip(op1, op2)):
      if sparse_tensor.is_sparse(value1):
        self.assertSparseValuesEqual(value1, value2)
      elif ragged_tensor.is_ragged(value1):
        self.assertAllEqual(value1, value2)
      elif flattened_types[i] == dtypes.string:
        self.assertAllEqual(value1, value2)
      else:
        self.assertAllClose(value1, value2)
def _create_or_validate_filenames_dataset(filenames, name=None):
  """Creates (or validates) a dataset of filenames.

  Args:
    filenames: Either a list or dataset of filenames. If it is a list, it is
      converted to a dataset. If it is a dataset, its type and shape are
      validated.
    name: (Optional.) A name for the tf.data operation.

  Returns:
    A dataset of filenames.

  Raises:
    TypeError: If the filenames are not `tf.string`, or if a dataset of
      filenames has non-scalar elements.
  """
  if not isinstance(filenames, dataset_ops.DatasetV2):
    # Non-dataset input: normalise path-like objects, convert to a string
    # tensor, flatten and slice it into a dataset.
    filenames = nest.map_structure(_normalise_fspath, filenames)
    filenames = ops.convert_to_tensor(filenames, dtype_hint=dtypes.string)
    if filenames.dtype != dtypes.string:
      raise TypeError(
          "The `filenames` argument must contain `tf.string` elements. Got "
          f"`{filenames.dtype!r}` elements.")
    flat_filenames = array_ops.reshape(
        filenames, [-1], name="flat_filenames")
    return dataset_ops.TensorSliceDataset(
        flat_filenames, is_files=True, name=name)

  # Dataset input: validate only, and hand the same dataset back.
  element_type = dataset_ops.get_legacy_output_types(filenames)
  if element_type != dtypes.string:
    raise TypeError(
        "The `filenames` argument must contain `tf.string` elements. Got a "
        f"dataset of `{element_type!r}` elements.")

  element_shape = dataset_ops.get_legacy_output_shapes(filenames)
  scalar_shape = tensor_shape.TensorShape([])
  if not element_shape.is_compatible_with(scalar_shape):
    raise TypeError(
        "The `filenames` argument must contain `tf.string` elements of shape "
        "[] (i.e. scalars). Got a dataset of element shape "
        f"{element_shape!r}.")
  return filenames
def _create_or_validate_filenames_dataset(filenames):
  """Creates (or validates) a dataset of filenames.

  Args:
    filenames: Either a list or dataset of filenames. If it is a list, it is
      converted to a dataset. If it is a dataset, its type and shape are
      validated.

  Returns:
    A dataset of filenames.

  Raises:
    TypeError: If `filenames` is a dataset whose elements are not scalar
      `tf.string` values.
  """
  if isinstance(filenames, dataset_ops.DatasetV2):
    if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
    # `tensor_shape.scalar()` is a deprecated helper; spell the scalar shape
    # explicitly, as the other filename validators here already do.
    scalar_shape = tensor_shape.TensorShape([])
    element_shape = dataset_ops.get_legacy_output_shapes(filenames)
    if not element_shape.is_compatible_with(scalar_shape):
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
          "elements.")
  else:
    # Non-dataset input: convert to a string tensor, flatten, then slice it
    # into one element per filename.
    filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
    filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
    filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)

  return filenames
def assertDatasetsEqual(self, dataset1, dataset2):
  """Checks that datasets are equal. Supports both graph and eager mode.

  The element structures must be compatible in both directions, both
  datasets must be exhausted after the same number of elements, and every
  flattened component must match: sparse values via
  `assertSparseValuesEqual`, string components via `assertAllEqual`,
  everything else via `assertAllClose`.

  Args:
    dataset1: The first dataset; its legacy output types decide which
      components are compared as strings.
    dataset2: The dataset compared against `dataset1`.
  """
  structure1 = dataset_ops.get_structure(dataset1)
  structure2 = dataset_ops.get_structure(dataset2)
  self.assertTrue(structure1.is_compatible_with(structure2))
  self.assertTrue(structure2.is_compatible_with(structure1))

  flattened_types = nest.flatten(
      dataset_ops.get_legacy_output_types(dataset1))

  next1 = self.getNext(dataset1)
  next2 = self.getNext(dataset2)

  while True:
    try:
      op1 = self.evaluate(next1())
    except errors.OutOfRangeError:
      # `dataset1` is exhausted, so `dataset2` must be exhausted too.
      with self.assertRaises(errors.OutOfRangeError):
        self.evaluate(next2())
      break
    op2 = self.evaluate(next2())

    op1 = nest.flatten(op1)
    op2 = nest.flatten(op2)
    # Use a unittest assertion rather than a bare `assert`: it is not
    # stripped under `python -O` and reports both lengths on failure.
    self.assertEqual(len(op1), len(op2))
    for i, (value1, value2) in enumerate(zip(op1, op2)):
      if sparse_tensor.is_sparse(value1):
        self.assertSparseValuesEqual(value1, value2)
      elif flattened_types[i] == dtypes.string:
        self.assertAllEqual(value1, value2)
      else:
        self.assertAllClose(value1, value2)
def __init__(self, input_dataset):
  """See `unbatch()` for more details.

  Args:
    input_dataset: The dataset whose leading dimension is removed from every
      component.

  Raises:
    ValueError: If any component is a scalar, or if the components' leading
      dimensions cannot be merged into a single batch size.
  """
  input_shapes = dataset_ops.get_legacy_output_shapes(input_dataset)
  flat_shapes = nest.flatten(input_shapes)

  for shape in flat_shapes:
    if shape.ndims == 0:
      raise ValueError("Cannot unbatch an input with scalar components.")

  # Fold every component's leading dimension into one; `merge_with` raises
  # when two statically known sizes disagree.
  known_batch_dim = tensor_shape.Dimension(None)
  for shape in flat_shapes:
    try:
      known_batch_dim = known_batch_dim.merge_with(shape[0])
    except ValueError:
      raise ValueError(
          "Cannot unbatch an input whose components have "
          "different batch sizes.")

  def _drop_leading_dim(shape):
    return shape[1:]

  self._input_dataset = input_dataset
  self._structure = structure.convert_legacy_structure(
      dataset_ops.get_legacy_output_types(input_dataset),
      nest.map_structure(_drop_leading_dim, input_shapes),
      dataset_ops.get_legacy_output_classes(input_dataset))

  # pylint: disable=protected-access
  input_variant = self._input_dataset._variant_tensor
  # pylint: enable=protected-access
  variant_tensor = ged_ops.experimental_unbatch_dataset(
      input_variant, **dataset_ops.flat_structure(self))
  super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
def _apply_fn(dataset):
  """Function from `Dataset` to `Dataset` that applies the transformation.

  Args:
    dataset: The input dataset whose elements are to be unbatched.

  Returns:
    An `_UnbatchDataset` built on a re-encoded copy of `dataset`.
  """

  # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
  # are normalized to the rank-1 dense representation, so that the
  # sparse-oblivious unbatching logic will slice them
  # appropriately. This leads to a somewhat inefficient re-encoding step
  # for all SparseTensor components.
  # TODO(mrry): Consider optimizing this in future if it turns out to be
  # a bottleneck.
  def normalize(arg, *rest):
    # A multi-component element is passed as several positional arguments;
    # gather them back into a single tuple first.
    packed = (arg,) + rest if rest else arg
    # pylint: disable=protected-access
    return dataset._element_structure._to_batched_tensor_list(packed)

  normalized_dataset = dataset.map(normalize)

  # NOTE(mrry): Our `map()` has lost information about the sparseness
  # of any SparseTensor components, so re-apply the structure of the
  # original dataset.
  legacy_types = dataset_ops.get_legacy_output_types(dataset)
  legacy_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  legacy_classes = dataset_ops.get_legacy_output_classes(dataset)
  restructured_dataset = _RestructuredDataset(
      normalized_dataset,
      legacy_types,
      legacy_shapes,
      legacy_classes,
      allow_unsafe_cast=True)
  return _UnbatchDataset(restructured_dataset)
def __init__(self, input_dataset, num_workers):
  """Builds a rebatching dataset whose leading dimension is split by workers.

  Args:
    input_dataset: The dataset to rebatch.
    num_workers: The divisor applied to the first dimension of every
      component shape; also forwarded to the underlying op.
  """
  self._input_dataset = input_dataset

  def recalculate_output_shapes(output_shapes):
    """Recalculates the output_shapes after dividing it by num_workers."""
    if len(output_shapes) < 1:
      raise ValueError("Input shape should have at least one dimension.")

    # Divisibility can only be checked when the first dimension is
    # statically known (and non-zero).
    first_dim = tensor_shape.dimension_value(output_shapes[0])
    if first_dim and first_dim % num_workers != 0:
      raise errors.InvalidArgumentError(
          None, None,
          "First dim of input shape: %d is not divisible by num_workers: %d" %
          (output_shapes[0], num_workers))

    output_dims = list(output_shapes.dims)
    output_dims[0] = output_dims[0] // num_workers
    return tensor_shape.TensorShape(output_dims)

  input_types = dataset_ops.get_legacy_output_types(self._input_dataset)
  input_shapes = dataset_ops.get_legacy_output_shapes(self._input_dataset)
  input_classes = dataset_ops.get_legacy_output_classes(self._input_dataset)
  output_shapes = nest.map_structure(recalculate_output_shapes, input_shapes)

  self._structure = structure.convert_legacy_structure(
      input_types, output_shapes, input_classes)

  # pylint: disable=protected-access
  input_variant = self._input_dataset._variant_tensor
  # pylint: enable=protected-access
  variant_tensor = ged_ops.experimental_rebatch_dataset(
      input_variant,
      num_workers=num_workers,
      **dataset_ops.flat_structure(self))
  super(_RebatchDataset, self).__init__(input_dataset, variant_tensor)
def _create_or_validate_filenames_dataset(filenames):
  """Creates (or validates) a dataset of filenames.

  Args:
    filenames: Either a list or dataset of filenames. If it is a list, it is
      converted to a dataset. If it is a dataset, its type and shape are
      validated.

  Returns:
    A dataset of filenames.

  Raises:
    TypeError: If the filenames are not `tf.string`, or if a dataset of
      filenames has non-scalar elements.
  """
  if not isinstance(filenames, dataset_ops.DatasetV2):
    # Non-dataset input: convert to a tensor, insist on strings, flatten,
    # then slice into one element per filename.
    filenames = ops.convert_to_tensor(filenames, dtype_hint=dtypes.string)
    if filenames.dtype != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.Tensor` of dtype `tf.string` dtype."
          " Got {}".format(filenames.dtype))
    flat_filenames = array_ops.reshape(
        filenames, [-1], name="flat_filenames")
    return dataset_ops.DatasetV2.from_tensor_slices(flat_filenames)

  # Dataset input: validate only, and hand the same dataset back.
  if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
    raise TypeError(
        "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")

  element_shape = dataset_ops.get_legacy_output_shapes(filenames)
  if not element_shape.is_compatible_with(tensor_shape.TensorShape([])):
    raise TypeError(
        "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
        "elements.")
  return filenames
def write(self, dataset, column_families, columns, timestamp=None):
  """Writes a dataset to the table.

  Args:
    dataset: A `tf.data.Dataset` to be written to this table. It must produce
      a list of number-of-columns+1 elements, all of which must be strings.
      The first value will be used as the row key, and subsequent values will
      be used as cell values for the corresponding columns from the
      corresponding column_families and columns entries.
    column_families: A `tf.Tensor` of `tf.string`s corresponding to the
      column families to store the dataset's elements into. Must have the
      same length as `columns`.
    columns: A `tf.Tensor` of `tf.string`s corresponding to the column names
      to store the dataset's elements into.
    timestamp: (Optional.) An int64 timestamp to write all the values at.
      Leave as None to use server-provided timestamps.

  Returns:
    A `tf.Operation` that can be run to perform the write.

  Raises:
    ValueError: If there are unexpected or incompatible types, or if the
      number of columns and column_families does not match the output of
      `dataset`.
  """
  if timestamp is None:
    timestamp = -1  # Bigtable server provided timestamp.

  # Flatten once; the result drives both the dtype check and the
  # component-count check below.
  flat_types = nest.flatten(dataset_ops.get_legacy_output_types(dataset))
  for tensor_type in flat_types:
    if tensor_type != dtypes.string:
      raise ValueError("Not all elements of the dataset were `tf.string`")

  # `tensor_shape.scalar()` is a deprecated helper; spell the scalar shape
  # explicitly instead.
  scalar_shape = tensor_shape.TensorShape([])
  for shape in nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)):
    if not shape.is_compatible_with(scalar_shape):
      raise ValueError("Not all elements of the dataset were scalars")

  if len(column_families) != len(columns):
    raise ValueError("len(column_families) != len(columns)")
  # One extra component is required for the row key.
  if len(flat_types) != len(columns) + 1:
    raise ValueError("A column name must be specified for every component of "
                     "the dataset elements. (e.g.: len(columns) != "
                     "len(dataset.output_types))")

  return gen_bigtable_ops.dataset_to_bigtable(
      self._resource,
      dataset._variant_tensor,  # pylint: disable=protected-access
      column_families,
      columns,
      timestamp)
def testNegativeStep(self, output_type):
  """`Dataset.range` with start < stop and a negative step matches `np.arange`."""
  range_args = (2, 10, -1)  # (start, stop, step)
  dataset = dataset_ops.Dataset.range(*range_args, output_type=output_type)
  expected_output = np.arange(*range_args, dtype=output_type.as_numpy_dtype)

  self.assertDatasetProduces(dataset, expected_output=expected_output)
  self.assertEqual(output_type, dataset_ops.get_legacy_output_types(dataset))
def testUnbatchDatasetWithUintDtypes(self):
  """Batch/unbatch round trip keeps uint8/16/32/64 types and values."""

  def column(values, dtype):
    # Each value becomes a row of two identical entries.
    return np.tile(np.array([[v] for v in values], dtype=dtype), 2)

  components = (
      column([0, 1, 2, 3], np.uint8),
      column([1, 2, 3, 256], np.uint16),
      column([2, 3, 4, 65536], np.uint32),
      column([3, 4, 5, 4294967296], np.uint64),
  )
  expected_types = (dtypes.uint8, dtypes.uint16, dtypes.uint32,
                    dtypes.uint64)
  expected_output = [tuple(c[i] for c in components) for i in range(4)]

  dataset = dataset_ops.Dataset.from_tensor_slices(components)

  batched = dataset.batch(2)
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(batched))

  unbatched = batched.unbatch()
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(unbatched))

  self.assertDatasetProduces(unbatched, expected_output)
def testStopLessThanStartWithPositiveStep(self, output_type):
  """`Dataset.range` with stop < start and a positive step matches `np.arange`."""
  range_args = (10, 2, 2)  # (start, stop, step)
  dataset = dataset_ops.Dataset.range(*range_args, output_type=output_type)
  expected_output = np.arange(*range_args, dtype=output_type.as_numpy_dtype)

  self.assertDatasetProduces(dataset, expected_output=expected_output)
  self.assertEqual(output_type, dataset_ops.get_legacy_output_types(dataset))
def write(self, dataset, column_families, columns, timestamp=None):
  """Writes a dataset to the table.

  Args:
    dataset: A `tf.data.Dataset` to be written to this table. It must produce
      a list of number-of-columns+1 elements, all of which must be strings.
      The first value will be used as the row key, and subsequent values will
      be used as cell values for the corresponding columns from the
      corresponding column_families and columns entries.
    column_families: A `tf.Tensor` of `tf.string`s corresponding to the
      column families to store the dataset's elements into. Must have the
      same length as `columns`.
    columns: A `tf.Tensor` of `tf.string`s corresponding to the column names
      to store the dataset's elements into.
    timestamp: (Optional.) An int64 timestamp to write all the values at.
      Leave as None to use server-provided timestamps.

  Returns:
    A `tf.Operation` that can be run to perform the write.

  Raises:
    ValueError: If there are unexpected or incompatible types, or if the
      number of columns and column_families does not match the output of
      `dataset`.
  """
  if timestamp is None:
    timestamp = -1  # Bigtable server provided timestamp.

  # Flatten the component types a single time and reuse the list for the
  # dtype validation and for the arity validation further down.
  component_types = nest.flatten(
      dataset_ops.get_legacy_output_types(dataset))
  if any(tensor_type != dtypes.string for tensor_type in component_types):
    raise ValueError("Not all elements of the dataset were `tf.string`")

  # The deprecated `tensor_shape.scalar()` helper is replaced by the
  # explicit rank-0 shape.
  scalar_shape = tensor_shape.TensorShape([])
  component_shapes = nest.flatten(
      dataset_ops.get_legacy_output_shapes(dataset))
  if not all(s.is_compatible_with(scalar_shape) for s in component_shapes):
    raise ValueError("Not all elements of the dataset were scalars")

  if len(column_families) != len(columns):
    raise ValueError("len(column_families) != len(columns)")
  # The first component is the row key, hence the `+ 1`.
  if len(component_types) != len(columns) + 1:
    raise ValueError("A column name must be specified for every component of "
                     "the dataset elements. (e.g.: len(columns) != "
                     "len(dataset.output_types))")

  return gen_bigtable_ops.dataset_to_bigtable(
      self._resource,
      dataset._variant_tensor,  # pylint: disable=protected-access
      column_families,
      columns,
      timestamp)
def testKinesisDatasetTwoShards(self):
  """Reads ten records spread over a two-shard Kinesis stream.

  The stream is created at the start and always deleted at the end, even
  when an assertion or a Kinesis call fails part-way, so a failed run does
  not leave the fixed-name stream behind for the next run.
  """
  client = boto3.client('kinesis', region_name='us-east-1')

  # Setup the Kinesis with 2 shards.
  stream_name = "tf_kinesis_test_2"
  client.create_stream(StreamName=stream_name, ShardCount=2)
  try:
    # Wait until stream exists, default is 10 * 18 seconds.
    client.get_waiter('stream_exists').wait(StreamName=stream_name)
    for i in range(10):
      payload = "D" + str(i)
      client.put_record(
          StreamName=stream_name,
          Data=payload,
          PartitionKey="TensorFlow" + str(i))
    response = client.describe_stream(StreamName=stream_name)
    shard_id_0 = response["StreamDescription"]["Shards"][0]["ShardId"]
    shard_id_1 = response["StreamDescription"]["Shards"][1]["ShardId"]

    stream = array_ops.placeholder(dtypes.string, shape=[])
    shard = array_ops.placeholder(dtypes.string, shape=[])
    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    repeat_dataset = kinesis_dataset_ops.KinesisDataset(
        stream, shard, read_indefinitely=False).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = iterator_ops.Iterator.from_structure(
        dataset_ops.get_legacy_output_types(batch_dataset))
    init_op = iterator.make_initializer(repeat_dataset)
    # Built but never run here; kept so the graph matches the original test.
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    data = []
    with self.cached_session() as sess:
      for shard_id in (shard_id_0, shard_id_1):
        # Basic test: read from one shard of stream 2.
        sess.run(
            init_op,
            feed_dict={
                stream: stream_name,
                shard: shard_id,
                num_epochs: 1
            })
        with self.assertRaises(errors.OutOfRangeError):
          # Use range(11) to guarantee the OutOfRangeError.
          for _ in range(11):
            data.append(sess.run(get_next))

    data.sort()
    self.assertEqual(data, ["D" + str(i) for i in range(10)])
  finally:
    client.delete_stream(StreamName=stream_name)
    # Wait until stream deleted, default is 10 * 18 seconds.
    client.get_waiter('stream_not_exists').wait(StreamName=stream_name)
def create_tf_dataset(file_pattern,
                      spec,
                      num_epochs=1,
                      shuffle=False,
                      shuffle_seed=None,
                      shuffle_buffer_size=None,
                      reader_num_threads=None,
                      parser_num_threads=None,
                      prefetch_buffer_size=None):
  """Builds a parsed, prefetched dataset from files matching `file_pattern`.

  Args:
    file_pattern: Pattern passed to `Dataset.list_files`.
    spec: Feature spec passed to `tf.io.parse_example`.
    num_epochs: Repeat count; the dataset is only repeated when this is not 1.
    shuffle: Whether to shuffle the file list and the records.
    shuffle_seed: Seed used for both file-list and record shuffling.
    shuffle_buffer_size: Buffer size passed to `Dataset.shuffle`.
    reader_num_threads: Reader parallelism; defaults to 1. `AUTOTUNE` selects
      `Dataset.interleave`, anything else `ParallelInterleaveDataset`.
    parser_num_threads: Defaults to 2; not otherwise used in this function.
    prefetch_buffer_size: Prefetch buffer size; defaults to `AUTOTUNE`.

  Returns:
    The dataset after parsing, `_split_inputs_labels` and prefetching.
  """
  reader = _gzip_reader_fn

  reader_num_threads = 1 if reader_num_threads is None else reader_num_threads
  parser_num_threads = 2 if parser_num_threads is None else parser_num_threads
  if prefetch_buffer_size is None:
    prefetch_buffer_size = dataset_ops.AUTOTUNE

  # Create dataset of all matching filenames
  dataset = dataset_ops.Dataset.list_files(
      file_pattern=file_pattern, shuffle=shuffle, seed=shuffle_seed)

  if reader_num_threads != dataset_ops.AUTOTUNE:

    def apply_fn(dataset):
      return core_readers.ParallelInterleaveDataset(
          dataset,
          lambda filename: reader(filename),
          cycle_length=reader_num_threads,
          block_length=1,
          sloppy=True,
          buffer_output_elements=None,
          prefetch_input_elements=None)

    dataset = dataset.apply(apply_fn)
  else:
    dataset = dataset.interleave(
        lambda filename: reader(filename),
        num_parallel_calls=reader_num_threads)
    options = dataset_ops.Options()
    options.experimental_deterministic = True
    dataset = dataset.with_options(options)

  # When the reader yields (string, string) pairs, keep only the second
  # component.
  pair_of_strings = (dtypes.string, dtypes.string)
  if dataset_ops.get_legacy_output_types(dataset) == pair_of_strings:
    dataset = dataset_ops.MapDataset(
        dataset, lambda _, v: v, use_inter_op_parallelism=False)

  if shuffle:
    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
  if num_epochs != 1:
    dataset = dataset.repeat(num_epochs)

  dataset = dataset.map(lambda x: tf.io.parse_example(x, spec))
  dataset = dataset.map(_split_inputs_labels)
  return dataset.prefetch(prefetch_buffer_size)
def testKinesisDatasetTwoShards(self):
  """Reads ten records spread over a two-shard Kinesis stream.

  Stream deletion runs in a `finally` clause, so a failing assertion or
  Kinesis call does not leak the fixed-name stream into later runs.
  """
  client = boto3.client('kinesis', region_name='us-east-1')

  # Setup the Kinesis with 2 shards.
  stream_name = "tf_kinesis_test_2"
  client.create_stream(StreamName=stream_name, ShardCount=2)
  try:
    # Wait until stream exists, default is 10 * 18 seconds.
    client.get_waiter('stream_exists').wait(StreamName=stream_name)
    for i in range(10):
      record = "D" + str(i)
      client.put_record(
          StreamName=stream_name,
          Data=record,
          PartitionKey="TensorFlow" + str(i))
    response = client.describe_stream(StreamName=stream_name)
    shards = response["StreamDescription"]["Shards"]
    shard_id_0 = shards[0]["ShardId"]
    shard_id_1 = shards[1]["ShardId"]

    stream = array_ops.placeholder(dtypes.string, shape=[])
    shard = array_ops.placeholder(dtypes.string, shape=[])
    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    repeat_dataset = kinesis_dataset_ops.KinesisDataset(
        stream, shard, read_indefinitely=False).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = iterator_ops.Iterator.from_structure(
        dataset_ops.get_legacy_output_types(batch_dataset))
    init_op = iterator.make_initializer(repeat_dataset)
    # Built but never run here; kept so the graph matches the original test.
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    data = []
    with self.cached_session() as sess:
      # Basic test: read from shard 0 of stream 2.
      sess.run(
          init_op,
          feed_dict={
              stream: stream_name,
              shard: shard_id_0,
              num_epochs: 1
          })
      with self.assertRaises(errors.OutOfRangeError):
        # Use range(11) to guarantee the OutOfRangeError.
        for _ in range(11):
          data.append(sess.run(get_next))

      # Basic test: read from shard 1 of stream 2.
      sess.run(
          init_op,
          feed_dict={
              stream: stream_name,
              shard: shard_id_1,
              num_epochs: 1
          })
      with self.assertRaises(errors.OutOfRangeError):
        # Use range(11) to guarantee the OutOfRangeError.
        for _ in range(11):
          data.append(sess.run(get_next))

    data.sort()
    self.assertEqual(data, ["D" + str(i) for i in range(10)])
  finally:
    client.delete_stream(StreamName=stream_name)
    # Wait until stream deleted, default is 10 * 18 seconds.
    client.get_waiter('stream_not_exists').wait(StreamName=stream_name)
def testIteratorStringHandle(self):
  """A feedable iterator follows whichever string handle is fed to it."""
  dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
  dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])

  iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
  iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)

  handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
  feedable_iterator = iterator_ops.Iterator.from_string_handle(
      handle_placeholder, dataset_ops.get_legacy_output_types(dataset_3),
      dataset_ops.get_legacy_output_shapes(dataset_3))
  next_element = feedable_iterator.get_next()

  feedable_structure_source = feedable_iterator
  for source_dataset in (dataset_3, dataset_4):
    self.assertTrue(
        dataset_ops.get_structure(source_dataset).is_compatible_with(
            dataset_ops.get_structure(feedable_structure_source)))

  with self.cached_session() as sess:
    iterator_3_handle = sess.run(iterator_3.string_handle())
    iterator_4_handle = sess.run(iterator_4.string_handle())

    # Alternate between the two underlying iterators; each keeps its own
    # position.
    expected_reads = [
        (10, iterator_4_handle),
        (1, iterator_3_handle),
        (20, iterator_4_handle),
        (2, iterator_3_handle),
        (30, iterator_4_handle),
        (3, iterator_3_handle),
        (40, iterator_4_handle),
    ]
    for expected_value, handle in expected_reads:
      self.assertEqual(
          expected_value,
          sess.run(next_element, feed_dict={handle_placeholder: handle}))

    # Both iterators are now exhausted.
    for handle in (iterator_3_handle, iterator_4_handle):
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(next_element, feed_dict={handle_placeholder: handle})
def __init__(self,
             filenames,
             compression_type=None,
             buffer_size=None,
             num_parallel_reads=None):
  """Creates a `TFRecordDataset` to read one or more TFRecord files.

  NOTE: The `num_parallel_reads` argument can be used to improve performance
  when reading from a remote filesystem.

  Args:
    filenames: A `tf.string` tensor or `tf.data.Dataset` containing one or
      more filenames.
    compression_type: (Optional.) A `tf.string` scalar evaluating to one of
      `""` (no compression), `"ZLIB"`, or `"GZIP"`.
    buffer_size: (Optional.) A `tf.int64` scalar representing the number of
      bytes in the read buffer. 0 means no buffering.
    num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
      number of files to read in parallel. Defaults to reading files
      sequentially.

  Raises:
    TypeError: If any argument does not have the expected type.
    ValueError: If any argument does not have the expected shape.
  """
  if isinstance(filenames, dataset_ops.DatasetV2):
    if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
    # `tensor_shape.scalar()` is a deprecated helper; spell the scalar shape
    # explicitly instead.
    scalar_shape = tensor_shape.TensorShape([])
    element_shape = dataset_ops.get_legacy_output_shapes(filenames)
    if not element_shape.is_compatible_with(scalar_shape):
      raise ValueError(
          "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
          "elements.")
  else:
    # Non-dataset input: convert to a string tensor, flatten, then slice
    # into one element per filename.
    filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
    filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
    filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)

  self._filenames = filenames
  self._compression_type = compression_type
  self._buffer_size = buffer_size
  self._num_parallel_reads = num_parallel_reads

  def read_one_file(filename):
    return _TFRecordDataset(filename, compression_type, buffer_size)

  if num_parallel_reads is None:
    # Sequential: read the files one after another.
    self._impl = filenames.flat_map(read_one_file)
  else:
    self._impl = filenames.interleave(
        read_one_file,
        cycle_length=num_parallel_reads,
        num_parallel_calls=num_parallel_reads)
  variant_tensor = self._impl._variant_tensor  # pylint: disable=protected-access
  super(TFRecordDatasetV2, self).__init__(variant_tensor)
def batch_init_fn(_):
  """Returns an empty `SparseTensor` with dense shape `[0] + padded_shape`.

  The single argument is ignored. `padded_shape` and `dataset` are taken
  from the enclosing scope.
  """
  # Indices have shape [0, rank(padded_shape) + 1]: no entries, one extra
  # column for the leading dimension.
  index_width = array_ops.size(padded_shape) + 1
  indices_shape = array_ops.concat([[0], [index_width]], 0)

  empty_indices = gen_array_ops.empty(indices_shape, dtype=dtypes.int64)
  empty_values = constant_op.constant(
      [], shape=[0], dtype=dataset_ops.get_legacy_output_types(dataset))
  initial_dense_shape = array_ops.concat(
      [np.array([0], dtype=np.int64), padded_shape], 0)

  return sparse_tensor.SparseTensor(
      indices=empty_indices,
      values=empty_values,
      dense_shape=initial_dense_shape)
def batch_init_fn(_):
  """Builds the zero-entry `SparseTensor` used as the initial batch value.

  The argument is unused. `padded_shape` and `dataset` come from the
  enclosing scope; the result's dense shape is `[0]` followed by
  `padded_shape`.
  """
  # No entries yet, and each index row has rank(padded_shape) + 1 columns.
  num_index_columns = array_ops.size(padded_shape) + 1
  indices_shape = array_ops.concat([[0], [num_index_columns]], 0)

  indices = gen_array_ops.empty(indices_shape, dtype=dtypes.int64)
  values = constant_op.constant(
      [], shape=[0], dtype=dataset_ops.get_legacy_output_types(dataset))
  dense_shape = array_ops.concat(
      [np.array([0], dtype=np.int64), padded_shape], 0)

  return sparse_tensor.SparseTensor(
      indices=indices, values=values, dense_shape=dense_shape)
def testFromTensorSlicesWithDict(self):
  """Slicing a dict of lists yields per-key types, shapes and values."""
  components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
  dataset = dataset_ops.Dataset.from_tensor_slices(components)
  get_next = self.getNext(dataset)

  output_types = dataset_ops.get_legacy_output_types(dataset)
  self.assertEqual(dtypes.int32, output_types["foo"])
  self.assertEqual(dtypes.float32, output_types["bar"])

  output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  self.assertEqual((), output_shapes["foo"])
  self.assertEqual((1,), output_shapes["bar"])

  for expected_foo, expected_bar in zip(components["foo"],
                                        components["bar"]):
    results = self.evaluate(get_next())
    self.assertEqual(expected_foo, results["foo"])
    self.assertEqual(expected_bar, results["bar"])

  # All three slices have been consumed.
  with self.assertRaises(errors.OutOfRangeError):
    self.evaluate(get_next())
def testDictInputs(self):
  """`from_list` on a list of dicts keeps per-key types, shapes and values."""
  first = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
  second = {"foo": [4, 5, 6], "bar": [[7.0], [8.0], [9.0]]}
  elements = [first, second]
  dataset = from_list.from_list(elements)

  output_types = dataset_ops.get_legacy_output_types(dataset)
  self.assertEqual(dtypes.int32, output_types["foo"])
  self.assertEqual(dtypes.float32, output_types["bar"])

  output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  self.assertEqual((3,), output_shapes["foo"])
  self.assertEqual((3, 1), output_shapes["bar"])

  self.assertDatasetProduces(dataset, expected_output=elements)
def _apply_fn(dataset):
  """Map `_check_shape` over `dataset` and re-wrap it with merged shapes.

  The new static shapes come from merging the dataset's current shapes with
  `expected_shapes` from the enclosing scope; types and classes are carried
  over unchanged.
  """
  current_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  output_shapes = _merge_output_shapes(current_shapes, expected_shapes)
  checked_dataset = dataset.map(_check_shape)
  # pylint: disable=protected-access
  return batching._RestructuredDataset(
      checked_dataset,
      dataset_ops.get_legacy_output_types(dataset),
      output_shapes=output_shapes,
      output_classes=dataset_ops.get_legacy_output_classes(dataset))
def testCounter(self, start, step, expected_output):
  """Test dataset construction using `count`."""
  dataset = counter.Counter(start, step)

  # A counter emits scalar int64 values.
  element_shape = dataset_ops.get_legacy_output_shapes(dataset)
  self.assertEqual([], element_shape.as_list())
  element_type = dataset_ops.get_legacy_output_types(dataset)
  self.assertEqual(dtypes.int64, element_type)

  get_next = self.getNext(dataset)
  for expected_value in expected_output:
    actual_value = self.evaluate(get_next())
    self.assertEqual(expected_value, actual_value)
def testFromTensorSlicesWithDict(self):
  """Checks types, shapes and values of a dataset sliced from a dict."""
  components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
  dataset = dataset_ops.Dataset.from_tensor_slices(components)
  get_next = self.getNext(dataset)

  types_by_key = dataset_ops.get_legacy_output_types(dataset)
  self.assertEqual(dtypes.int32, types_by_key["foo"])
  self.assertEqual(dtypes.float32, types_by_key["bar"])

  shapes_by_key = dataset_ops.get_legacy_output_shapes(dataset)
  self.assertEqual((), shapes_by_key["foo"])
  self.assertEqual((1,), shapes_by_key["bar"])

  rows = zip(components["foo"], components["bar"])
  for foo_value, bar_value in rows:
    element = self.evaluate(get_next())
    self.assertEqual(foo_value, element["foo"])
    self.assertEqual(bar_value, element["bar"])

  # One more request past the last slice must signal end of sequence.
  with self.assertRaises(errors.OutOfRangeError):
    self.evaluate(get_next())
def _apply_fn(dataset):
  """Return `dataset` mapped through `_check_shape` with merged output shapes.

  Output types and classes are taken from `dataset` as-is; output shapes are
  the result of `_merge_output_shapes` against the enclosing `expected_shapes`.
  """
  output_shapes = _merge_output_shapes(
      dataset_ops.get_legacy_output_shapes(dataset), expected_shapes)
  shape_checked = dataset.map(_check_shape)
  legacy_types = dataset_ops.get_legacy_output_types(dataset)
  legacy_classes = dataset_ops.get_legacy_output_classes(dataset)
  # pylint: disable=protected-access
  return batching._RestructuredDataset(
      shape_checked,
      legacy_types,
      output_shapes=output_shapes,
      output_classes=legacy_classes)
def testIteratorStringHandle(self):
  """A feedable iterator can alternate between two iterators via handles."""
  dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
  dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])

  iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
  iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)

  handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
  feedable_iterator = iterator_ops.Iterator.from_string_handle(
      handle_placeholder,
      dataset_ops.get_legacy_output_types(dataset_3),
      dataset_ops.get_legacy_output_shapes(dataset_3))
  next_element = feedable_iterator.get_next()

  # Both source datasets must be structurally compatible with the
  # feedable iterator.
  for source_dataset in (dataset_3, dataset_4):
    self.assertTrue(
        dataset_ops.get_structure(source_dataset).is_compatible_with(
            dataset_ops.get_structure(feedable_iterator)))

  with self.cached_session() as sess:
    iterator_3_handle = sess.run(iterator_3.string_handle())
    iterator_4_handle = sess.run(iterator_4.string_handle())

    # Interleave reads from the two underlying iterators; each keeps its
    # own position.
    expected_reads = [
        (10, iterator_4_handle),
        (1, iterator_3_handle),
        (20, iterator_4_handle),
        (2, iterator_3_handle),
        (30, iterator_4_handle),
        (3, iterator_3_handle),
        (40, iterator_4_handle),
    ]
    for expected_value, handle in expected_reads:
      self.assertEqual(
          expected_value,
          sess.run(next_element, feed_dict={handle_placeholder: handle}))

    # Both iterators are now exhausted.
    for exhausted_handle in (iterator_3_handle, iterator_4_handle):
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(
            next_element, feed_dict={handle_placeholder: exhausted_handle})
def testReinitializableIterator(self):
  """An iterator built from a structure can be re-initialized per dataset."""
  dataset_3 = dataset_ops.Dataset.from_tensors(
      constant_op.constant([1, 2, 3]))
  dataset_4 = dataset_ops.Dataset.from_tensors(
      constant_op.constant([4, 5, 6, 7]))
  iterator = iterator_ops.Iterator.from_structure(
      dataset_ops.get_legacy_output_types(dataset_3), [None])

  dataset_3_init_op = iterator.make_initializer(dataset_3)
  dataset_4_init_op = iterator.make_initializer(dataset_4)
  get_next = iterator.get_next()

  for source_dataset in (dataset_3, dataset_4):
    self.assertEqual(
        dataset_ops.get_legacy_output_types(source_dataset),
        dataset_ops.get_legacy_output_types(iterator))
  self.assertEqual(
      [None], dataset_ops.get_legacy_output_shapes(iterator).as_list())

  with self.cached_session() as sess:
    # The iterator is initially uninitialized.
    with self.assertRaises(errors.FailedPreconditionError):
      sess.run(get_next)

    # Initialize with one dataset, then with a different dataset, then
    # reinitialize with the first dataset. Each pass yields exactly one
    # element before running out.
    phases = (
        (dataset_3_init_op, [1, 2, 3]),
        (dataset_4_init_op, [4, 5, 6, 7]),
        (dataset_3_init_op, [1, 2, 3]),
    )
    for init_op, expected_element in phases:
      sess.run(init_op)
      self.assertAllEqual(expected_element, sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
def testNestedDict(self):
  """Nested dict components keep their nesting in types and shapes."""
  components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
  dataset = dataset_ops.Dataset.from_tensors(components)

  output_types = dataset_ops.get_legacy_output_types(dataset)
  self.assertEqual(dtypes.int32, output_types["a"]["aa"])
  self.assertEqual(dtypes.float32, output_types["a"]["ab"])
  self.assertEqual(dtypes.int32, output_types["b"])

  output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  self.assertEqual([], output_shapes["a"]["aa"])
  self.assertEqual([2], output_shapes["a"]["ab"])
  self.assertEqual([3], output_shapes["b"])
def testReinitializableIteratorEmptyDataset(self):
  """Initializing from an empty dataset makes the first read run out."""
  empty_dataset = dataset_ops.Dataset.range(0)
  element_types = dataset_ops.get_legacy_output_types(empty_dataset)
  iterator = iterator_ops.Iterator.from_structure(element_types, [])
  init_op = iterator.make_initializer(empty_dataset)

  with self.cached_session() as sess:
    sess.run(init_op)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(iterator.get_next())
def __init__(self, filenames, compression_type=None, buffer_size=None,
             num_parallel_reads=None):
  """Creates a `TFRecordDataset` to read one or more TFRecord files.

  Args:
    filenames: A `tf.string` tensor or `tf.data.Dataset` containing one or
      more filenames.
    compression_type: (Optional.) A `tf.string` scalar evaluating to one of
      `""` (no compression), `"ZLIB"`, or `"GZIP"`.
    buffer_size: (Optional.) A `tf.int64` scalar representing the number of
      bytes in the read buffer. If your input pipeline is I/O bottlenecked,
      consider setting this parameter to a value 1-100 MBs. If `None`, a
      sensible default for both local and remote file systems is used.
    num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
      number of files to read in parallel. If greater than one, the records
      of files read in parallel are outputted in an interleaved order. If
      your input pipeline is I/O bottlenecked, consider setting this
      parameter to a value greater than one to parallelize the I/O. If
      `None`, files will be read sequentially.

  Raises:
    TypeError: If any argument does not have the expected type.
    ValueError: If any argument does not have the expected shape.
  """
  if not isinstance(filenames, dataset_ops.DatasetV2):
    # Tensor-like input: flatten it to a vector and slice it into a dataset
    # of individual filenames.
    filenames_tensor = ops.convert_to_tensor(filenames, dtype=dtypes.string)
    flat_filenames = array_ops.reshape(
        filenames_tensor, [-1], name="flat_filenames")
    filenames = dataset_ops.Dataset.from_tensor_slices(flat_filenames)
  else:
    # Dataset input: it must already produce scalar strings.
    if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
    filename_shapes = dataset_ops.get_legacy_output_shapes(filenames)
    if not filename_shapes.is_compatible_with(tensor_shape.scalar()):
      raise ValueError(
          "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
          "elements.")

  self._filenames = filenames
  self._compression_type = compression_type
  self._buffer_size = buffer_size
  self._num_parallel_reads = num_parallel_reads

  def read_one_file(filename):
    return _TFRecordDataset(filename, compression_type, buffer_size)

  if num_parallel_reads is not None:
    self._impl = ParallelInterleaveDataset(
        filenames,
        read_one_file,
        cycle_length=num_parallel_reads,
        block_length=1,
        sloppy=False,
        buffer_output_elements=None,
        prefetch_input_elements=None)
  else:
    self._impl = filenames.flat_map(read_one_file)

  variant_tensor = self._impl._variant_tensor  # pylint: disable=protected-access
  super(TFRecordDatasetV2, self).__init__(variant_tensor)
def __init__(self, selector_input, data_inputs, stop_on_empty_dataset=False):
  """Builds a directed-interleave dataset over `data_inputs`.

  Args:
    selector_input: Dataset whose variant tensor is passed to the op as the
      selector (presumably yields indices into `data_inputs` -- confirm
      against the op definition).
    data_inputs: Iterable of datasets; all must share the same legacy output
      types and classes. It is materialized into a list once.
    stop_on_empty_dataset: Forwarded to the op as `stop_on_empty_dataset`
      when set or when the forward-compatibility window has passed.

  Raises:
    TypeError: If any dataset's legacy output types or classes differ from
      those of the first dataset.
  """
  self._selector_input = selector_input
  self._data_inputs = list(data_inputs)
  self._stop_on_empty_dataset = stop_on_empty_dataset

  # Always index the materialized list: the raw `data_inputs` argument may be
  # a non-subscriptable iterable that `list()` has already consumed.
  first_input = self._data_inputs[0]
  first_output_types = dataset_ops.get_legacy_output_types(first_input)
  first_output_classes = dataset_ops.get_legacy_output_classes(first_input)

  for i, data_input in enumerate(self._data_inputs[1:]):
    output_types = dataset_ops.get_legacy_output_types(data_input)
    output_classes = dataset_ops.get_legacy_output_classes(data_input)
    if (output_types != first_output_types or
        output_classes != first_output_classes):
      raise TypeError("All datasets must have the same type and class.\n"
                      "dataset 0 vs dataset %s types: %s ; %s\n"
                      "classes: %s ; %s" %
                      (i + 1, first_output_types, output_types,
                       first_output_classes, output_classes))

  # Fold every input's shapes into the most specific shape compatible with
  # all of them, component by component.
  output_shapes = dataset_ops.get_legacy_output_shapes(self._data_inputs[0])
  for data_input in self._data_inputs[1:]:
    output_shapes = nest.pack_sequence_as(output_shapes, [
        ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
            nest.flatten(output_shapes),
            nest.flatten(dataset_ops.get_legacy_output_shapes(data_input)))
    ])
  self._element_spec = structure.convert_legacy_structure(
      first_output_types, output_shapes, first_output_classes)

  # Only pass the attribute when it is needed or known to be supported.
  compat_kwargs = {}
  if compat.forward_compatible(2021, 5, 14) or self._stop_on_empty_dataset:
    compat_kwargs["stop_on_empty_dataset"] = self._stop_on_empty_dataset

  # pylint: disable=protected-access
  variant_tensor = (
      gen_experimental_dataset_ops.directed_interleave_dataset(
          self._selector_input._variant_tensor,
          [data_input._variant_tensor for data_input in self._data_inputs],
          **compat_kwargs,
          **self._flat_structure))

  super(_DirectedInterleaveDataset, self).__init__(variant_tensor)
def testUintInputs(self):
  """`from_list` preserves unsigned integer dtypes of every width."""
  uint8_component = np.tile(np.array([[0], [1]], dtype=np.uint8), 2)
  uint16_component = np.tile(np.array([[2], [256]], dtype=np.uint16), 2)
  uint32_component = np.tile(np.array([[4], [65536]], dtype=np.uint32), 2)
  uint64_component = np.tile(
      np.array([[8], [4294967296]], dtype=np.uint64), 2)
  elements = [(uint8_component, uint16_component, uint32_component,
               uint64_component)]

  dataset = from_list.from_list(elements)

  expected_types = (dtypes.uint8, dtypes.uint16, dtypes.uint32,
                    dtypes.uint64)
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(dataset))
  self.assertDatasetProduces(dataset, elements)
def print_info_data(dataset, print_example=True, n_example=3):
    """Print structure, shape and sample information about a dataset.

    The dataset is fully materialized through `as_numpy_iterator()` into a
    NumPy array, so every element is read once.

    Args:
        dataset: Dataset object exposing `as_numpy_iterator()`.
        print_example: When true, pretty-print the first elements.
        n_example: Maximum number of elements to pretty-print.
    """

    def _row(cells, width):
        # One bracketed row of left-aligned, fixed-width cells; works for any
        # number of cells instead of exactly three.
        return ' [{}]'.format(
            ' / '.join('{:{w}}'.format(cell, w=width) for cell in cells))

    print('# Structure of the data:\n\n {}'.format(dataset))
    print('\n# Output shape of one entry:\n {}'.format(
        dataset_ops.get_legacy_output_shapes(dataset)))
    print('\n# Output types of one entry:\n {}'.format(
        dataset_ops.get_legacy_output_types(dataset)))
    # This line reports the output classes (it was previously mislabelled).
    print('\n# Output classes of one entry:\n {}'.format(
        dataset_ops.get_legacy_output_classes(dataset)))
    print(' \n')

    np_array = np.array(list(dataset.as_numpy_iterator()))
    print('# Shape of the data:\n\n {}'.format(np.shape(np_array)))

    if len(np_array) > 0:
        first = np_array[0]
        if isinstance(first, dict):
            # Elements are plain feature dicts.
            keys = list(first.keys())
            print(' ---> {} entries'.format(np.shape(np_array)[0]))
            print(' ---> {} dim'.format(np_array.ndim))
            print(' dict structure')
            print(' dim: {}'.format(len(keys)))
            print(_row(keys, 9))
            print(_row([str(np.shape(first.get(k))) for k in keys], 9))
            print(_row([type(first.get(k)).__name__ for k in keys], 9))
        elif isinstance(first, np.ndarray):
            if isinstance(first[0], dict):
                # Elements are (feature dict, label) pairs.
                features = first[0]
                keys = list(features.keys())
                print(' ---> {} batches'.format(np.shape(np_array)[0]))
                print(' ---> {} dim'.format(np_array.ndim))
                print(' label')
                print(' shape: {}'.format(first[1].shape))
                print(' dict structure')
                print(' dim: {}'.format(len(keys)))
                print(_row(keys, 15))
                print(_row([str(features.get(k).shape) for k in keys], 15))
                print(_row(
                    [type(features.get(k)).__name__ for k in keys], 15))
            else:
                # Elements are (text, label) pairs of arrays.
                print(' ---> {} entries'.format(np.shape(np_array)[0]))
                print(' ---> {} dim'.format(np_array.ndim))
                print(' [{:15} / {:15} ]'.format('text', 'label'))
                print(' [{:15} / {:15} ]'.format(
                    str(first[0].shape), str(first[1].shape)))
                print(' [{:15} / {:15} ]'.format(
                    str(first[0].dtype), str(first[1].dtype)))

    if print_example:
        print('\n\n# Examples of data:')
        # Slice so that at most `n_example` elements are shown (the previous
        # loop printed one extra element before stopping).
        for ex in np_array[:n_example]:
            print('{}'.format(pprint.pformat(ex)))
def testUnbatchScalarDataset(self):
  """Batching then unbatching a tuple of scalars restores the elements."""
  data = tuple([math_ops.range(10) for _ in range(3)])
  data = dataset_ops.Dataset.from_tensor_slices(data)
  expected_types = (dtypes.int32,) * 3

  # Neither batching nor unbatching changes the component types.
  data = data.batch(2)
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(data))
  data = data.apply(batching.unbatch())
  self.assertEqual(expected_types,
                   dataset_ops.get_legacy_output_types(data))

  # Use the module-level helper, as the other tests do, instead of the
  # deprecated `Dataset.make_one_shot_iterator()` method.
  iterator = dataset_ops.make_one_shot_iterator(data)
  op = iterator.get_next()

  with self.cached_session() as sess:
    for i in range(10):
      self.assertEqual((i,) * 3, sess.run(op))

    with self.assertRaises(errors.OutOfRangeError):
      sess.run(op)
def __init__(self, selector_input, data_inputs):
  """Builds a directed-interleave dataset over `data_inputs`.

  Args:
    selector_input: Dataset whose variant tensor is passed to the op as the
      selector (presumably yields indices into `data_inputs` -- confirm
      against the op definition).
    data_inputs: Iterable of datasets; all must share the same legacy output
      types and classes. It is materialized into a list once.

  Raises:
    TypeError: If any dataset's legacy output types or classes differ from
      those of the first dataset.
  """
  self._selector_input = selector_input
  self._data_inputs = list(data_inputs)

  # Always index the materialized list: the raw `data_inputs` argument may be
  # a non-subscriptable iterable that `list()` has already consumed.
  first_input = self._data_inputs[0]
  first_output_types = dataset_ops.get_legacy_output_types(first_input)
  first_output_classes = dataset_ops.get_legacy_output_classes(first_input)

  for i, data_input in enumerate(self._data_inputs[1:]):
    output_types = dataset_ops.get_legacy_output_types(data_input)
    output_classes = dataset_ops.get_legacy_output_classes(data_input)
    if (output_types != first_output_types or
        output_classes != first_output_classes):
      # Plain f-strings only; the message text is unchanged.
      raise TypeError(
          f"All datasets must have the same type and class.\n"
          f"dataset 0 types vs dataset {i+1} types: "
          f"{first_output_types}; {output_types}\n"
          f"dataset 0 classes vs dataset {i+1} classes: "
          f"{first_output_classes}; {output_classes}")

  # Fold every input's shapes into the most specific shape compatible with
  # all of them, component by component.
  output_shapes = dataset_ops.get_legacy_output_shapes(self._data_inputs[0])
  for data_input in self._data_inputs[1:]:
    output_shapes = nest.pack_sequence_as(output_shapes, [
        ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
            nest.flatten(output_shapes),
            nest.flatten(dataset_ops.get_legacy_output_shapes(data_input)))
    ])
  self._element_spec = structure.convert_legacy_structure(
      first_output_types, output_shapes, first_output_classes)

  # pylint: disable=protected-access
  variant_tensor = gen_experimental_dataset_ops.directed_interleave_dataset(
      self._selector_input._variant_tensor,
      [data_input._variant_tensor for data_input in self._data_inputs],
      **self._flat_structure)
  super(_DirectedInterleaveDataset, self).__init__(variant_tensor)
def __init__(self, filenames, compression_type=None, buffer_size=None,
             num_parallel_reads=None):
  """Creates a `TFRecordDataset` to read one or more TFRecord files.

  NOTE: The `num_parallel_reads` argument can be used to improve performance
  when reading from a remote filesystem.

  Args:
    filenames: A `tf.string` tensor or `tf.data.Dataset` containing one or
      more filenames.
    compression_type: (Optional.) A `tf.string` scalar evaluating to one of
      `""` (no compression), `"ZLIB"`, or `"GZIP"`.
    buffer_size: (Optional.) A `tf.int64` scalar representing the number of
      bytes in the read buffer. 0 means no buffering.
    num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
      number of files to read in parallel. Defaults to reading files
      sequentially.

  Raises:
    TypeError: If any argument does not have the expected type.
    ValueError: If any argument does not have the expected shape.
  """
  given_as_dataset = isinstance(filenames, dataset_ops.DatasetV2)
  if given_as_dataset:
    # A dataset of filenames must produce scalar strings.
    element_type = dataset_ops.get_legacy_output_types(filenames)
    if element_type != dtypes.string:
      raise TypeError(
          "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
    element_shape = dataset_ops.get_legacy_output_shapes(filenames)
    if not element_shape.is_compatible_with(tensor_shape.scalar()):
      raise ValueError(
          "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
          "elements.")
  else:
    # Anything else is converted to a string tensor, flattened, and sliced
    # into one element per filename.
    as_tensor = ops.convert_to_tensor(filenames, dtype=dtypes.string)
    as_vector = array_ops.reshape(as_tensor, [-1], name="flat_filenames")
    filenames = dataset_ops.DatasetV2.from_tensor_slices(as_vector)

  self._filenames = filenames
  self._compression_type = compression_type
  self._buffer_size = buffer_size
  self._num_parallel_reads = num_parallel_reads

  def read_one_file(filename):
    return _TFRecordDataset(filename, compression_type, buffer_size)

  if num_parallel_reads is not None:
    self._impl = ParallelInterleaveDataset(
        filenames,
        read_one_file,
        cycle_length=num_parallel_reads,
        block_length=1,
        sloppy=False,
        buffer_output_elements=None,
        prefetch_input_elements=None)
  else:
    self._impl = filenames.flat_map(read_one_file)

  variant_tensor = self._impl._variant_tensor  # pylint: disable=protected-access
  super(TFRecordDatasetV2, self).__init__(variant_tensor)
def __init__(self, input_dataset):
  """See `unique()` for details."""
  self._input_dataset = input_dataset

  # Only a single int32, int64 or string component is accepted.
  supported_types = (dtypes.int32, dtypes.int64, dtypes.string)
  input_types = dataset_ops.get_legacy_output_types(input_dataset)
  if input_types not in supported_types:
    raise TypeError(
        "`tf.data.experimental.unique()` only supports inputs with a single "
        "`tf.int32`, `tf.int64`, or `tf.string` component.")

  input_variant = self._input_dataset._variant_tensor  # pylint: disable=protected-access
  variant_tensor = gen_experimental_dataset_ops.experimental_unique_dataset(
      input_variant, **dataset_ops.flat_structure(self))
  super(_UniqueDataset, self).__init__(input_dataset, variant_tensor)
def __init__(self, input_dataset, batch_size, row_shape):
  """See `Dataset.dense_to_sparse_batch()` for more details."""
  input_types = dataset_ops.get_legacy_output_types(input_dataset)
  # A bare `DType` (rather than a nested structure) means one component.
  if not isinstance(input_types, dtypes.DType):
    raise TypeError("DenseToSparseDataset requires an input whose elements "
                    "have a single component, whereas the input has %r." %
                    dataset_ops.get_legacy_output_types(input_dataset))

  self._input_dataset = input_dataset
  self._batch_size = batch_size
  self._row_shape = row_shape

  # The batched element shape is an unknown batch dimension followed by
  # `row_shape`.
  batched_shape = tensor_shape.vector(None).concatenate(self._row_shape)
  self._structure = structure.SparseTensorStructure(
      dataset_ops.get_legacy_output_types(input_dataset), batched_shape)

  input_variant = self._input_dataset._variant_tensor  # pylint: disable=protected-access
  variant_tensor = ged_ops.experimental_dense_to_sparse_batch_dataset(
      input_variant,
      self._batch_size,
      row_shape=convert.partial_shape_to_tensor(self._row_shape),
      **dataset_ops.flat_structure(self))
  super(_DenseToSparseBatchDataset, self).__init__(input_dataset,
                                                   variant_tensor)
def _next_func(string_handle):
  """Calls get_next for created iterator.

  Args:
    string_handle: An iterator string handle created by _init_func

  Returns:
    The elements generated from `input_dataset`
  """
  legacy_types = dataset_ops.get_legacy_output_types(self)
  legacy_shapes = dataset_ops.get_legacy_output_shapes(self)
  legacy_classes = dataset_ops.get_legacy_output_classes(self)
  # Rebuild the iterator from its handle under the source device scope.
  with ops.device(self._source_device_string):
    iterator = iterator_ops.Iterator.from_string_handle(
        string_handle, legacy_types, legacy_shapes, legacy_classes)
  next_element = iterator.get_next()
  return self._element_structure._to_tensor_list(next_element)  # pylint: disable=protected-access
def __init__(self, selector_input, data_inputs):
  """Validates `data_inputs` and derives the combined element structure.

  Args:
    selector_input: Stored as `self._selector_input`; not otherwise used
      here.
    data_inputs: Iterable of datasets; all must share the same legacy output
      types and classes. It is materialized into a list once.

  Raises:
    TypeError: If any dataset's legacy output types or classes differ from
      those of the first dataset.
  """
  self._selector_input = selector_input
  self._data_inputs = list(data_inputs)

  # Always index the materialized list: the raw `data_inputs` argument may be
  # a non-subscriptable iterable that `list()` has already consumed.
  first_input = self._data_inputs[0]
  first_output_types = dataset_ops.get_legacy_output_types(first_input)
  first_output_classes = dataset_ops.get_legacy_output_classes(first_input)

  for data_input in self._data_inputs[1:]:
    if (dataset_ops.get_legacy_output_types(data_input) != first_output_types
        or dataset_ops.get_legacy_output_classes(data_input)
        != first_output_classes):
      raise TypeError("All datasets must have the same type and class.")

  # Fold every input's shapes into the most specific shape compatible with
  # all of them, component by component.
  output_shapes = dataset_ops.get_legacy_output_shapes(self._data_inputs[0])
  for data_input in self._data_inputs[1:]:
    output_shapes = nest.pack_sequence_as(output_shapes, [
        ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
            nest.flatten(output_shapes),
            nest.flatten(dataset_ops.get_legacy_output_shapes(data_input)))
    ])

  self._structure = structure.convert_legacy_structure(
      first_output_types, output_shapes, first_output_classes)
  super(_DirectedInterleaveDataset, self).__init__()
def testEnumerate(self):
  """`enumerate(start)` pairs each element with an int64 counter."""
  components = (["a", "b"], [1, 2], [37.0, 38])
  start = constant_op.constant(20, dtype=dtypes.int64)

  sliced = dataset_ops.Dataset.from_tensor_slices(components)
  dataset = sliced.enumerate(start)

  # The counter is the first component: a scalar int64.
  counter_type = dataset_ops.get_legacy_output_types(dataset)[0]
  self.assertEqual(dtypes.int64, counter_type)
  dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
  self.assertEqual((), dataset_output_shapes[0])
  # The second component holds the three original scalars.
  self.assertEqual([tensor_shape.TensorShape([])] * 3,
                   list(dataset_output_shapes[1]))

  expected_output = [(20, (b"a", 1, 37.0)), (21, (b"b", 2, 38.0))]
  self.assertDatasetProduces(dataset, expected_output)
def testCounter(self):
  """Test dataset construction using `count`."""
  start, step = 3, 4
  dataset = counter.Counter(start=start, step=step)
  self.assertEqual(
      [], dataset_ops.get_legacy_output_shapes(dataset).as_list())
  self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(dataset))
  get_next = self.getNext(dataset)

  negative_dataset = counter.Counter(start=0, step=-1)
  negative_get_next = self.getNext(negative_dataset)

  # Ascending counter: 3, 7, 11.
  for k in range(3):
    self.assertEqual(start + k * step, self.evaluate(get_next()))
  # Descending counter: 0, -1, -2.
  for expected_value in (0, -1, -2):
    self.assertEqual(expected_value, self.evaluate(negative_get_next()))
def testIteratorStructure(self, tf_value_fn, expected_element_structure,
                          expected_output_classes, expected_output_types,
                          expected_output_shapes):
  """Checks the structure and legacy metadata of a one-shot iterator."""
  tf_value = tf_value_fn()
  source_dataset = dataset_ops.Dataset.from_tensors(tf_value)
  iterator = dataset_ops.make_one_shot_iterator(source_dataset)

  # Compatibility must hold in both directions.
  self.assertTrue(
      expected_element_structure.is_compatible_with(
          iterator._element_structure))
  self.assertTrue(
      iterator._element_structure.is_compatible_with(
          expected_element_structure))

  actual_classes = dataset_ops.get_legacy_output_classes(iterator)
  self.assertEqual(expected_output_classes, actual_classes)
  actual_types = dataset_ops.get_legacy_output_types(iterator)
  self.assertEqual(expected_output_types, actual_types)
  actual_shapes = dataset_ops.get_legacy_output_shapes(iterator)
  self.assertEqual(expected_output_shapes, actual_shapes)
def MapFn(unused_input):
  """Issue a remote call to `LoadingFunc` and return its output(s).

  The argument is ignored. A single output is returned unwrapped; several
  outputs are returned as given by `remote_call`.

  Raises:
    ValueError: If the source dataset's output types are neither a single
      `DType` nor a list/tuple.
  """
  source_types = dataset_ops.get_legacy_output_types(source_dataset)
  # `Tout` must be a sequence, so wrap a single dtype in a list.
  if isinstance(source_types, dtypes.DType):
    output_types = [source_types]
  elif isinstance(source_types, (list, tuple)):
    output_types = source_types
  else:
    raise ValueError('source dataset has invalid output types')

  target_device = '/job:%s/replica:0/task:0/cpu:0' % file_reader_job
  remote_calls = functional_ops.remote_call(
      args=[source_handle],
      Tout=output_types,
      f=LoadingFunc,
      target=target_device)
  return remote_calls[0] if len(remote_calls) == 1 else remote_calls
def testRemoteIteratorWithoutRemoteCallFail(self):
  """Reading a handle from another device directly is rejected."""
  worker_config = config_pb2.ConfigProto()
  worker_config.device_count["CPU"] = 2
  worker, _ = test_util.create_local_cluster(
      1, 1, worker_config=worker_config)

  # The source iterator and its handle are created on cpu:1.
  with ops.device("/job:worker/replica:0/task:0/cpu:1"):
    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
    iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
    iterator_3_handle = iterator_3.string_handle()

  # The handle is then consumed on cpu:0 without a remote call.
  with ops.device("/job:worker/replica:0/task:0/cpu:0"):
    element_types = dataset_ops.get_legacy_output_types(dataset_3)
    element_shapes = dataset_ops.get_legacy_output_shapes(dataset_3)
    remote_it = iterator_ops.Iterator.from_string_handle(
        iterator_3_handle, element_types, element_shapes)
    get_next_op = remote_it.get_next()

  with session.Session(worker[0].target) as sess:
    with self.assertRaises(errors.InvalidArgumentError):
      sess.run(get_next_op)
def write(self, dataset):
  """Returns a `tf.Operation` to write a dataset to a file.

  Args:
    dataset: a `tf.data.Dataset` whose elements are to be written to a file

  Returns:
    A `tf.Operation` that, when run, writes contents of `dataset` to a file.

  Raises:
    TypeError: If `dataset` is not a `tf.data.Dataset`, or if its structure
      is not compatible with a scalar string tensor.
  """
  if not isinstance(dataset, dataset_ops.DatasetV2):
    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")

  scalar_string = structure.TensorStructure(dtypes.string, [])
  if not dataset_ops.get_structure(dataset).is_compatible_with(scalar_string):
    actual_shapes = dataset_ops.get_legacy_output_shapes(dataset)
    actual_types = dataset_ops.get_legacy_output_types(dataset)
    raise TypeError(
        "`dataset` must produce scalar `DT_STRING` tensors whereas it "
        "produces shape {0} and types {1}".format(actual_shapes,
                                                  actual_types))

  input_variant = dataset._variant_tensor  # pylint: disable=protected-access
  return gen_experimental_dataset_ops.experimental_dataset_to_tf_record(
      input_variant, self._filename, self._compression_type)
def testKinesisDatasetOneShard(self):
  """Reads ten records back from a freshly created single-shard stream.

  The stream is created at the start of the test and is always deleted at
  the end, including when an assertion or a session call fails.
  """
  client = boto3.client('kinesis', region_name='us-east-1')

  # Setup the Kinesis with 1 shard.
  stream_name = "tf_kinesis_test_1"
  client.create_stream(StreamName=stream_name, ShardCount=1)
  try:
    # Wait until stream exists, default is 10 * 18 seconds.
    client.get_waiter('stream_exists').wait(StreamName=stream_name)
    for i in range(10):
      data = "D" + str(i)
      client.put_record(
          StreamName=stream_name,
          Data=data,
          PartitionKey="TensorFlow" + str(i))

    stream = array_ops.placeholder(dtypes.string, shape=[])
    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])

    repeat_dataset = kinesis_dataset_ops.KinesisDataset(
        stream, read_indefinitely=False).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = iterator_ops.Iterator.from_structure(
        dataset_ops.get_legacy_output_types(batch_dataset))
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      # Basic test: read from shard 0 of stream 1.
      sess.run(init_op, feed_dict={stream: stream_name, num_epochs: 1})
      for i in range(10):
        self.assertEqual("D" + str(i), sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
  finally:
    # Clean up even on failure so a later run can recreate the same stream
    # name.
    client.delete_stream(StreamName=stream_name)
    # Wait until stream deleted, default is 10 * 18 seconds.
    client.get_waiter('stream_not_exists').wait(StreamName=stream_name)
def testRestructureDataset(self):
  """`_RestructuredDataset` accepts compatible restructurings only."""
  scalar_or_unknown = array_ops.placeholder(dtypes.int32)
  vector = array_ops.placeholder(dtypes.int32, shape=[None])
  matrix = array_ops.placeholder(dtypes.int32, shape=[20, 30])
  components = (scalar_or_unknown, (vector, matrix))
  dataset = dataset_ops.Dataset.from_tensors(components)

  i32 = dtypes.int32

  # (new types, new shapes) pairs that must be accepted.
  test_cases = [((i32, i32, i32), None),
                (((i32, i32), i32), None),
                ((i32, i32, i32), (None, None, None)),
                ((i32, i32, i32), ([17], [17], [20, 30]))]

  for new_types, new_shape_lists in test_cases:
    # pylint: disable=protected-access
    new = batching._RestructuredDataset(dataset, new_types, new_shape_lists)
    # pylint: enable=protected-access
    self.assertEqual(new_types, dataset_ops.get_legacy_output_types(new))
    if new_shape_lists is None:
      continue
    actual_shapes = nest.flatten(dataset_ops.get_legacy_output_shapes(new))
    for expected_shape_list, shape in zip(
        nest.flatten(new_shape_lists), actual_shapes):
      if expected_shape_list is not None:
        self.assertEqual(expected_shape_list, shape.as_list())
      else:
        # No shape requested: the rank stays unknown.
        self.assertIs(None, shape.ndims)

  # (new types, new shapes) pairs that must be rejected.
  fail_cases = [((i32, dtypes.int64, i32), None),
                ((i32, i32, i32, i32), None),
                ((i32, i32, i32), ((None, None), None)),
                ((i32, i32, i32), (None, None, None, None)),
                ((i32, i32, i32), (None, [None], [21, 30]))]

  for new_types, new_shape_lists in fail_cases:
    with self.assertRaises(ValueError):
      # pylint: disable=protected-access
      new = batching._RestructuredDataset(dataset, new_types, new_shape_lists)
def __init__(self, input_dataset):
  """See `unbatch()` for more details."""
  input_shapes = dataset_ops.get_legacy_output_shapes(input_dataset)
  flat_shapes = nest.flatten(input_shapes)

  # Every component needs a leading dimension to split on.
  for component_shape in flat_shapes:
    if component_shape.ndims == 0:
      raise ValueError("Cannot unbatch an input with scalar components.")

  # Merge the leading dimensions of all components; a conflict means the
  # components disagree on the batch size.
  known_batch_dim = tensor_shape.Dimension(None)
  for component_shape in flat_shapes:
    try:
      known_batch_dim = known_batch_dim.merge_with(component_shape[0])
    except ValueError:
      raise ValueError("Cannot unbatch an input whose components have "
                       "different batch sizes.")

  self._input_dataset = input_dataset

  # Output elements keep the input types and classes; each shape loses its
  # leading dimension.
  unbatched_shapes = nest.map_structure(lambda s: s[1:], input_shapes)
  self._structure = structure.convert_legacy_structure(
      dataset_ops.get_legacy_output_types(input_dataset),
      unbatched_shapes,
      dataset_ops.get_legacy_output_classes(input_dataset))

  input_variant = self._input_dataset._variant_tensor  # pylint: disable=protected-access
  variant_tensor = ged_ops.experimental_unbatch_dataset(
      input_variant, **dataset_ops.flat_structure(self))
  super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
def _remote_fn(h):
  """Return the next element of an iterator rebuilt from handle `h`.

  `h` is first passed through `_encode_raw` via `py_func` to obtain the
  string handle; types and shapes come from the enclosing `dataset_3`.
  """
  handle = script_ops.py_func(_encode_raw, [h], dtypes.string)
  element_types = dataset_ops.get_legacy_output_types(dataset_3)
  element_shapes = dataset_ops.get_legacy_output_shapes(dataset_3)
  remote_iterator = iterator_ops.Iterator.from_string_handle(
      handle, element_types, element_shapes)
  return remote_iterator.get_next()
def loading_func(h):
  """Return the next element of an iterator rebuilt from string handle `h`.

  Output types and shapes are taken from the enclosing `itr`.
  """
  element_types = dataset_ops.get_legacy_output_types(itr)
  element_shapes = dataset_ops.get_legacy_output_shapes(itr)
  remote_itr = iterator_ops.Iterator.from_string_handle(
      h, element_types, element_shapes)
  return remote_itr.get_next()