def parse_example_dataset(features, num_parallel_calls=1):
  """A transformation that parses `Example` protos into a `dict` of tensors.

  Parses a number of serialized `Example` protos given in `serialized`. We
  refer to `serialized` as a batch with `batch_size` many entries of
  individual `Example` protos.

  This op parses serialized examples into a dictionary mapping keys to
  `Tensor` and `SparseTensor` objects. `features` is a dict from keys to
  `VarLenFeature`, `SparseFeature`, and `FixedLenFeature` objects. Each
  `VarLenFeature` and `SparseFeature` is mapped to a `SparseTensor`, and each
  `FixedLenFeature` is mapped to a `Tensor`. See `tf.io.parse_example` for
  more details about feature dictionaries.

  Args:
    features: A `dict` mapping feature keys to `FixedLenFeature`,
      `VarLenFeature`, and `SparseFeature` values.
    num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
      representing the number of parsing processes to call in parallel.

  Returns:
    A dataset transformation function, which can be passed to
    `tf.data.Dataset.apply`.

  Raises:
    ValueError: if features argument is None.
  """
  return parsing_ops.parse_example_dataset(features, num_parallel_calls)
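# A minimal usage sketch (not part of the library code above): applying the
# transformation via `tf.data.Dataset.apply` to a dataset whose elements are
# vectors of serialized `Example` protos. The feature names and values below
# are illustrative assumptions; the public symbol is assumed to be
# `tf.data.experimental.parse_example_dataset`.
import tensorflow as tf

serialized = tf.train.Example(features=tf.train.Features(feature={
    "age": tf.train.Feature(int64_list=tf.train.Int64List(value=[29])),
    "kws": tf.train.Feature(bytes_list=tf.train.BytesList(
        value=[b"code", b"art"])),
})).SerializeToString()

features = {
    # FixedLenFeature -> dense Tensor, VarLenFeature -> SparseTensor.
    "age": tf.io.FixedLenFeature([], tf.int64, default_value=-1),
    "kws": tf.io.VarLenFeature(tf.string),
}

dataset = tf.data.Dataset.from_tensors([serialized])  # one batch of size 1
dataset = dataset.apply(
    tf.data.experimental.parse_example_dataset(features, num_parallel_calls=2))

for parsed in dataset:
  print(parsed["age"], parsed["kws"])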
def testDeterminism(self, local_determinism, global_determinism):
  num_elements = 1000
  batches = []
  for i in range(num_elements):
    example_i = example(features=features({
        "a": int64_feature([i]),
    }))
    batches.append([example_i.SerializeToString()])

  test_features = {"a": parsing_ops.FixedLenFeature((), dtype=dtypes.int64)}
  dataset = dataset_ops.Dataset.from_tensor_slices(batches)
  dataset = dataset.apply(
      contrib_parsing_ops.parse_example_dataset(
          test_features,
          num_parallel_calls=10,
          deterministic=local_determinism))

  opts = dataset_ops.Options()
  opts.experimental_deterministic = global_determinism
  dataset = dataset.with_options(opts)

  expected = list(range(num_elements))
  actual = [elem["a"][0] for elem in self.getDatasetOutput(dataset)]

  require_order = local_determinism or (local_determinism is None and
                                         global_determinism)
  if require_order:
    self.assertAllEqual(expected, actual)
  else:
    self.assertCountEqual(expected, actual)
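# A hedged sketch of the behaviour this test exercises, assuming the public
# `tf.data.experimental.parse_example_dataset` accepts the same
# `deterministic` argument used above: a per-transformation value overrides
# the global option, and `None` defers to `tf.data.Options`.
import tensorflow as tf

features = {"a": tf.io.FixedLenFeature([], tf.int64)}
serialized = [
    [tf.train.Example(features=tf.train.Features(feature={
        "a": tf.train.Feature(int64_list=tf.train.Int64List(value=[i])),
    })).SerializeToString()]
    for i in range(100)
]

dataset = tf.data.Dataset.from_tensor_slices(serialized)
dataset = dataset.apply(
    tf.data.experimental.parse_example_dataset(
        features, num_parallel_calls=10, deterministic=None))

options = tf.data.Options()
options.experimental_deterministic = True  # used because `deterministic=None`
dataset = dataset.with_options(options)

values = [int(elem["a"][0]) for elem in dataset]
assert values == list(range(100))  # parallel parsing, order still preserved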
def _test(self,
          input_tensor,
          feature_val,
          expected_values=None,
          expected_err=None,
          create_iterator_twice=False):

  if expected_err:
    with self.assertRaisesWithPredicateMatch(expected_err[0],
                                             expected_err[1]):
      dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
          contrib_parsing_ops.parse_example_dataset(feature_val))
      get_next = self.getNext(dataset)
      self.evaluate(get_next())
    return
  else:
    # Returns dict w/ Tensors and SparseTensors.
    # Check values.
    dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
        contrib_parsing_ops.parse_example_dataset(feature_val))
    get_next = self.getNext(dataset)
    result = self.evaluate(get_next())
    self._compare_output_to_expected(result, expected_values)
    with self.assertRaises(errors_impl.OutOfRangeError):
      self.evaluate(get_next())
    with self.assertRaises(errors_impl.OutOfRangeError):
      self.evaluate(get_next())
    if create_iterator_twice:
      get_next = self.getNext(dataset)
      result = self.evaluate(get_next())
      self._compare_output_to_expected(result, expected_values)
      with self.assertRaises(errors_impl.OutOfRangeError):
        self.evaluate(get_next())
    # Check shapes; if serialized is a Tensor we need its size to
    # properly check.
    batch_size = (
        self.evaluate(input_tensor).size
        if isinstance(input_tensor, ops.Tensor)
        else np.asarray(input_tensor).size)
    for k, f in feature_val.items():
      if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
        self.assertEqual(
            dataset_ops.get_legacy_output_shapes(dataset)[k].as_list()[0],
            batch_size)
      elif isinstance(f, parsing_ops.VarLenFeature):
        self.assertEqual(
            dataset_ops.get_legacy_output_shapes(dataset)[k].as_list()[1],
            None)
def _test(self,
          input_tensor,
          feature_val,
          expected_values=None,
          expected_err=None):

  with self.cached_session() as sess:
    if expected_err:
      with self.assertRaisesWithPredicateMatch(expected_err[0],
                                               expected_err[1]):
        dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
            contrib_parsing_ops.parse_example_dataset(feature_val))
        get_next = dataset.make_one_shot_iterator().get_next()
        sess.run(get_next)
      return
    else:
      # Returns dict w/ Tensors and SparseTensors.
      # Check values.
      dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
          contrib_parsing_ops.parse_example_dataset(feature_val))
      get_next = dataset.make_one_shot_iterator().get_next()
      result = sess.run(get_next)
      flattened = nest.flatten(result)
      print("result", result, "expected_values", expected_values)
      _compare_output_to_expected(self, result, expected_values, flattened)
      # Check shapes; if serialized is a Tensor we need its size to
      # properly check.
      batch_size = (
          input_tensor.eval().size
          if isinstance(input_tensor, ops.Tensor)
          else np.asarray(input_tensor).size)
      for k, f in feature_val.items():
        print("output_shapes as list ",
              tuple(dataset.output_shapes[k].as_list()))
        if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
          self.assertEqual(dataset.output_shapes[k].as_list()[0], batch_size)
        elif isinstance(f, parsing_ops.VarLenFeature):
          self.assertEqual(dataset.output_shapes[k].as_list()[1], None)
def make_batched_features_dataset_v2(file_pattern,
                                     batch_size,
                                     features,
                                     reader=core_readers.TFRecordDataset,
                                     label_key=None,
                                     reader_args=None,
                                     num_epochs=None,
                                     shuffle=True,
                                     shuffle_buffer_size=10000,
                                     shuffle_seed=None,
                                     prefetch_buffer_size=optimization.AUTOTUNE,
                                     reader_num_threads=1,
                                     parser_num_threads=2,
                                     sloppy_ordering=False,
                                     drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  If the `label_key` argument is provided, returns a `Dataset` of tuples
  comprising a feature dictionary and a label.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0]],
        values=["code", "art", "sports"],
        dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing `Example`
      records. See `tf.io.gfile.glob` for pattern rules.
    batch_size: An int representing the number of records to combine in a
      single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.io.parse_example`.
    reader: A function or class that can be called with a `filenames` tensor
      and (optional) `reader_args` and returns a `Dataset` of `Example`
      tensors. Defaults to `tf.data.TFRecordDataset`.
    label_key: (Optional) A string corresponding to the key under which labels
      are stored in the `tf.Example`s. If provided, it must be one of the
      `features` keys, otherwise a `ValueError` is raised.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to
      `None`.
    shuffle: A boolean, indicates whether the input should be shuffled.
      Defaults to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup
      time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step. Defaults to auto-tune.
    reader_num_threads: Number of threads used to read `Example` records. If
      >1, the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at the
      cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`; note that if the seed is set, the order of
      elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements (or a tuple of `dict` elements and label).
    Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.

  Raises:
    TypeError: If `reader` is a `tf.compat.v1.ReaderBase` subclass.
    ValueError: If `label_key` is not one of the `features` keys.
  """
  # Create dataset of all matching filenames.
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  if isinstance(reader, type) and issubclass(reader, io_ops.ReaderBase):
    raise TypeError("The `reader` argument must return a `Dataset` object. "
                    "`tf.ReaderBase` subclasses are not supported. For "
                    "example, pass `tf.data.TFRecordDataset` instead of "
                    "`tf.TFRecordReader`.")

  # Read `Example` records from files as tensor objects.
  if reader_args is None:
    reader_args = []

  # Read files sequentially (if reader_num_threads=1) or in parallel.
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          lambda filename: reader(filename, *reader_args),
          cycle_length=reader_num_threads,
          sloppy=sloppy_ordering))

  # Extract values if the `Example` tensors are stored as key-value tuples.
  if dataset_ops.get_legacy_output_types(dataset) == (
      dtypes.string, dtypes.string):
    dataset = dataset_ops.MapDataset(
        dataset, lambda _, v: v, use_inter_op_parallelism=False)

  # Apply dataset repeat and shuffle transformations.
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
  # improve the shape inference, because it makes the batch dimension static.
  # It is safe to do this because in that case we are repeating the input
  # indefinitely, and all batches will be full-sized.
  dataset = dataset.batch(
      batch_size, drop_remainder=drop_final_batch or num_epochs is None)

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.apply(
      parsing_ops.parse_example_dataset(
          features, num_parallel_calls=parser_num_threads))

  if label_key:
    if label_key not in features:
      raise ValueError(
          "The `label_key` provided (%r) must be one of the `features` keys."
          % label_key)
    dataset = dataset.map(lambda x: (x, x.pop(label_key)))

  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset
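# A hedged usage sketch of the pipeline built above, through the public
# `tf.data.experimental.make_batched_features_dataset` wrapper. The file
# pattern and feature spec are placeholders, not part of the original code.
import tensorflow as tf

features = {
    "age": tf.io.FixedLenFeature([], tf.int64, default_value=-1),
    "gender": tf.io.FixedLenFeature([], tf.string),
    "kws": tf.io.VarLenFeature(tf.string),
    "label": tf.io.FixedLenFeature([], tf.int64),
}

# "/data/train-*.tfrecord" is a placeholder glob of TFRecord files holding
# serialized `Example` protos.
dataset = tf.data.experimental.make_batched_features_dataset(
    file_pattern="/data/train-*.tfrecord",
    batch_size=32,
    features=features,
    label_key="label",   # each element becomes a (features_dict, label) tuple
    num_epochs=1,
    shuffle=True,
    shuffle_buffer_size=10000)

for batch_features, labels in dataset.take(1):
  print(batch_features["age"].shape, labels.shape)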
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  label_key=None,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=optimization.AUTOTUNE,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  If the `label_key` argument is provided, returns a `Dataset` of tuples
  comprising a feature dictionary and a label.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0]],
        values=["code", "art", "sports"],
        dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing `Example`
      records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine in a
      single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be called with a `filenames` tensor
      and (optional) `reader_args` and returns a `Dataset` of `Example`
      tensors. Defaults to `tf.data.TFRecordDataset`.
    label_key: (Optional) A string corresponding to the key under which labels
      are stored in the `tf.Example`s. If provided, it must be one of the
      `features` keys, otherwise a `ValueError` is raised.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to
      `None`.
    shuffle: A boolean, indicates whether the input should be shuffled.
      Defaults to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup
      time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step. Defaults to auto-tune.
    reader_num_threads: Number of threads used to read `Example` records. If
      >1, the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at the
      cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`; note that if the seed is set, the order of
      elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements (or a tuple of `dict` elements and label).
    Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.

  Raises:
    ValueError: If `label_key` is not one of the `features` keys.
  """
  # Create dataset of all matching filenames.
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Read `Example` records from files as tensor objects.
  if reader_args is None:
    reader_args = []

  # Read files sequentially (if reader_num_threads=1) or in parallel.
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          lambda filename: reader(filename, *reader_args),
          cycle_length=reader_num_threads,
          sloppy=sloppy_ordering))

  # Extract values if the `Example` tensors are stored as key-value tuples.
  if dataset.output_types == (dtypes.string, dtypes.string):
    dataset = dataset_ops.MapDataset(
        dataset, lambda _, v: v, use_inter_op_parallelism=False)

  # Apply dataset repeat and shuffle transformations.
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
  # improve the shape inference, because it makes the batch dimension static.
  # It is safe to do this because in that case we are repeating the input
  # indefinitely, and all batches will be full-sized.
  dataset = dataset.batch(
      batch_size, drop_remainder=drop_final_batch or num_epochs is None)

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.apply(
      parsing_ops.parse_example_dataset(
          features, num_parallel_calls=parser_num_threads))

  if label_key:
    if label_key not in features:
      raise ValueError(
          "The `label_key` provided (%r) must be one of the `features` keys."
          % label_key)
    dataset = dataset.map(lambda x: (x, x.pop(label_key)))

  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset
def make_batched_features_dataset_multi_task(
    file_pattern,
    batch_size,
    features,
    reader=core_readers.TFRecordDataset,
    label_key=None,
    weight_key=None,
    reader_args=None,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=None,
    prefetch_buffer_size=optimization.AUTOTUNE,
    reader_num_threads=32,
    parser_num_threads=32,
    sloppy_ordering=True,
    drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  Returns:
    A dataset of `dict` elements (or a tuple of `dict` elements and label).
    Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.
  """
  if shuffle_seed is None:
    shuffle_seed = int(time.time())

  # Create dataset of all matching filenames.
  filenames = list(gfile.Glob(file_pattern))
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Read `Example` records from files as tensor objects.
  if reader_args is None:
    reader_args = []

  # Read files sequentially (if reader_num_threads=1) or in parallel.
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          lambda filename: reader(filename, *reader_args),
          cycle_length=reader_num_threads,
          block_length=200,
          sloppy=sloppy_ordering))

  # Extract values if the `Example` tensors are stored as key-value tuples.
  if dataset_ops.get_legacy_output_types(dataset) == (
      dtypes.string, dtypes.string):
    dataset = dataset_ops.MapDataset(
        dataset, lambda _, v: v, use_inter_op_parallelism=True)

  # Apply dataset repeat and shuffle transformations.
  dataset = dataset.apply(
      shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                     shuffle_seed))

  dataset = dataset.batch(
      batch_size, drop_remainder=drop_final_batch or num_epochs is None)

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.apply(
      parsing_ops.parse_example_dataset(
          features, num_parallel_calls=parser_num_threads))

  if weight_key:
    # assert label_key
    # assert label_key != weight_key
    # assert label_key in features
    assert weight_key in features
    if label_key:
      if label_key not in features:
        raise ValueError(
            "The `label_key` provided (%r) must be one of the `features` keys."
            % label_key)
      assert label_key != weight_key
      # Replicate the label five times (one per task head) and pop the weight
      # out as a separate element.
      dataset = dataset.map(
          lambda x: (x, tuple([x.pop(label_key)] * 5), x.pop(weight_key)))
      # w = dataset.map(lambda x, y: x.pop(weight_key))
  else:
    if label_key:
      if label_key not in features:
        raise ValueError(
            "The `label_key` provided (%r) must be one of the `features` keys."
            % label_key)
      dataset = dataset.map(lambda x: (x, tuple([x.pop(label_key)] * 5)))

  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset
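# An illustrative sketch of how the multi-task variant above might be called.
# The file pattern, feature names, and the five-way label replication mirror
# the hard-coded behaviour in the function; they are assumptions for the
# example, not a documented API.
import tensorflow as tf

features = {
    "query": tf.io.VarLenFeature(tf.string),
    "label": tf.io.FixedLenFeature([], tf.float32),
    "weight": tf.io.FixedLenFeature([], tf.float32, default_value=1.0),
}

# Placeholder pattern; the function expands it with gfile.Glob.
dataset = make_batched_features_dataset_multi_task(
    file_pattern="/data/multitask-*.tfrecord",
    batch_size=256,
    features=features,
    label_key="label",    # replicated into a 5-tuple, one entry per task head
    weight_key="weight",  # popped out as a third tuple element
    num_epochs=1)

# Each element is (features_dict, (label,) * 5, weight).
for feats, labels, weights in dataset.take(1):
  print(len(labels), weights.shape)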