def _create_dataset(
    file_pattern: List[Text],
    data_accessor: DataAccessor,
    tf_transform_output: tft.TFTransformOutput,
    is_train: bool = False,
    batch_size: int = 200
) -> tf.data.Dataset:
  """Creates a dataset.

  Args:
    file_pattern (List[Text]): List of paths or patterns of input tfrecord
      files.
    data_accessor (DataAccessor): DataAccessor for converting input to
      RecordBatch.
    tf_transform_output (tft.TFTransformOutput): A TFTransformOutput.
    is_train (bool, optional): Whether the input dataset is train split or
      not. Defaults to False.
    batch_size (int, optional): Number of consecutive elements of the
      returned dataset to combine in a single batch. Defaults to 200.

  Returns:
    tf.data.Dataset: A dataset that contains (features, indices) tuples where
      features is a dictionary of Tensors, and indices is a single Tensor of
      label indices.
  """
  dataset = data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size, label_key=_transform_key_name(LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema)
  if is_train:
    dataset = dataset.map(lambda x, y: (_data_augment(x), y))
  return dataset
def _input_fn(file_pattern: List[str],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              is_train: bool = False,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    is_train: Whether the input dataset is train split or not.
    batch_size: Number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  dataset = data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size, label_key=_transformed_name(_LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema)

  # Apply data augmentation. We have to do data augmentation here because we
  # need to apply it on-the-fly during training. If we put it in Transform,
  # it would only be applied once to the whole dataset, which would defeat
  # the point of data augmentation.
  if is_train:
    dataset = dataset.map(lambda x, y: (_data_augmentation(x), y))

  return dataset
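# A minimal sketch of what the `_data_augmentation` helper used above might
# look like; the original implementation is not shown. The transformed
# feature key `image_xf` and the random-flip transform are assumptions for
# illustration only.
import tensorflow as tf

_IMAGE_KEY = 'image_xf'  # hypothetical transformed image feature name


def _data_augmentation(feature_dict):
  """Randomly flips the image feature of a batch; labels pass through."""
  image_features = feature_dict[_IMAGE_KEY]
  # random_flip_left_right is cheap enough to run on-the-fly inside
  # dataset.map(), which is why it lives here rather than in Transform.
  feature_dict[_IMAGE_KEY] = tf.image.random_flip_left_right(image_features)
  return feature_dict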
def testTensorFlowDatasetGraphMode(self):
  column_name = "raw_record"
  tfxio = raw_tf_record.RawTfRecordTFXIO(
      self._raw_record_file,
      column_name,
      telemetry_descriptors=["some", "component"])
  actual_records = []
  with tf.compat.v1.Graph().as_default():
    ds = tfxio.TensorFlowDataset(
        dataset_options.TensorFlowDatasetOptions(
            batch_size=1,
            shuffle=False,
            num_epochs=1,
            reader_num_threads=1,
            sloppy_ordering=False))
    iterator = tf.compat.v1.data.make_one_shot_iterator(ds)
    next_elem = iterator.get_next()
    with tf.compat.v1.Session() as sess:
      while True:
        try:
          actual_records.append(sess.run(next_elem)[column_name][0])
        except tf.errors.OutOfRangeError:
          break
  self.assertEqual(actual_records, _RAW_RECORDS)
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    batch_size: Number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  dataset = data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size,
          label_key=features.transformed_name(features.LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema)
  return dataset
def ReadExamplesArtifact(self,
                         examples: types.Artifact,
                         num_examples: int,
                         split_name: Optional[Text] = None):
  """Reads records from an Examples artifact.

  Currently it assumes the Examples artifact contains serialized tf.Examples
  in gzipped TFRecord files.

  Args:
    examples: `Examples` artifact.
    num_examples: Number of examples to read. If the specified value is
      larger than the actual number of examples, all examples would be read.
    split_name: Name of the split to read from the Examples artifact.

  Raises:
    RuntimeError: If read twice.
    ValueError: If num_examples is less than 1, if the requested split is not
      available, or if no matching example files are found.
  """
  if self._records:
    raise RuntimeError('Cannot read records twice.')

  if num_examples < 1:
    raise ValueError('num_examples < 1 (got {})'.format(num_examples))

  available_splits = artifact_utils.decode_split_names(examples.split_names)
  if not available_splits:
    raise ValueError('No split_name is available in given Examples artifact.')
  if split_name is None:
    split_name = available_splits[0]
  if split_name not in available_splits:
    raise ValueError('No split_name {}; available split names: {}'.format(
        split_name, ', '.join(available_splits)))

  # ExampleGen generates artifacts under each split_name directory.
  glob_pattern = os.path.join(examples.uri, split_name, '*')
  tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
      examples=[examples],
      telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
      schema=None,
      read_as_raw_records=True,
      raw_record_column_name=_RAW_RECORDS_COLUMN)
  try:
    filenames = fileio.glob(glob_pattern)
  except tf.errors.NotFoundError:
    filenames = []
  if not filenames:
    raise ValueError(
        'Unable to find examples matching {}.'.format(glob_pattern))

  self._payload_format = examples_utils.get_payload_format(examples)
  tfxio = tfxio_factory(filenames)

  self._ReadFromDataset(
      tfxio.TensorFlowDataset(
          dataset_options.TensorFlowDatasetOptions(batch_size=num_examples)))
def test_tensorflow_dataset_with_invalid_label_key(self):
  tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
      self._input_path, self._decoder_path, ["some", "component"])
  label_key = "invalid"
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1, label_key=label_key)
  with self.assertRaisesRegex(ValueError, "The `label_key` provided.*"):
    tfxio.TensorFlowDataset(options=options)
def test_tensorflow_dataset(self):
  tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
      self._input_path, self._decoder_path, ["some", "component"])
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1)
  for i, decoded_tensors_dict in enumerate(
      tfxio.TensorFlowDataset(options=options)):
    for key, tensor in decoded_tensors_dict.items():
      self._AssertSparseTensorEqual(tensor, _RECORDS_AS_TENSORS[i][key])
def _input_fn(file_pattern, data_accessor, tf_transform_output,
              batch_size=200):
  return data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size, label_key=transformed_name(LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema)
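# A hedged sketch of how an `_input_fn` like the one above is typically wired
# into a TFX Trainer `run_fn`. `_build_keras_model` is a hypothetical
# placeholder; the FnArgs attributes used here (train_files, eval_files,
# data_accessor, transform_output, train_steps, eval_steps,
# serving_model_dir) follow the standard TFX Trainer contract.
import tensorflow_transform as tft
from tfx.components.trainer.fn_args_utils import FnArgs


def run_fn(fn_args: FnArgs):
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)
  train_dataset = _input_fn(fn_args.train_files, fn_args.data_accessor,
                            tf_transform_output, batch_size=64)
  eval_dataset = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                           tf_transform_output, batch_size=64)
  model = _build_keras_model()  # hypothetical model-building helper
  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps)
  model.save(fn_args.serving_model_dir, save_format='tf')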
def testTensorFlowDataset(self):
  tfxio = self._MakeTFXIO(_SCHEMA)
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1)
  for i, parsed_examples_dict in enumerate(
      tfxio.TensorFlowDataset(options=options)):
    self.assertLen(parsed_examples_dict, 3)
    for feature_name, tensor in parsed_examples_dict.items():
      self._AssertSparseTensorEqual(tensor,
                                    _EXAMPLES_AS_TENSORS[i][feature_name])
def testTensorFlowDatasetWithLabelKey(self):
  tfxio = self._MakeTFXIO(_SCHEMA)
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1, label_key="string_feature")
  for i, (parsed_examples_dict, label_feature) in enumerate(
      tfxio.TensorFlowDataset(options=options)):
    self._AssertSparseTensorEqual(label_feature,
                                  _EXAMPLES_AS_TENSORS[i]["string_feature"])
    self.assertLen(parsed_examples_dict, 2)
    for feature_name, tensor in parsed_examples_dict.items():
      self._AssertSparseTensorEqual(tensor,
                                    _EXAMPLES_AS_TENSORS[i][feature_name])
def testProjectedTensorFlowDataset(self):
  tfxio = self._MakeTFXIO(_SCHEMA)
  feature_name = "string_feature"
  projected_tfxio = tfxio.Project([feature_name])
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1)
  for i, parsed_examples_dict in enumerate(
      projected_tfxio.TensorFlowDataset(options=options)):
    self.assertIn(feature_name, parsed_examples_dict)
    self.assertLen(parsed_examples_dict, 1)
    self._AssertSparseTensorEqual(parsed_examples_dict[feature_name],
                                  _EXAMPLES_AS_TENSORS[i][feature_name])
def testTensorFlowDatasetWithTensorRepresentation(self):
  schema = text_format.Parse(
      """
      feature {
        name: "int_feature"
        type: INT
        value_count { min: 1 max: 1 }
      }
      feature {
        name: "float_feature"
        type: FLOAT
        value_count { min: 4 max: 4 }
      }
      feature {
        name: "string_feature"
        type: BYTES
        value_count { min: 0 max: 2 }
      }
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "var_len_feature"
            value { varlen_sparse_tensor { column_name: "string_feature" } }
          }
        }
      }
      """, schema_pb2.Schema())
  tfxio = self._MakeTFXIO(schema)
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1)
  for i, parsed_examples_dict in enumerate(
      tfxio.TensorFlowDataset(options=options)):
    self.assertLen(parsed_examples_dict, 1)
    for tensor_name, tensor in parsed_examples_dict.items():
      self.assertEqual(tensor_name, "var_len_feature")
      self._AssertSparseTensorEqual(tensor,
                                    _EXAMPLES_AS_TENSORS[i]["string_feature"])
def input_fn(
    file_pattern: List[Text],
    data_accessor: DataAccessor,
    tf_transform_output: tft.TFTransformOutput,
    batch_size: int = 200,
) -> tf.data.Dataset:
  return data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size,
          label_key=module.transformed_name(module.LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema,
  )
def test_tensorflow_dataset_with_label_key(self):
  decoder_path = _write_decoder()
  tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
      self._input_path, decoder_path, ["some", "component"])
  label_key = "st1"
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1, label_key=label_key)
  for i, (decoded_tensors_dict, label_feature) in enumerate(
      tfxio.TensorFlowDataset(options=options)):
    self._assert_sparse_tensor_equal(label_feature,
                                     _RECORDS_AS_TENSORS[i][label_key])
    for key, tensor in decoded_tensors_dict.items():
      self._assert_sparse_tensor_equal(tensor, _RECORDS_AS_TENSORS[i][key])
def test_projected_tensorflow_dataset(self):
  tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
      self._input_path, self._decoder_path, ["some", "component"])
  feature_name = "st1"
  projected_tfxio = tfxio.Project([feature_name])
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1)
  for i, decoded_tensors_dict in enumerate(
      projected_tfxio.TensorFlowDataset(options=options)):
    self.assertIn(feature_name, decoded_tensors_dict)
    self.assertLen(decoded_tensors_dict, 1)
    tensor = decoded_tensors_dict[feature_name]
    self._AssertSparseTensorEqual(tensor, _RECORDS_AS_TENSORS[i][feature_name])
def testTensorFlowDataset(self):
  column_name = "raw_record"
  tfxio = raw_tf_record.RawTfRecordTFXIO(
      self._raw_record_file,
      column_name,
      telemetry_descriptors=["some", "component"])
  ds = tfxio.TensorFlowDataset(
      dataset_options.TensorFlowDatasetOptions(
          batch_size=1,
          shuffle=False,
          num_epochs=1,
          reader_num_threads=1,
          sloppy_ordering=False))
  actual_records = [d[column_name].numpy()[0] for d in ds]
  self.assertEqual(actual_records, _RAW_RECORDS)
def build_dataset(files):
  return (
      fn_args.data_accessor.tf_dataset_factory(
          files,
          dataset_options.TensorFlowDatasetOptions(batch_size),
          schema,
      )
      .map(
          lambda batch: (
              Features(inputs).map(lambda name, _: batch[name]),
              Features(outputs).map(lambda name, _: batch[name]),
          ))
      .repeat())
def _input_fn(
    file_pattern: List[str],
    data_accessor: DataAccessor,
    tf_transform_output: tft.TFTransformOutput,
    batch_size: int = 200,
) -> tf.data.Dataset:
  dataset = data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size,
          label_key="class_xf",
      ),
      tf_transform_output.transformed_metadata.schema,
  )
  return dataset.repeat()
def testTensorFlowDatasetGraphMode(self):
  tfxio = self._MakeTFXIO(_SCHEMA)
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1)
  with tf.compat.v1.Graph().as_default():
    ds = tfxio.TensorFlowDataset(options=options)
    iterator = tf.compat.v1.data.make_one_shot_iterator(ds)
    next_elem = iterator.get_next()
    records = []
    with tf.compat.v1.Session() as sess:
      while True:
        try:
          records.append(sess.run(next_elem))
        except tf.errors.OutOfRangeError:
          break
  for i, parsed_examples_dict in enumerate(records):
    self.assertLen(parsed_examples_dict, 3)
    for tensor_name, tensor in parsed_examples_dict.items():
      self._AssertSparseTensorEqual(tensor,
                                    _EXAMPLES_AS_TENSORS[i][tensor_name])
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              schema: schema_pb2.Schema,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    schema: Schema of the input data.
    batch_size: Number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  return data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size, label_key=_LABEL_KEY),
      schema)
def _read_transformed_dataset(
    file_pattern,
    data_accessor,
    tf_transform_output,
    num_epochs=1,
    shuffle=False,
    sloppy_ordering=True,
    batch_size=100000,
):
  """Read data coming out of the Transform component.

  Parameters
  ----------
  file_pattern : list(str)
      List of paths or patterns of input tfrecord files.
  data_accessor : tfx.components.trainer.fn_args_utils.DataAccessor
      DataAccessor for converting input to RecordBatch.
  tf_transform_output : tft.TFTransformOutput
      A TFTransformOutput.
  num_epochs : int
      Number of times to read through the dataset.
  shuffle : bool
      Whether to shuffle the records.
  sloppy_ordering : bool
      Whether to allow non-deterministic output ordering for speed.
  batch_size : int
      Number of consecutive elements to combine in a single batch.

  Returns
  -------
  tf.data.Dataset (iterable)
      An iterable dataset where each iteration returns a data batch as a
      dictionary {"field1": array[...], "field2": array[...], ...}
  """
  return data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          num_epochs=num_epochs,
          shuffle=shuffle,
          sloppy_ordering=sloppy_ordering,
          batch_size=int(batch_size),
      ),
      tf_transform_output.transformed_metadata.schema,
  )
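# A hedged sketch of consuming the dictionary batches returned by
# `_read_transformed_dataset` above, e.g. to inspect transformed features as
# a pandas DataFrame. It assumes eager execution and dense (non-sparse)
# feature tensors; `batches_to_dataframe` and `max_batches` are names
# introduced here for illustration only.
import numpy as np
import pandas as pd


def batches_to_dataframe(dataset, max_batches=1):
  """Collects up to `max_batches` dict-batches into a single DataFrame."""
  frames = []
  for batch in dataset.take(max_batches):
    frames.append(
        pd.DataFrame({
            name: np.ravel(tensor.numpy()) for name, tensor in batch.items()
        }))
  return pd.concat(frames, ignore_index=True)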
def testTensorFlowDatasetWithRaggedTensorRepresentation(self):
  schema = text_format.Parse(
      """
      feature {
        name: "varlen_feature"
        type: INT
      }
      feature {
        name: "row_lengths"
        type: INT
      }
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "ragged"
            value {
              ragged_tensor {
                feature_path { step: "varlen_feature" }
                partition { row_length: "row_lengths" }
              }
            }
          }
        }
      }
      """, schema_pb2.Schema())
  tfxio = self._MakeTFXIO(schema)
  projected_tfxio = tfxio.Project(["ragged"])
  expected_column_values = {
      "varlen_feature":
          pa.array([[1, 2, 3], [4], [5, 6]], type=pa.large_list(pa.int64())),
      "row_lengths":
          pa.array([[2, 1], [1], [1, 1]], type=pa.large_list(pa.int64())),
  }

  def _AssertFn(record_batch_list):
    self.assertLen(record_batch_list, 1)
    record_batch = record_batch_list[0]
    self.assertIsInstance(record_batch, pa.RecordBatch)
    self.assertEqual(record_batch.num_rows, 3)
    print(record_batch.schema)
    for i, field in enumerate(record_batch.schema):
      self.assertTrue(
          record_batch.column(i).equals(expected_column_values[field.name]),
          "Column {} did not match ({} vs {}).".format(
              field.name, record_batch.column(i),
              expected_column_values[field.name]))
    # self._ValidateRecordBatch(tfxio, record_batch)
    expected_schema = projected_tfxio.ArrowSchema()
    self.assertTrue(
        record_batch.schema.equals(expected_schema),
        "actual: {}; expected: {}".format(record_batch.schema,
                                          expected_schema))
    tensor_adapter = projected_tfxio.TensorAdapter()
    dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
    self.assertLen(dict_of_tensors, 1)
    self.assertIn("ragged", dict_of_tensors)
    if tf.executing_eagerly():
      ragged_factory = tf.RaggedTensor.from_row_splits
    else:
      ragged_factory = tf.compat.v1.ragged.RaggedTensorValue
    expected_tensor = ragged_factory(
        values=ragged_factory(
            values=[1, 2, 3, 4, 5, 6], row_splits=[0, 2, 3, 4, 5, 6]),
        row_splits=[0, 2, 3, 5])
    self.assertAllEqual(dict_of_tensors["ragged"], expected_tensor)

  with beam.Pipeline() as p:
    # Set the batch_size to make sure only one batch is generated.
    record_batch_pcoll = p | projected_tfxio.BeamSource(
        batch_size=len(_EXAMPLES))
    beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)

  if tf.executing_eagerly():
    ragged_factory = tf.RaggedTensor.from_row_splits
  else:
    ragged_factory = tf.compat.v1.ragged.RaggedTensorValue
  expected_tensors = [
      ragged_factory(
          values=ragged_factory(values=[1, 2, 3], row_splits=[0, 2, 3]),
          row_splits=[0, 2]),
      ragged_factory(
          values=ragged_factory(values=[4], row_splits=[0, 1]),
          row_splits=[0, 1]),
      ragged_factory(
          values=ragged_factory(values=[5, 6], row_splits=[0, 1, 2]),
          row_splits=[0, 2]),
  ]
  options = dataset_options.TensorFlowDatasetOptions(
      batch_size=1, shuffle=False, num_epochs=1)
  for i, parsed_examples_dict in enumerate(
      projected_tfxio.TensorFlowDataset(options)):
    self.assertLen(parsed_examples_dict, 1)
    self.assertIn("ragged", parsed_examples_dict)
    self.assertAllEqual(parsed_examples_dict["ragged"], expected_tensors[i])