def test_save_load_decode(self):
  """Round-trips a decoder that names a record index tensor.

  Verifies the declared output type specs, the record index tensor name
  (eagerly and in graph mode), and the decoded composite tensors, for both
  the free-function save path and the decoder's own `save` method.
  """
  decoder = _DecoderForTestWithRecordIndexTensorName()
  expected_type_specs = {
      "sparse_tensor":
          tf.SparseTensorSpec(shape=[None, None], dtype=tf.string),
      "ragged_tensor":
          tf.RaggedTensorSpec(
              shape=[None, None], dtype=tf.string, ragged_rank=1),
      "record_index":
          tf.RaggedTensorSpec(
              shape=[None, None], dtype=tf.int64, ragged_rank=1),
      "dense_tensor":
          tf.TensorSpec(shape=[None], dtype=tf.string),
  }
  self.assertEqual(decoder.output_type_specs(), expected_type_specs)
  self.assertEqual(decoder.record_index_tensor_name, "record_index")

  tf_graph_record_decoder.save_decoder(decoder, self._tmp_dir)
  loaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
  self.assertEqual(loaded.record_index_tensor_name, "record_index")
  self._assert_type_specs_equal(decoder.output_type_specs(),
                                loaded.output_type_specs())

  records = [b"abc", b"def"]
  decoded = loaded.decode_record(records)
  self.assertLen(decoded, len(loaded.output_type_specs()))
  self.assertIn("sparse_tensor", decoded)
  sparse = decoded["sparse_tensor"]
  self.assertAllEqual(sparse.values, records)
  self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
  self.assertAllEqual(sparse.dense_shape, [2, 1])
  self.assertAllEqual(decoded["ragged_tensor"],
                      tf.ragged.constant([[b"abc"], [b"def"]]))
  self.assertAllEqual(decoded["record_index"],
                      tf.ragged.constant([[0], [1]]))
  self.assertAllEqual(decoded["dense_tensor"], records)

  # Also test that .record_index_tensor_name can be accessed in graph
  # mode.
  with tf.compat.v1.Graph().as_default():
    self.assertFalse(tf.executing_eagerly())
    loaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
    self.assertEqual(loaded.record_index_tensor_name, "record_index")

  # Also test that saving through the decoder's own `save` method works.
  new_decoder_path = os.path.join(self._tmp_dir, "decoder_2")
  decoder.save(new_decoder_path)
  loaded = tf_graph_record_decoder.load_decoder(new_decoder_path)
  self.assertEqual(loaded.record_index_tensor_name, "record_index")
def test_no_record_index_tensor_name(self):
  """A decoder with no record index tensor reports None, before and after
  a save/load round trip, in both eager and graph mode."""
  decoder = _DecoderForTesting()
  self.assertIsNone(decoder.record_index_tensor_name)
  tf_graph_record_decoder.save_decoder(decoder, self._tmp_dir)
  reloaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
  self.assertIsNone(reloaded.record_index_tensor_name)
  # The property must also be readable when eager execution is disabled.
  with tf.compat.v1.Graph().as_default():
    self.assertFalse(tf.executing_eagerly())
    reloaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
    self.assertIsNone(reloaded.record_index_tensor_name)
def test_save_load_decode(self):
  """Round-trips a plain decoder and checks specs and decoded tensors."""
  decoder = _DecoderForTesting()
  expected_type_specs = {
      "sparse_tensor":
          tf.SparseTensorSpec(shape=[None, None], dtype=tf.string),
      "ragged_tensor":
          tf.RaggedTensorSpec(
              shape=[None, None], dtype=tf.string, ragged_rank=1),
  }
  self.assertEqual(decoder.output_type_specs(), expected_type_specs)

  tf_graph_record_decoder.save_decoder(decoder, self._tmp_dir)
  loaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
  self.assertEqual(decoder.output_type_specs(), loaded.output_type_specs())

  decoded = loaded.decode_record([b"abc", b"def"])
  self.assertLen(decoded, len(loaded.output_type_specs()))
  self.assertIn("sparse_tensor", decoded)
  sparse = decoded["sparse_tensor"]
  self.assertAllEqual(sparse.values, [b"abc", b"def"])
  self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
  self.assertAllEqual(sparse.dense_shape, [2, 1])
  self.assertAllEqual(decoded["ragged_tensor"],
                      tf.ragged.constant([[b"abc"], [b"def"]]))
def DecodeFunction(self) -> Callable[[tf.Tensor], Dict[Text, Any]]:
  """Returns the decode function provided by the decoder.

  Returns:
    A TF function that takes a 1-D string tensor and returns a dict from
    strings to (composite) tensors.
  """
  loaded = tf_graph_record_decoder.load_decoder(self._saved_decoder_path)
  return loaded.decode_record
def __init__(self, saved_decoder_path: Text):
  """Loads the saved decoder and eagerly warms up its decode function.

  Args:
    saved_decoder_path: path to a decoder previously saved by
      `tf_graph_record_decoder.save_decoder`.
  """
  self.saved_decoder_path = saved_decoder_path
  decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
  self.output_type_specs = decoder.output_type_specs()
  # Store the concrete function to avoid tracing upon calling.
  # TF guarantees its thread-safety.
  self.decode_fn = decoder.decode_record.get_concrete_function()
  # Call the concrete function once to force optimization of the graph, as
  # we want that to be attributed as fixed setup cost.
  # Here we assume that an empty string tensor (0 record) can be successfully
  # decoded.
  _ = self.decode_fn(tf.convert_to_tensor([""], dtype=tf.string))
def _ApplyDecoderToDataset(
    self, dataset: tf.data.Dataset) -> tf.data.Dataset:
  """Decodes every record in `dataset`, keeping only tensors that have a
  TensorRepresentation."""
  loaded_decoder = tf_graph_record_decoder.load_decoder(
      self._saved_decoder_path)

  def _decode_and_project(record):
    decoded = loaded_decoder.decode_record(record)
    # Drop any decoder output that has no corresponding representation.
    return {
        name: tensor
        for name, tensor in decoded.items()
        if name in self.TensorRepresentations()
    }

  return dataset.map(_decode_and_project)
def __init__(self, saved_decoder_path: Text):
  """Loads the saved decoder and best-effort warms up its decode function.

  Args:
    saved_decoder_path: path to a decoder previously saved by
      `tf_graph_record_decoder.save_decoder`.
  """
  self.saved_decoder_path = saved_decoder_path
  decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
  self.output_type_specs = decoder.output_type_specs()
  # NOTE(review): this stores the (polymorphic) tf.function itself, not a
  # concrete function -- no `get_concrete_function()` is called here, so the
  # first real call may still trace. TF guarantees its thread-safety.
  self.decode_fn = decoder.decode_record
  # Call the function once to force tracing/optimization of the graph, as
  # we want that to be attributed as fixed setup cost. Some decoders may not
  # accept an empty (0-record) batch, so failures here are deliberately
  # ignored -- this warm-up is best-effort only.
  try:
    _ = self.decode_fn(tf.constant([], shape=[0], dtype=tf.string))
  except Exception:  # pylint:disable=broad-except
    pass
def testExecutorModuleFileNotProvided(self):
  """Runs the executor with `create_decoder_func` given as a fully-qualified
  name (no `module_file`), then checks a decoder can be loaded back.

  Fix: `Do()` expects `Dict[Text, List[Artifact]]`; the output artifact is
  now wrapped in a list, consistent with testExecutorModuleFileProvided.
  """
  input_dict = {}
  output = standard_artifacts.DataView()
  output.uri = os.path.join(self._output_data_dir, 'output_data_view')
  # Executor I/O dicts map keys to *lists* of artifacts.
  output_dict = {'data_view': [output]}
  exec_properties = {
      # Fully-qualified "<module>.<function>" path, so no module_file needed.
      'create_decoder_func':
          '%s.%s' % (data_view_module.create_simple_decoder.__module__,
                     data_view_module.create_simple_decoder.__name__),
  }
  executor = provider_executor.TfGraphDataViewProviderExecutor()
  executor.Do(input_dict, output_dict, exec_properties)
  loaded_decoder = tf_graph_record_decoder.load_decoder(output.uri)
  self.assertIsInstance(loaded_decoder,
                        tf_graph_record_decoder.TFGraphRecordDecoder)
def testExecutorModuleFileProvided(self):
  """Runs the executor with a `module_file` plus a bare function name, then
  checks a decoder can be loaded back from the output artifact's URI."""
  output = standard_artifacts.DataView()
  output.uri = os.path.join(self._output_data_dir, 'output_data_view')
  exec_properties = {
      'module_file':
          os.path.join(self._source_data_dir,
                       'module_file/data_view_module.py'),
      'create_decoder_func': 'create_simple_decoder',
  }
  executor = provider_executor.TfGraphDataViewProviderExecutor()
  executor.Do({}, {'data_view': [output]}, exec_properties)
  loaded_decoder = tf_graph_record_decoder.load_decoder(output.uri)
  self.assertIsInstance(loaded_decoder,
                        tf_graph_record_decoder.LoadedDecoder)
def __init__(self,
             saved_decoder_path: Text,
             telemetry_descriptors: List[Text],
             physical_format: Text,
             use_singleton_decoder: bool,
             raw_record_column_name: Optional[Text]):
  """Initializes the TFXIO from a saved decoder.

  Loads the decoder to derive the Arrow schema and tensor representations,
  and resolves which Arrow column (if any) carries the record index.

  Args:
    saved_decoder_path: path to a decoder previously saved by
      `tf_graph_record_decoder.save_decoder`.
    telemetry_descriptors: descriptors identifying the component that owns
      this TFXIO, for telemetry purposes.
    physical_format: the physical storage format of the records.
    use_singleton_decoder: if True, a process-wide singleton decoder is used
      downstream (this ctor only records the flag).
    raw_record_column_name: if not None, a column of this name containing
      the raw records will be appended to the record batches; must not
      collide with any column derived from the decoder's outputs.

  Raises:
    ValueError: if the record index tensor is neither a RaggedTensor nor a
      VarLenSparseTensor, or if `raw_record_column_name` collides with an
      existing column name.
  """
  super().__init__(
      telemetry_descriptors,
      logical_format="tensor",
      physical_format=physical_format,
      raw_record_column_name=raw_record_column_name)
  self._saved_decoder_path = saved_decoder_path
  # The decoder is loaded here only to introspect its output type specs;
  # actual decoding happens elsewhere.
  decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
  tensor_to_arrow_converter = tensor_to_arrow.TensorsToRecordBatchConverter(
      decoder.output_type_specs())
  self._arrow_schema_no_raw_record_column = (
      tensor_to_arrow_converter.arrow_schema())
  self._tensor_representations = (
      tensor_to_arrow_converter.tensor_representations())
  self._use_singleton_decoder = use_singleton_decoder
  # Map the decoder's record index tensor (if declared) to the name of the
  # Arrow column that will hold it.
  self._record_index_column_name = None
  record_index_tensor_name = decoder.record_index_tensor_name
  if record_index_tensor_name is not None:
    record_index_tensor_rep = self._tensor_representations[
        record_index_tensor_name]
    if record_index_tensor_rep.HasField("ragged_tensor"):
      # Only a flat (single-step) feature path can name a single column.
      assert len(record_index_tensor_rep.ragged_tensor.feature_path.step) == 1
      self._record_index_column_name = (
          record_index_tensor_rep.ragged_tensor.feature_path.step[0])
    elif record_index_tensor_rep.HasField("varlen_sparse_tensor"):
      self._record_index_column_name = (
          record_index_tensor_rep.varlen_sparse_tensor.column_name)
    else:
      raise ValueError("The record index tensor must be a RaggedTensor or a "
                       "VarLenSparseTensor, but got: {}"
                       .format(record_index_tensor_rep))
  if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
    raise ValueError("raw record column name: {} collided with an existing "
                     "column.".format(raw_record_column_name))
def __init__(self,
             saved_decoder_path: Text,
             telemetry_descriptors: List[Text],
             physical_format: Text,
             raw_record_column_name: Optional[Text]):
  """Initializes the TFXIO from a saved decoder.

  Loads the decoder once to derive the Arrow schema and the tensor
  representations of its outputs.
  """
  super(_RecordToTensorTFXIO, self).__init__(
      telemetry_descriptors,
      logical_format="tensor",
      physical_format=physical_format,
      raw_record_column_name=raw_record_column_name)
  self._saved_decoder_path = saved_decoder_path
  loaded_decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
  converter = tensor_to_arrow.TensorsToRecordBatchConverter(
      loaded_decoder.output_type_specs())
  self._arrow_schema_no_raw_record_column = converter.arrow_schema()
  self._tensor_representations = converter.tensor_representations()
  # The raw-record column is appended later; its name must be unique.
  if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
    raise ValueError(
        "raw record column name: {} collided with an existing "
        "column.".format(raw_record_column_name))
def setup(self):
  """Loads the saved decoder and builds the tensors-to-RecordBatch
  converter from its output type specs."""
  self._decoder = tf_graph_record_decoder.load_decoder(
      self._saved_decoder_path)
  converter = tensor_to_arrow.TensorsToRecordBatchConverter(
      self._decoder.output_type_specs())
  self._tensors_to_record_batch_converter = converter
def TensorFlowDataset(
    self,
    options: dataset_options.TensorFlowDatasetOptions) -> tf.data.Dataset:
  """Creates a TFRecordDataset that yields Tensors.

  The records are parsed by the decoder to create Tensors. This implementation
  is based on tf.data.experimental.ops.make_tf_record_dataset().

  See base class (tfxio.TFXIO) for more details.

  Args:
    options: an options object for the tf.data.Dataset. See
      `dataset_options.TensorFlowDatasetOptions` for more details.
      options.batch_size is the batch size of the input records, but if the
      input record and the output batched tensors by the decoder are not
      batch-aligned (i.e. 1 input record results in 1 "row" in the output
      tensors), then the output may not be of the given batch size. Use
      dataset.unbatch().batch(desired_batch_size) to force the output batch
      size.

  Returns:
    A dataset of `dict` elements, (or a tuple of `dict` elements and label).
    Each `dict` maps feature keys to `Tensor`, `SparseTensor`, or
    `RaggedTensor` objects.

  Raises:
    ValueError: if label_key in the dataset option is not in the arrow schema.
  """
  file_pattern = tf.convert_to_tensor(self._file_pattern)
  batch_size = options.batch_size
  drop_final_batch = options.drop_final_batch
  num_epochs = options.num_epochs
  shuffle = options.shuffle
  shuffle_buffer_size = options.shuffle_buffer_size
  shuffle_seed = options.shuffle_seed
  label_key = options.label_key

  compression_type = record_based_tfxio.DetectCompressionType(file_pattern)
  decoder = tf_graph_record_decoder.load_decoder(self._saved_decoder_path)

  def _ParseFn(record):
    # TODO(andylou): Change this once we plumb the projected columns into the
    # decoder itself.
    tensors_dict = decoder.decode_record(record)
    # Keep only tensors that have a TensorRepresentation.
    return {
        k: v
        for k, v in tensors_dict.items()
        if k in self._tensor_representations
    }

  dataset = tf.data.Dataset.list_files(
      file_pattern, shuffle=shuffle, seed=shuffle_seed)

  dataset = dataset.interleave(
      lambda filename: tf.data.TFRecordDataset(filename, compression_type),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  if shuffle:
    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
  if num_epochs != 1:
    dataset = dataset.repeat(num_epochs)
  # An unknown number of epochs implies an infinite dataset, so partial final
  # batches never materialize and may safely be dropped.
  drop_final_batch = drop_final_batch or num_epochs is None
  dataset = dataset.batch(batch_size, drop_remainder=drop_final_batch)

  dataset = dataset.map(_ParseFn)

  if label_key is not None:
    if label_key not in self.TensorRepresentations():
      # Fix: the two string literals used to concatenate into
      # "...following tensorsnames..."; a separating space was missing.
      raise ValueError(
          "The `label_key` provided ({}) must be one of the following tensor "
          "names: {}.".format(label_key, self.TensorRepresentations().keys()))
    dataset = dataset.map(lambda x: (x, x.pop(label_key)))

  return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)