Example #1
0
    def test_save_load_decode(self):
        """Round-trips a decoder that names a record index tensor."""
        decoder = _DecoderForTestWithRecordIndexTensorName()
        expected_specs = {
            "sparse_tensor":
                tf.SparseTensorSpec(shape=[None, None], dtype=tf.string),
            "ragged_tensor":
                tf.RaggedTensorSpec(
                    shape=[None, None], dtype=tf.string, ragged_rank=1),
            "record_index":
                tf.RaggedTensorSpec(
                    shape=[None, None], dtype=tf.int64, ragged_rank=1),
            "dense_tensor":
                tf.TensorSpec(shape=[None], dtype=tf.string),
        }
        self.assertEqual(decoder.output_type_specs(), expected_specs)
        self.assertEqual(decoder.record_index_tensor_name, "record_index")
        tf_graph_record_decoder.save_decoder(decoder, self._tmp_dir)
        loaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
        self.assertEqual(loaded.record_index_tensor_name, "record_index")

        self._assert_type_specs_equal(decoder.output_type_specs(),
                                      loaded.output_type_specs())

        records = [b"abc", b"def"]
        decoded = loaded.decode_record(records)
        self.assertLen(decoded, len(loaded.output_type_specs()))
        self.assertIn("sparse_tensor", decoded)
        sparse = decoded["sparse_tensor"]
        self.assertAllEqual(sparse.values, records)
        self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
        self.assertAllEqual(sparse.dense_shape, [2, 1])

        self.assertAllEqual(decoded["ragged_tensor"],
                            tf.ragged.constant([[b"abc"], [b"def"]]))

        self.assertAllEqual(decoded["record_index"],
                            tf.ragged.constant([[0], [1]]))

        self.assertAllEqual(decoded["dense_tensor"], records)

        # .record_index_tensor_name must also be readable in graph
        # (non-eager) mode.
        with tf.compat.v1.Graph().as_default():
            self.assertFalse(tf.executing_eagerly())
            graph_loaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
            self.assertEqual(graph_loaded.record_index_tensor_name,
                             "record_index")

        # The decoder's own `save` method must behave like the module-level
        # `save_decoder`.
        second_path = os.path.join(self._tmp_dir, "decoder_2")
        decoder.save(second_path)
        reloaded = tf_graph_record_decoder.load_decoder(second_path)
        self.assertEqual(reloaded.record_index_tensor_name, "record_index")
Example #2
0
    def test_no_record_index_tensor_name(self):
        """A decoder without a record index reports None, before and after reload."""
        decoder = _DecoderForTesting()
        self.assertIsNone(decoder.record_index_tensor_name)

        tf_graph_record_decoder.save_decoder(decoder, self._tmp_dir)
        reloaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
        self.assertIsNone(reloaded.record_index_tensor_name)

        # The property must also be accessible in graph (non-eager) mode.
        with tf.compat.v1.Graph().as_default():
            self.assertFalse(tf.executing_eagerly())
            graph_loaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)
            self.assertIsNone(graph_loaded.record_index_tensor_name)
    def test_save_load_decode(self):
        """Round-trips a plain decoder and checks the decoded outputs."""
        decoder = _DecoderForTesting()
        expected_specs = {
            "sparse_tensor":
                tf.SparseTensorSpec(shape=[None, None], dtype=tf.string),
            "ragged_tensor":
                tf.RaggedTensorSpec(
                    shape=[None, None], dtype=tf.string, ragged_rank=1),
        }
        self.assertEqual(decoder.output_type_specs(), expected_specs)
        tf_graph_record_decoder.save_decoder(decoder, self._tmp_dir)
        loaded = tf_graph_record_decoder.load_decoder(self._tmp_dir)

        self.assertEqual(decoder.output_type_specs(),
                         loaded.output_type_specs())
        decoded = loaded.decode_record([b"abc", b"def"])
        self.assertLen(decoded, len(loaded.output_type_specs()))
        self.assertIn("sparse_tensor", decoded)
        sparse = decoded["sparse_tensor"]
        self.assertAllEqual(sparse.values, [b"abc", b"def"])
        self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
        self.assertAllEqual(sparse.dense_shape, [2, 1])

        self.assertAllEqual(decoded["ragged_tensor"],
                            tf.ragged.constant([[b"abc"], [b"def"]]))
  def DecodeFunction(self) -> Callable[[tf.Tensor], Dict[Text, Any]]:
    """Returns the decode function of the saved decoder.

    The decoder is (re)loaded from `self._saved_decoder_path` on every call.

    Returns:
      A TF function that takes a 1-D string tensor and returns a dict from
      strings to (composite) tensors.
    """
    return tf_graph_record_decoder.load_decoder(
        self._saved_decoder_path).decode_record
Example #5
0
 def __init__(self, saved_decoder_path: Text):
   """Loads the saved decoder and warms up its concrete decode function."""
   self.saved_decoder_path = saved_decoder_path
   loaded = tf_graph_record_decoder.load_decoder(saved_decoder_path)
   self.output_type_specs = loaded.output_type_specs()
   # Pin the concrete function so later calls never re-trace. TF guarantees
   # its thread-safety.
   self.decode_fn = loaded.decode_record.get_concrete_function()
   # Decode once up front so graph optimization is attributed as fixed setup
   # cost. NOTE(review): this assumes a batch holding a single empty-string
   # record decodes successfully -- confirm for new decoder types.
   _ = self.decode_fn(tf.convert_to_tensor([""], dtype=tf.string))
Example #6
0
  def _ApplyDecoderToDataset(
      self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Decodes each element of `dataset`, keeping only represented tensors."""
    loaded_decoder = tf_graph_record_decoder.load_decoder(
        self._saved_decoder_path)

    def _decode_and_project(record):
      # Drop decoder outputs that have no tensor representation; the lookup
      # stays inside the mapped function so it happens at tracing time.
      decoded = loaded_decoder.decode_record(record)
      wanted = self.TensorRepresentations()
      return {name: tensor
              for name, tensor in decoded.items()
              if name in wanted}

    return dataset.map(_decode_and_project)
 def __init__(self, saved_decoder_path: Text):
   """Loads the saved decoder and primes its decode function, best-effort."""
   self.saved_decoder_path = saved_decoder_path
   loaded = tf_graph_record_decoder.load_decoder(saved_decoder_path)
   self.output_type_specs = loaded.output_type_specs()
   # Keep the decode tf.function around for later calls. TF guarantees its
   # thread-safety.
   self.decode_fn = loaded.decode_record
   # Warm up with an empty (0-record) string batch so graph optimization is
   # attributed as fixed setup cost. Failures are deliberately swallowed:
   # some decoders may not accept empty input, and this is only a warm-up.
   try:
     _ = self.decode_fn(tf.constant([], shape=[0], dtype=tf.string))
   except Exception:  # pylint:disable=broad-except
     pass
Example #8
0
 def testExecutorModuleFileNotProvided(self):
     """Executor resolves the decoder factory from its fully-qualified name."""
     data_view = standard_artifacts.DataView()
     data_view.uri = os.path.join(self._output_data_dir, 'output_data_view')
     qualified_func_name = '%s.%s' % (
         data_view_module.create_simple_decoder.__module__,
         data_view_module.create_simple_decoder.__name__)
     executor = provider_executor.TfGraphDataViewProviderExecutor()
     executor.Do({}, {'data_view': data_view},
                 {'create_decoder_func': qualified_func_name})
     reloaded = tf_graph_record_decoder.load_decoder(data_view.uri)
     self.assertIsInstance(reloaded,
                           tf_graph_record_decoder.TFGraphRecordDecoder)
Example #9
0
 def testExecutorModuleFileProvided(self):
   """Executor loads the decoder factory from a user-supplied module file."""
   data_view = standard_artifacts.DataView()
   data_view.uri = os.path.join(self._output_data_dir, 'output_data_view')
   exec_properties = {
       'module_file': os.path.join(self._source_data_dir,
                                   'module_file/data_view_module.py'),
       'create_decoder_func': 'create_simple_decoder',
   }
   executor = provider_executor.TfGraphDataViewProviderExecutor()
   executor.Do({}, {'data_view': [data_view]}, exec_properties)
   reloaded = tf_graph_record_decoder.load_decoder(data_view.uri)
   self.assertIsInstance(
       reloaded, tf_graph_record_decoder.LoadedDecoder)
Example #10
0
  def __init__(self,
               saved_decoder_path: Text,
               telemetry_descriptors: List[Text],
               physical_format: Text,
               use_singleton_decoder: bool,
               raw_record_column_name: Optional[Text]):
    """Initializes the TFXIO from a saved record decoder.

    Args:
      saved_decoder_path: path to the saved TFGraphRecordDecoder whose output
        type specs define the Arrow schema and tensor representations.
      telemetry_descriptors: descriptors identifying this component, forwarded
        to the base class for telemetry.
      physical_format: physical storage format of the records, forwarded to
        the base class for telemetry.
      use_singleton_decoder: stored on the instance; presumably controls
        whether a shared decoder instance is used elsewhere in this class --
        TODO confirm against the consuming code.
      raw_record_column_name: if not None, name reserved for a raw-record
        column; must not collide with any column the decoder produces.

    Raises:
      ValueError: if the decoder's record index tensor is represented by
        something other than a RaggedTensor or VarLenSparseTensor, or if
        `raw_record_column_name` collides with a decoder-produced column.
    """
    super().__init__(
        telemetry_descriptors,
        logical_format="tensor",
        physical_format=physical_format,
        raw_record_column_name=raw_record_column_name)
    self._saved_decoder_path = saved_decoder_path
    decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
    # The converter derives both the Arrow schema and the tensor
    # representations from the decoder's output type specs.
    tensor_to_arrow_converter = tensor_to_arrow.TensorsToRecordBatchConverter(
        decoder.output_type_specs())

    self._arrow_schema_no_raw_record_column = (
        tensor_to_arrow_converter.arrow_schema())
    self._tensor_representations = (
        tensor_to_arrow_converter.tensor_representations())
    self._use_singleton_decoder = use_singleton_decoder

    # If the decoder designates a record index tensor, resolve the Arrow
    # column name that backs it from its tensor representation.
    self._record_index_column_name = None
    record_index_tensor_name = decoder.record_index_tensor_name
    if record_index_tensor_name is not None:
      record_index_tensor_rep = self._tensor_representations[
          record_index_tensor_name]
      if record_index_tensor_rep.HasField("ragged_tensor"):
        # Only a single-step feature path (one backing column) is supported.
        assert len(record_index_tensor_rep.ragged_tensor.feature_path.step) == 1
        self._record_index_column_name = (
            record_index_tensor_rep.ragged_tensor.feature_path.step[0])
      elif record_index_tensor_rep.HasField("varlen_sparse_tensor"):
        self._record_index_column_name = (
            record_index_tensor_rep.varlen_sparse_tensor.column_name)
      else:
        raise ValueError("The record index tensor must be a RaggedTensor or a "
                         "VarLenSparseTensor, but got: {}"
                         .format(record_index_tensor_rep))

    # The raw-record column name must be distinct from every decoder-produced
    # column.
    if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
      raise ValueError("raw record column name: {} collided with an existing "
                       "column.".format(raw_record_column_name))
Example #11
0
    def __init__(self, saved_decoder_path: Text,
                 telemetry_descriptors: List[Text], physical_format: Text,
                 raw_record_column_name: Optional[Text]):
        """Initializes the TFXIO from a saved record decoder.

        The decoder's output type specs determine both the Arrow schema and
        the tensor representations. Raises ValueError if
        `raw_record_column_name` collides with a decoder-produced column.
        """
        super(_RecordToTensorTFXIO, self).__init__(
            telemetry_descriptors,
            logical_format="tensor",
            physical_format=physical_format,
            raw_record_column_name=raw_record_column_name)
        self._saved_decoder_path = saved_decoder_path
        loaded_decoder = tf_graph_record_decoder.load_decoder(
            saved_decoder_path)
        converter = tensor_to_arrow.TensorsToRecordBatchConverter(
            loaded_decoder.output_type_specs())
        self._arrow_schema_no_raw_record_column = converter.arrow_schema()
        self._tensor_representations = converter.tensor_representations()
        # The raw-record column must not clash with any existing column.
        if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
            raise ValueError(
                "raw record column name: {} collided with an existing "
                "column.".format(raw_record_column_name))
Example #12
0
 def setup(self):
     """One-time setup: load the saved decoder and build its Arrow converter."""
     decoder = tf_graph_record_decoder.load_decoder(self._saved_decoder_path)
     self._decoder = decoder
     self._tensors_to_record_batch_converter = (
         tensor_to_arrow.TensorsToRecordBatchConverter(
             decoder.output_type_specs()))
Example #13
0
    def TensorFlowDataset(
            self, options: dataset_options.TensorFlowDatasetOptions
    ) -> tf.data.Dataset:
        """Creates a TFRecordDataset that yields Tensors.

        The records are parsed by the decoder to create Tensors. This
        implementation is based on
        tf.data.experimental.ops.make_tf_record_dataset().

        See base class (tfxio.TFXIO) for more details.

        Args:
          options: an options object for the tf.data.Dataset. See
            `dataset_options.TensorFlowDatasetOptions` for more details.
            options.batch_size is the batch size of the input records, but if
            the input record and the output batched tensors by the decoder
            are not batch-aligned (i.e. 1 input record does not result in
            exactly 1 "row" in the output tensors), then the output may not
            be of the given batch size. Use
            dataset.unbatch().batch(desired_batch_size) to force the output
            batch size.

        Returns:
          A dataset of `dict` elements, (or a tuple of `dict` elements and
          label). Each `dict` maps feature keys to `Tensor`, `SparseTensor`,
          or `RaggedTensor` objects.

        Raises:
          ValueError: if label_key in the dataset option is not in the arrow
            schema.
        """
        file_pattern = tf.convert_to_tensor(self._file_pattern)
        batch_size = options.batch_size
        drop_final_batch = options.drop_final_batch
        num_epochs = options.num_epochs
        shuffle = options.shuffle
        shuffle_buffer_size = options.shuffle_buffer_size
        shuffle_seed = options.shuffle_seed
        label_key = options.label_key
        compression_type = record_based_tfxio.DetectCompressionType(
            file_pattern)

        decoder = tf_graph_record_decoder.load_decoder(
            self._saved_decoder_path)

        def _ParseFn(record):
            # TODO(andylou): Change this once we plumb the projected columns into the
            # decoder itself.
            # Keep only decoder outputs that have a tensor representation.
            tensors_dict = decoder.decode_record(record)
            return {
                k: v
                for k, v in tensors_dict.items()
                if k in self._tensor_representations
            }

        dataset = tf.data.Dataset.list_files(file_pattern,
                                             shuffle=shuffle,
                                             seed=shuffle_seed)

        # Read the matched files concurrently; the second positional argument
        # of TFRecordDataset is compression_type.
        dataset = dataset.interleave(
            lambda filename: tf.data.TFRecordDataset(filename, compression_type
                                                     ),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if shuffle:
            dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
        if num_epochs != 1:
            dataset = dataset.repeat(num_epochs)

        # With infinite epochs the final partial batch never "ends", so it is
        # always dropped.
        drop_final_batch = drop_final_batch or num_epochs is None

        dataset = dataset.batch(batch_size, drop_remainder=drop_final_batch)
        dataset = dataset.map(_ParseFn)

        if label_key is not None:
            if label_key not in self.TensorRepresentations():
                # Fixed message: the two literals previously concatenated to
                # "tensorsnames".
                raise ValueError(
                    "The `label_key` provided ({}) must be one of the following "
                    "tensor names: {}.".format(
                        label_key, self.TensorRepresentations().keys()))
            # Split the label out of the feature dict: (features, label).
            dataset = dataset.map(lambda x: (x, x.pop(label_key)))

        return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)