Code example #1
    def test_unable_to_handle(self):
        with self.assertRaisesRegex(ValueError, "No handler found"):
            tensor_to_arrow.TensorsToRecordBatchConverter(
                {"sp": tf.SparseTensorSpec([None, None, None], tf.int32)})

        with self.assertRaisesRegex(ValueError, "No handler found"):
            tensor_to_arrow.TensorsToRecordBatchConverter(
                {"sp": tf.SparseTensorSpec([None, None], tf.bool)})
Code example #2
File: impl_helper.py  Project: tensorflow/transform
def make_tensor_to_arrow_converter(
    schema: schema_pb2.Schema
) -> tensor_to_arrow.TensorsToRecordBatchConverter:
    """Constructs a `tf.Tensor` to `pa.RecordBatch` converter."""
    feature_specs = schema_utils.schema_as_feature_spec(schema).feature_spec
    type_specs = get_type_specs_from_feature_specs(feature_specs)
    return tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
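A minimal usage sketch for the function above (not from the project source; the feature spec and import paths are assumptions): build a schema from a feature spec, then convert a batch of dense tensors into an Arrow RecordBatch.

import tensorflow as tf
from tensorflow_transform import impl_helper
from tensorflow_transform.tf_metadata import schema_utils

# Hypothetical schema: one fixed-length int64 feature of width 2.
schema = schema_utils.schema_from_feature_spec(
    {"x": tf.io.FixedLenFeature([2], tf.int64)})
converter = impl_helper.make_tensor_to_arrow_converter(schema)
# convert() expects batch-first tensors matching the inferred type specs.
record_batch = converter.convert(
    {"x": tf.constant([[1, 2], [3, 4]], dtype=tf.int64)})
print(record_batch.schema)  # Expected: a single list-like int64 column "x".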
Code example #3
def test_incompatible_type_spec(self):
    converter = tensor_to_arrow.TensorsToRecordBatchConverter(
        {"sp": tf.SparseTensorSpec([None, None], tf.int32)})
    with self.assertRaisesRegex(TypeError, "Expected SparseTensorSpec"):
        converter.convert({
            "sp":
            tf.SparseTensor(indices=[[0, 1]],
                            values=tf.constant([0], dtype=tf.int64),
                            dense_shape=[4, 1])
        })
Code example #4
    def test_unable_to_handle_ragged(self):
        # This case is for a value tensor of bool type
        with self.assertRaisesRegex(ValueError, "No handler found"):
            tensor_to_arrow.TensorsToRecordBatchConverter({
                "sp":
                tf.RaggedTensorSpec(shape=[2, None, None],
                                    dtype=tf.bool,
                                    ragged_rank=2,
                                    row_splits_dtype=tf.int64)
            })

        # This case is for a 2D leaf values tensor.
        with self.assertRaisesRegex(ValueError, "No handler found"):
            tensor_to_arrow.TensorsToRecordBatchConverter({
                "sp":
                tf.RaggedTensorSpec(shape=[2, None, None],
                                    dtype=tf.int32,
                                    ragged_rank=1,
                                    row_splits_dtype=tf.int64)
            })
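For contrast, a sketch of a ragged spec the converter is expected to accept (an assumption inferred from the failing cases above, not part of the test): non-bool values and a 1D leaf values tensor, i.e. ragged_rank covering every non-batch dimension.

import tensorflow as tf
from tfx_bsl.tfxio import tensor_to_arrow

# Same shape pattern as above, but int64 values and a 1D leaf;
# no "No handler found" error is expected here.
converter = tensor_to_arrow.TensorsToRecordBatchConverter({
    "rt": tf.RaggedTensorSpec(shape=[2, None],
                              dtype=tf.int64,
                              ragged_rank=1,
                              row_splits_dtype=tf.int64)
})
print(converter.arrow_schema())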
Code example #5
def setup(self):
  start = datetime.datetime.now()
  if self._shared_decode_fn_handle is not None:
    decode_fn_wrapper = self._shared_decode_fn_handle.acquire(
        lambda: _DecodeFnWrapper(self._saved_decoder_path))
    assert decode_fn_wrapper.saved_decoder_path == self._saved_decoder_path
  else:
    decode_fn_wrapper = _DecodeFnWrapper(self._saved_decoder_path)
  self._tensors_to_record_batch_converter = (
      tensor_to_arrow.TensorsToRecordBatchConverter(
          decode_fn_wrapper.output_type_specs))
  self._decode_fn = decode_fn_wrapper.decode_fn
  self._decoder_load_seconds = int(
      (datetime.datetime.now() - start).total_seconds())
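The shared-handle branch above appears to follow Beam's shared.Shared pattern: acquire() returns a per-process singleton built by the constructor lambda, so parallel instances reuse one loaded decoder rather than each loading the saved model, and the assert guards against acquiring a wrapper built for a different decoder path.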
Code example #6
  def __init__(self,
               saved_decoder_path: Text,
               telemetry_descriptors: List[Text],
               physical_format: Text,
               use_singleton_decoder: bool,
               raw_record_column_name: Optional[Text]):
    super().__init__(
        telemetry_descriptors,
        logical_format="tensor",
        physical_format=physical_format,
        raw_record_column_name=raw_record_column_name)
    self._saved_decoder_path = saved_decoder_path
    decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
    tensor_to_arrow_converter = tensor_to_arrow.TensorsToRecordBatchConverter(
        decoder.output_type_specs())

    self._arrow_schema_no_raw_record_column = (
        tensor_to_arrow_converter.arrow_schema())
    self._tensor_representations = (
        tensor_to_arrow_converter.tensor_representations())
    self._use_singleton_decoder = use_singleton_decoder

    self._record_index_column_name = None
    record_index_tensor_name = decoder.record_index_tensor_name
    if record_index_tensor_name is not None:
      record_index_tensor_rep = self._tensor_representations[
          record_index_tensor_name]
      if record_index_tensor_rep.HasField("ragged_tensor"):
        assert len(record_index_tensor_rep.ragged_tensor.feature_path.step) == 1
        self._record_index_column_name = (
            record_index_tensor_rep.ragged_tensor.feature_path.step[0])
      elif record_index_tensor_rep.HasField("varlen_sparse_tensor"):
        self._record_index_column_name = (
            record_index_tensor_rep.varlen_sparse_tensor.column_name)
      else:
        raise ValueError("The record index tensor must be a RaggedTensor or a "
                         "VarLenSparseTensor, but got: {}"
                         .format(record_index_tensor_rep))

    if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
      raise ValueError("raw record column name: {} collided with an existing "
                       "column.".format(raw_record_column_name))
Code example #7
def test_relaxed_varlen_sparse_tensor(self):
    # Demonstrates that TensorAdapter(TensorsToRecordBatchConverter()) is not
    # an identity if the second dense dimension of SparseTensor is not tight.
    type_specs = {"sp": tf.SparseTensorSpec([None, None], tf.int32)}
    sp = tf.compat.v1.SparseTensorValue(values=np.array([1, 2], np.int32),
                                        indices=[[0, 0], [2, 0]],
                                        dense_shape=[4, 2])
    if tf.__version__ >= "2":
        sp = tf.SparseTensor.from_value(sp)
    converter = tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
    rb = converter.convert({"sp": sp})
    adapter = tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            arrow_schema=converter.arrow_schema(),
            tensor_representations=converter.tensor_representations()))
    adapter_output = adapter.ToBatchTensors(
        rb, produce_eager_tensors=tf.__version__ >= "2")
    self.assertAllEqual(sp.values, adapter_output["sp"].values)
    self.assertAllEqual(sp.indices, adapter_output["sp"].indices)
    self.assertAllEqual(adapter_output["sp"].dense_shape, [4, 1])
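The dense_shape shrinks from [4, 2] to [4, 1] on the round trip because the RecordBatch stores only the values present in each row; when the TensorAdapter rebuilds the SparseTensor, it infers the second dimension as the tightest bound (the longest row, here 1), which is why the composition is not an identity when the original shape is not tight.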
Code example #8
    def __init__(self, saved_decoder_path: Text,
                 telemetry_descriptors: List[Text], physical_format: Text,
                 raw_record_column_name: Optional[Text]):

        super(_RecordToTensorTFXIO,
              self).__init__(telemetry_descriptors,
                             logical_format="tensor",
                             physical_format=physical_format,
                             raw_record_column_name=raw_record_column_name)
        self._saved_decoder_path = saved_decoder_path
        decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
        tensor_to_arrow_converter = tensor_to_arrow.TensorsToRecordBatchConverter(
            decoder.output_type_specs())
        self._arrow_schema_no_raw_record_column = (
            tensor_to_arrow_converter.arrow_schema())
        self._tensor_representations = (
            tensor_to_arrow_converter.tensor_representations())
        if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
            raise ValueError(
                "raw record column name: {} collided with an existing "
                "column.".format(raw_record_column_name))
Code example #9
        def convert_and_check(tensors, test_values_conversion):
            converter = tensor_to_arrow.TensorsToRecordBatchConverter(
                type_specs, options)

            self.assertEqual(
                {f.name: f.type
                 for f in converter.arrow_schema()}, expected_schema,
                "actual: {}".format(converter.arrow_schema()))

            canonical_expected_tensor_representations = {}
            for n, r in expected_tensor_representations.items():
                if not isinstance(r, schema_pb2.TensorRepresentation):
                    r = text_format.Parse(r, schema_pb2.TensorRepresentation())
                canonical_expected_tensor_representations[n] = r

            self.assertEqual(canonical_expected_tensor_representations,
                             converter.tensor_representations())

            rb = converter.convert(tensors)
            self.assertLen(expected_record_batch, rb.num_columns)
            for i, column in enumerate(rb):
                expected = expected_record_batch[rb.schema[i].name]
                self.assertTrue(
                    column.equals(expected),
                    "{}: actual: {}, expected: {}".format(
                        rb.schema[i].name, column, expected))
            # Test that TensorAdapter(TensorsToRecordBatchConverter()) is identity.
            adapter = tensor_adapter.TensorAdapter(
                tensor_adapter.TensorAdapterConfig(
                    arrow_schema=converter.arrow_schema(),
                    tensor_representations=converter.tensor_representations()))
            adapter_output = adapter.ToBatchTensors(
                rb, produce_eager_tensors=not test_values_conversion)
            self.assertEqual(adapter_output.keys(), tensors.keys())
            for k in adapter_output.keys():
                if "value" not in k:
                    self._assert_tensor_alike_equal(adapter_output[k],
                                                    tensors[k])
Code example #10
    def test_convert(self, type_specs, expected_schema,
                     expected_tensor_representations, tensor_input,
                     expected_record_batch):
        converter = tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)

        expected_schema = pa.schema(
            [pa.field(n, t) for n, t in sorted(expected_schema.items())])

        self.assertTrue(converter.arrow_schema().equals(expected_schema),
                        "actual: {}".format(converter.arrow_schema()))

        canonical_expected_tensor_representations = {}
        for n, r in expected_tensor_representations.items():
            if not isinstance(r, schema_pb2.TensorRepresentation):
                r = text_format.Parse(r, schema_pb2.TensorRepresentation())
            canonical_expected_tensor_representations[n] = r

        self.assertEqual(canonical_expected_tensor_representations,
                         converter.tensor_representations())

        rb = converter.convert(tensor_input)
        self.assertTrue(
            rb.equals(
                pa.record_batch(
                    [arr for _, arr in sorted(expected_record_batch.items())],
                    schema=expected_schema)))

        # Test that TensorAdapter(TensorsToRecordBatchConverter()) is identity.
        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(
                arrow_schema=converter.arrow_schema(),
                tensor_representations=converter.tensor_representations()))
        adapter_output = adapter.ToBatchTensors(rb, produce_eager_tensors=True)
        self.assertEqual(adapter_output.keys(), tensor_input.keys())
        for k in adapter_output.keys():
            self._assert_tensor_alike_equal(adapter_output[k], tensor_input[k])
Code example #11
def make_tensor_to_arrow_converter(
    schema: schema_pb2.Schema
) -> tensor_to_arrow.TensorsToRecordBatchConverter:
    """Constructs a `tf.Tensor` to `pa.RecordBatch` converter."""
    type_specs = {}
    feature_specs = schema_utils.schema_as_feature_spec(schema).feature_spec
    for name, feature_spec in feature_specs.items():
        if isinstance(feature_spec, tf.io.FixedLenFeature):
            type_specs[name] = tf.TensorSpec([None] + list(feature_spec.shape),
                                             feature_spec.dtype)
        elif isinstance(feature_spec, tf.io.VarLenFeature):
            type_specs[name] = tf.SparseTensorSpec([None, None],
                                                   feature_spec.dtype)
        elif isinstance(feature_spec, tf.io.SparseFeature):
            # `TensorsToRecordBatchConverter` ignores `SparseFeature`s since arbitrary
            # `SparseTensor`s are not yet supported. They are handled in
            # `convert_to_arrow`.
            # TODO(b/181868576): Handle `SparseFeature`s by the converter once the
            # support is implemented.
            pass
        else:
            raise ValueError('Invalid feature spec {}.'.format(feature_spec))

    return tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
Code example #12
def setup(self):
    self._decoder = tf_graph_record_decoder.load_decoder(
        self._saved_decoder_path)
    self._tensors_to_record_batch_converter = (
        tensor_to_arrow.TensorsToRecordBatchConverter(
            self._decoder.output_type_specs()))
Code example #13
def test_unable_to_handle_ragged(self, spec):
    with self.assertRaisesRegex(ValueError, "No handler found"):
        tensor_to_arrow.TensorsToRecordBatchConverter({"rt": spec})
Code example #14
  def process(self, batched_extract: types.Extracts) -> List[types.Extracts]:
    features = batched_extract[constants.FEATURES_KEY]
    # Slice on transformed features if available.
    if (constants.TRANSFORMED_FEATURES_KEY in batched_extract and
        batched_extract[constants.TRANSFORMED_FEATURES_KEY] is not None):
      transformed_features = batched_extract[constants.TRANSFORMED_FEATURES_KEY]
      # If only one model, the output is stored without keying on model name.
      if not self._eval_config or len(self._eval_config.model_specs) == 1:
        features.update(transformed_features)
      else:
        # Models listed earlier have precedence in feature lookup.
        for spec in reversed(self._eval_config.model_specs):
          if spec.name in transformed_features:
            features.update(transformed_features[spec.name])

    tensors = util.to_tensorflow_tensors(features)
    tensor_specs = util.infer_tensor_specs(tensors)

    if _TF_MAJOR_VERSION < 2:
      # TODO(b/228456048): TFX-BSL doesn't support passing tensorflow tensors
      # for non-sparse/ragged values in TF 1.x (i.e. it only accepts np.ndarray
      # for dense) so we need to convert dense tensors to numpy.
      sess = tf.compat.v1.Session()

      def _convert_dense_to_numpy(values):  # pylint: disable=invalid-name
        if isinstance(values, Mapping):
          for k, v in values.items():
            if isinstance(v, Mapping):
              values[k] = _convert_dense_to_numpy(v)
            elif isinstance(v, tf.Tensor):
              values[k] = v.eval(session=sess)
        return values

      tensors = _convert_dense_to_numpy(tensors)

    converter = tensor_to_arrow.TensorsToRecordBatchConverter(tensor_specs)
    record_batch = converter.convert(tensors)
    sql_slice_keys = [[] for _ in range(record_batch.num_rows)]

    for query in self._cached_queries(record_batch.schema):
      # Example of result with batch size = 3:
      # result = [[[('feature', 'value_1')]],
      #           [[('feature', 'value_2')]],
      #           []
      #          ]
      result = query.Execute(record_batch)
      for row_index, row_result in enumerate(result):
        sql_slice_keys[row_index].extend([tuple(s) for s in row_result])

    # Convert sql_slice_keys into a VarLenTensorValue where each row has dtype
    # object.
    dense_rows = []
    for row_slice_keys in sql_slice_keys:
      dense_rows.append(slicer_lib.slice_keys_to_numpy_array(row_slice_keys))
    varlen_sql_slice_keys = types.VarLenTensorValue.from_dense_rows(dense_rows)

    # Make a shallow copy, so we don't mutate the original.
    batched_extract_copy = copy.copy(batched_extract)
    batched_extract_copy[constants.SLICE_KEY_TYPES_KEY] = varlen_sql_slice_keys

    self._sql_slicer_num_record_batch_schemas.update(
        self._cached_queries.cache_info().currsize)

    return [batched_extract_copy]
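In outline, this DoFn merges transformed features into the feature dict, converts the result to an Arrow RecordBatch with TensorsToRecordBatchConverter, evaluates each cached SQL query against the batch to collect per-row slice keys, and attaches them to a shallow copy of the extract as a VarLenTensorValue.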