def test_unable_to_handle(self):
  with self.assertRaisesRegex(ValueError, "No handler found"):
    tensor_to_arrow.TensorsToRecordBatchConverter(
        {"sp": tf.SparseTensorSpec([None, None, None], tf.int32)})
  with self.assertRaisesRegex(ValueError, "No handler found"):
    tensor_to_arrow.TensorsToRecordBatchConverter(
        {"sp": tf.SparseTensorSpec([None, None], tf.bool)})
def make_tensor_to_arrow_converter(
    schema: schema_pb2.Schema
) -> tensor_to_arrow.TensorsToRecordBatchConverter:
  """Constructs a `tf.Tensor` to `pa.RecordBatch` converter."""
  feature_specs = schema_utils.schema_as_feature_spec(schema).feature_spec
  type_specs = get_type_specs_from_feature_specs(feature_specs)
  return tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
def test_incompatible_type_spec(self):
  converter = tensor_to_arrow.TensorsToRecordBatchConverter(
      {"sp": tf.SparseTensorSpec([None, None], tf.int32)})
  with self.assertRaisesRegex(TypeError, "Expected SparseTensorSpec"):
    # The values dtype (int64) does not match the spec's dtype (int32).
    converter.convert({
        "sp": tf.SparseTensor(
            indices=[[0, 1]],
            values=tf.constant([0], dtype=tf.int64),
            dense_shape=[4, 1])
    })
def test_unable_to_handle_ragged(self):
  # This case is for a value tensor of bool type.
  with self.assertRaisesRegex(ValueError, "No handler found"):
    tensor_to_arrow.TensorsToRecordBatchConverter({
        "sp": tf.RaggedTensorSpec(
            shape=[2, None, None],
            dtype=tf.bool,
            ragged_rank=2,
            row_splits_dtype=tf.int64)
    })
  # This case is for a 2-D leaf values tensor.
  with self.assertRaisesRegex(ValueError, "No handler found"):
    tensor_to_arrow.TensorsToRecordBatchConverter({
        "sp": tf.RaggedTensorSpec(
            shape=[2, None, None],
            dtype=tf.int32,
            ragged_rank=1,
            row_splits_dtype=tf.int64)
    })
def setup(self):
  start = datetime.datetime.now()
  if self._shared_decode_fn_handle is not None:
    decode_fn_wrapper = self._shared_decode_fn_handle.acquire(
        lambda: _DecodeFnWrapper(self._saved_decoder_path))
    assert decode_fn_wrapper.saved_decoder_path == self._saved_decoder_path
  else:
    decode_fn_wrapper = _DecodeFnWrapper(self._saved_decoder_path)
  self._tensors_to_record_batch_converter = (
      tensor_to_arrow.TensorsToRecordBatchConverter(
          decode_fn_wrapper.output_type_specs))
  self._decode_fn = decode_fn_wrapper.decode_fn
  self._decoder_load_seconds = int(
      (datetime.datetime.now() - start).total_seconds())
def __init__(self, saved_decoder_path: Text,
             telemetry_descriptors: List[Text],
             physical_format: Text,
             use_singleton_decoder: bool,
             raw_record_column_name: Optional[Text]):
  super().__init__(
      telemetry_descriptors,
      logical_format="tensor",
      physical_format=physical_format,
      raw_record_column_name=raw_record_column_name)
  self._saved_decoder_path = saved_decoder_path
  decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
  tensor_to_arrow_converter = tensor_to_arrow.TensorsToRecordBatchConverter(
      decoder.output_type_specs())
  self._arrow_schema_no_raw_record_column = (
      tensor_to_arrow_converter.arrow_schema())
  self._tensor_representations = (
      tensor_to_arrow_converter.tensor_representations())
  self._use_singleton_decoder = use_singleton_decoder

  self._record_index_column_name = None
  record_index_tensor_name = decoder.record_index_tensor_name
  if record_index_tensor_name is not None:
    record_index_tensor_rep = self._tensor_representations[
        record_index_tensor_name]
    if record_index_tensor_rep.HasField("ragged_tensor"):
      assert len(record_index_tensor_rep.ragged_tensor.feature_path.step) == 1
      self._record_index_column_name = (
          record_index_tensor_rep.ragged_tensor.feature_path.step[0])
    elif record_index_tensor_rep.HasField("varlen_sparse_tensor"):
      self._record_index_column_name = (
          record_index_tensor_rep.varlen_sparse_tensor.column_name)
    else:
      raise ValueError("The record index tensor must be a RaggedTensor or a "
                       "VarLenSparseTensor, but got: {}".format(
                           record_index_tensor_rep))

  if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
    raise ValueError("raw record column name: {} collided with an existing "
                     "column.".format(raw_record_column_name))
def test_relaxed_varlen_sparse_tensor(self):
  # Demonstrates that TensorAdapter(TensorsToRecordBatchConverter()) is not
  # an identity if the second dense dimension of the SparseTensor is not
  # tight.
  type_specs = {"sp": tf.SparseTensorSpec([None, None], tf.int32)}
  sp = tf.compat.v1.SparseTensorValue(
      values=np.array([1, 2], np.int32),
      indices=[[0, 0], [2, 0]],
      dense_shape=[4, 2])
  if tf.__version__ >= "2":
    sp = tf.SparseTensor.from_value(sp)
  converter = tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
  rb = converter.convert({"sp": sp})
  adapter = tensor_adapter.TensorAdapter(
      tensor_adapter.TensorAdapterConfig(
          arrow_schema=converter.arrow_schema(),
          tensor_representations=converter.tensor_representations()))
  adapter_output = adapter.ToBatchTensors(
      rb, produce_eager_tensors=tf.__version__ >= "2")
  self.assertAllEqual(sp.values, adapter_output["sp"].values)
  self.assertAllEqual(sp.indices, adapter_output["sp"].indices)
  self.assertAllEqual(adapter_output["sp"].dense_shape, [4, 1])
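# A sketch of why the test above expects dense_shape [4, 1]. The converted
# column keeps only the values present in each row, so the original
# dense_shape's second dimension (2) is lost and the adapter reconstructs a
# "tight" one from the longest row. The exact Arrow list type shown here is
# illustrative, not taken from the library.
import pyarrow as pa

# `sp` above has values [1, 2] at rows 0 and 2 of a [4, 2] dense shape; the
# per-row layout of the converted column is therefore:
column = pa.array([[1], [], [2], []], type=pa.list_(pa.int32()))
# The longest row has one element, hence the round-tripped dense_shape [4, 1].
assert max(len(row) for row in column.to_pylist()) == 1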
def __init__(self, saved_decoder_path: Text,
             telemetry_descriptors: List[Text],
             physical_format: Text,
             raw_record_column_name: Optional[Text]):
  super(_RecordToTensorTFXIO, self).__init__(
      telemetry_descriptors,
      logical_format="tensor",
      physical_format=physical_format,
      raw_record_column_name=raw_record_column_name)
  self._saved_decoder_path = saved_decoder_path
  decoder = tf_graph_record_decoder.load_decoder(saved_decoder_path)
  tensor_to_arrow_converter = tensor_to_arrow.TensorsToRecordBatchConverter(
      decoder.output_type_specs())
  self._arrow_schema_no_raw_record_column = (
      tensor_to_arrow_converter.arrow_schema())
  self._tensor_representations = (
      tensor_to_arrow_converter.tensor_representations())
  if raw_record_column_name in self._arrow_schema_no_raw_record_column.names:
    raise ValueError("raw record column name: {} collided with an existing "
                     "column.".format(raw_record_column_name))
def convert_and_check(tensors, test_values_conversion):
  converter = tensor_to_arrow.TensorsToRecordBatchConverter(
      type_specs, options)
  self.assertEqual(
      {f.name: f.type for f in converter.arrow_schema()},
      expected_schema,
      "actual: {}".format(converter.arrow_schema()))
  canonical_expected_tensor_representations = {}
  for n, r in expected_tensor_representations.items():
    if not isinstance(r, schema_pb2.TensorRepresentation):
      r = text_format.Parse(r, schema_pb2.TensorRepresentation())
    canonical_expected_tensor_representations[n] = r
  self.assertEqual(canonical_expected_tensor_representations,
                   converter.tensor_representations())
  rb = converter.convert(tensors)
  self.assertLen(expected_record_batch, rb.num_columns)
  for i, column in enumerate(rb):
    expected = expected_record_batch[rb.schema[i].name]
    self.assertTrue(
        column.equals(expected),
        "{}: actual: {}, expected: {}".format(
            rb.schema[i].name, column, expected))
  # Test that TensorAdapter(TensorsToRecordBatchConverter()) is identity.
  adapter = tensor_adapter.TensorAdapter(
      tensor_adapter.TensorAdapterConfig(
          arrow_schema=converter.arrow_schema(),
          tensor_representations=converter.tensor_representations()))
  adapter_output = adapter.ToBatchTensors(
      rb, produce_eager_tensors=not test_values_conversion)
  self.assertEqual(adapter_output.keys(), tensors.keys())
  for k in adapter_output.keys():
    if "value" not in k:
      self._assert_tensor_alike_equal(adapter_output[k], tensors[k])
def test_convert(self, type_specs, expected_schema,
                 expected_tensor_representations, tensor_input,
                 expected_record_batch):
  converter = tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
  expected_schema = pa.schema(
      [pa.field(n, t) for n, t in sorted(expected_schema.items())])
  self.assertTrue(
      converter.arrow_schema().equals(expected_schema),
      "actual: {}".format(converter.arrow_schema()))
  canonical_expected_tensor_representations = {}
  for n, r in expected_tensor_representations.items():
    if not isinstance(r, schema_pb2.TensorRepresentation):
      r = text_format.Parse(r, schema_pb2.TensorRepresentation())
    canonical_expected_tensor_representations[n] = r
  self.assertEqual(canonical_expected_tensor_representations,
                   converter.tensor_representations())
  rb = converter.convert(tensor_input)
  self.assertTrue(
      rb.equals(
          pa.record_batch(
              [arr for _, arr in sorted(expected_record_batch.items())],
              schema=expected_schema)))
  # Test that TensorAdapter(TensorsToRecordBatchConverter()) is identity.
  adapter = tensor_adapter.TensorAdapter(
      tensor_adapter.TensorAdapterConfig(
          arrow_schema=converter.arrow_schema(),
          tensor_representations=converter.tensor_representations()))
  adapter_output = adapter.ToBatchTensors(rb, produce_eager_tensors=True)
  self.assertEqual(adapter_output.keys(), tensor_input.keys())
  for k in adapter_output.keys():
    self._assert_tensor_alike_equal(adapter_output[k], tensor_input[k])
def make_tensor_to_arrow_converter(
    schema: schema_pb2.Schema
) -> tensor_to_arrow.TensorsToRecordBatchConverter:
  """Constructs a `tf.Tensor` to `pa.RecordBatch` converter."""
  type_specs = {}
  feature_specs = schema_utils.schema_as_feature_spec(schema).feature_spec
  for name, feature_spec in feature_specs.items():
    if isinstance(feature_spec, tf.io.FixedLenFeature):
      type_specs[name] = tf.TensorSpec([None] + list(feature_spec.shape),
                                       feature_spec.dtype)
    elif isinstance(feature_spec, tf.io.VarLenFeature):
      type_specs[name] = tf.SparseTensorSpec([None, None], feature_spec.dtype)
    elif isinstance(feature_spec, tf.io.SparseFeature):
      # `TensorsToRecordBatchConverter` ignores `SparseFeature`s since
      # arbitrary `SparseTensor`s are not yet supported. They are handled in
      # `convert_to_arrow`.
      # TODO(b/181868576): Handle `SparseFeature`s by the converter once the
      # support is implemented.
      pass
    else:
      raise ValueError('Invalid feature spec {}.'.format(feature_spec))
  return tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
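# A minimal usage sketch for the function above. The feature names and the
# text-format schema are hypothetical, and it assumes a tfx-bsl version whose
# converter handles dense tensors (as the FixedLenFeature branch above
# implies); "x" becomes a FixedLenFeature (required, fixed shape) and "y" a
# VarLenFeature.
import tensorflow as tf
from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2

schema = text_format.Parse(
    """
    feature {
      name: "x"
      type: INT
      shape { dim { size: 2 } }
      presence { min_fraction: 1.0 }
    }
    feature { name: "y" type: INT }
    """, schema_pb2.Schema())
converter = make_tensor_to_arrow_converter(schema)
record_batch = converter.convert({
    "x": tf.constant([[1, 2], [3, 4]], dtype=tf.int64),
    "y": tf.SparseTensor(
        indices=[[0, 0], [1, 0], [1, 1]],
        values=tf.constant([5, 6, 7], dtype=tf.int64),
        dense_shape=[2, 2]),
})
print(record_batch.schema)  # The Arrow schema derived from the type specs.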
def setup(self):
  self._decoder = tf_graph_record_decoder.load_decoder(
      self._saved_decoder_path)
  self._tensors_to_record_batch_converter = (
      tensor_to_arrow.TensorsToRecordBatchConverter(
          self._decoder.output_type_specs()))
def test_unable_to_handle_ragged(self, spec):
  with self.assertRaisesRegex(ValueError, "No handler found"):
    tensor_to_arrow.TensorsToRecordBatchConverter({"rt": spec})
def process(self, batched_extract: types.Extracts) -> List[types.Extracts]:
  features = batched_extract[constants.FEATURES_KEY]
  # Slice on transformed features if available.
  if (constants.TRANSFORMED_FEATURES_KEY in batched_extract and
      batched_extract[constants.TRANSFORMED_FEATURES_KEY] is not None):
    transformed_features = batched_extract[constants.TRANSFORMED_FEATURES_KEY]
    # If there is only one model, the output is stored without keying on
    # model name.
    if not self._eval_config or len(self._eval_config.model_specs) == 1:
      features.update(transformed_features)
    else:
      # Models listed earlier have precedence in feature lookup.
      for spec in reversed(self._eval_config.model_specs):
        if spec.name in transformed_features:
          features.update(transformed_features[spec.name])
  tensors = util.to_tensorflow_tensors(features)
  tensor_specs = util.infer_tensor_specs(tensors)
  if _TF_MAJOR_VERSION < 2:
    # TODO(b/228456048): TFX-BSL doesn't support passing tensorflow tensors
    # for non-sparse/ragged values in TF 1.x (i.e. it only accepts np.ndarray
    # for dense), so we need to convert dense tensors to numpy.
    sess = tf.compat.v1.Session()

    def _convert_dense_to_numpy(values):  # pylint: disable=invalid-name
      if isinstance(values, Mapping):
        for k, v in values.items():
          if isinstance(v, Mapping):
            values[k] = _convert_dense_to_numpy(v)
          elif isinstance(v, tf.Tensor):
            values[k] = v.eval(session=sess)
      return values

    tensors = _convert_dense_to_numpy(tensors)
  converter = tensor_to_arrow.TensorsToRecordBatchConverter(tensor_specs)
  record_batch = converter.convert(tensors)
  sql_slice_keys = [[] for _ in range(record_batch.num_rows)]
  for query in self._cached_queries(record_batch.schema):
    # Example of a result with batch size = 3:
    # result = [[[('feature', 'value_1')]],
    #           [[('feature', 'value_2')]],
    #           []]
    result = query.Execute(record_batch)
    for row_index, row_result in enumerate(result):
      sql_slice_keys[row_index].extend([tuple(s) for s in row_result])
  # Convert sql_slice_keys into a VarLenTensorValue where each row has dtype
  # object.
  dense_rows = []
  for row_slice_keys in sql_slice_keys:
    dense_rows.append(slicer_lib.slice_keys_to_numpy_array(row_slice_keys))
  varlen_sql_slice_keys = types.VarLenTensorValue.from_dense_rows(dense_rows)
  # Make a shallow copy so we don't mutate the original.
  batched_extract_copy = copy.copy(batched_extract)
  batched_extract_copy[constants.SLICE_KEY_TYPES_KEY] = varlen_sql_slice_keys
  self._sql_slicer_num_record_batch_schemas.update(
      self._cached_queries.cache_info().currsize)
  return [batched_extract_copy]
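# A tiny worked example of the accumulation loop above, using the result
# structure from the comment (batch size 3). It is self-contained plain
# Python; each accumulated slice key ends up as a tuple of (feature, value)
# pairs.
sql_slice_keys = [[] for _ in range(3)]
result = [[[("feature", "value_1")]],
          [[("feature", "value_2")]],
          []]
for row_index, row_result in enumerate(result):
  sql_slice_keys[row_index].extend([tuple(s) for s in row_result])
# Rows 0 and 1 each get one single-pair slice key; row 2 gets none.
assert sql_slice_keys == [[(("feature", "value_1"),)],
                          [(("feature", "value_2"),)],
                          []]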