def testImplicitTensorRepresentations(self):
  tfxio = self._MakeTFXIO(_SCHEMA)
  self.assertEqual(
      {
          "int_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "int_feature" }""",
                  schema_pb2.TensorRepresentation()),
          "float_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "float_feature" }""",
                  schema_pb2.TensorRepresentation()),
          "string_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "string_feature" }""",
                  schema_pb2.TensorRepresentation()),
      }, tfxio.TensorRepresentations())

  def _AssertFn(record_batch_list):
    self.assertLen(record_batch_list, 1)
    record_batch = record_batch_list[0]
    self._ValidateRecordBatch(tfxio, record_batch)
    self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
    tensor_adapter = tfxio.TensorAdapter()
    dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
    self.assertLen(dict_of_tensors, 3)
    self.assertIn("int_feature", dict_of_tensors)
    self.assertIn("float_feature", dict_of_tensors)
    self.assertIn("string_feature", dict_of_tensors)

  p = beam.Pipeline()
  record_batch_pcoll = p | tfxio.BeamSource(batch_size=1000)
  beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
  pipeline_result = p.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      _TELEMETRY_DESCRIPTORS, "tf_example",
                                      "tfrecords_gzip")
def testProjection(self):
  schema = schema_pb2.Schema()
  schema.CopyFrom(_SCHEMA)
  tensor_representations = {
      "dense_string":
          text_format.Parse(
              """dense_tensor {
                   column_name: "string_feature"
                   shape { dim { size: 2 } }
                   default_value { bytes_value: "zzz" }
                 }""", schema_pb2.TensorRepresentation()),
      "varlen_string":
          text_format.Parse(
              """varlen_sparse_tensor { column_name: "string_feature" }""",
              schema_pb2.TensorRepresentation()),
      "varlen_float":
          text_format.Parse(
              """varlen_sparse_tensor { column_name: "float_feature" }""",
              schema_pb2.TensorRepresentation()),
  }
  schema.tensor_representation_group[""].CopyFrom(
      schema_pb2.TensorRepresentationGroup(
          tensor_representation=tensor_representations))
  tfxio = self._MakeTFXIO(schema)
  self.assertEqual(tensor_representations, tfxio.TensorRepresentations())

  projected_tfxio = tfxio.Project(
      ["dense_string", "varlen_string", "varlen_float"])
  self.assertEqual(tensor_representations,
                   projected_tfxio.TensorRepresentations())
  self.assertTrue(projected_tfxio.ArrowSchema().equals(
      pa.schema([
          pa.field("float_feature", pa.list_(pa.float32())),
          pa.field("string_feature", pa.list_(pa.binary())),
      ])))

  def _AssertFn(record_batch_list):
    self.assertLen(record_batch_list, 1)
    record_batch = record_batch_list[0]
    self.ValidateRecordBatch(record_batch)
    expected_schema = projected_tfxio.ArrowSchema()
    self.assertTrue(
        record_batch.schema.equals(expected_schema),
        "actual: {}; expected: {}".format(record_batch.schema,
                                          expected_schema))
    tensor_adapter = projected_tfxio.TensorAdapter()
    dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
    self.assertLen(dict_of_tensors, 3)
    self.assertIn("dense_string", dict_of_tensors)
    self.assertIn("varlen_string", dict_of_tensors)
    self.assertIn("varlen_float", dict_of_tensors)

  with beam.Pipeline() as p:
    # Setting the batch_size to make sure only one batch is generated.
    record_batch_pcoll = p | projected_tfxio.BeamSource(
        batch_size=len(_EXAMPLES))
    beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
def testProjection(self): """Test projecting of a TFXIO.""" schema = schema_pb2.Schema() schema.CopyFrom(_UNORDERED_SCHEMA) tensor_representations = { "string_tensor": schema_pb2.TensorRepresentation( dense_tensor=schema_pb2.TensorRepresentation.DenseTensor( column_name="string_feature")), "float_tensor": schema_pb2.TensorRepresentation( sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor( dense_shape=schema_pb2.FixedShape( dim=[schema_pb2.FixedShape.Dim(size=10)]), index_column_names=["int_feature"], value_column_name="float_feature")), } tensor_representation_util.SetTensorRepresentationsInSchema( schema, tensor_representations) tfxio = ParquetTFXIO(file_pattern=self._example_file, column_names=_COLUMN_NAMES, schema=schema, telemetry_descriptors=_TELEMETRY_DESCRIPTORS) projected_tfxio = tfxio.Project(["float_tensor"]) # The projected_tfxio has the projected schema self.assertTrue(projected_tfxio.ArrowSchema().equals( _EXPECTED_PROJECTED_ARROW_SCHEMA)) def _AssertFn(record_batch_list): self.assertLen(record_batch_list, 1) record_batch = record_batch_list[0] self._ValidateRecordBatch(record_batch, _EXPECTED_PROJECTED_ARROW_SCHEMA) expected_schema = projected_tfxio.ArrowSchema() self.assertListEqual( record_batch.schema.names, expected_schema.names, "actual: {}; expected: {}".format(record_batch.schema.names, expected_schema.names)) self.assertListEqual( record_batch.schema.types, expected_schema.types, "actual: {}; expected: {}".format(record_batch.schema.types, expected_schema.types)) tensor_adapter = projected_tfxio.TensorAdapter() dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch) self.assertLen(dict_of_tensors, 1) self.assertIn("float_tensor", dict_of_tensors) pipeline = beam.Pipeline() record_batch_pcoll = (pipeline | projected_tfxio.BeamSource(batch_size=_NUM_ROWS)) beam_testing_util.assert_that(record_batch_pcoll, _AssertFn) pipeline_result = pipeline.run() pipeline_result.wait_until_finish() telemetry_test_util.ValidateMetrics(self, pipeline_result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def testTrackRecordTensorRepresentations(self):
  num_dense_tensors = 3
  num_varlen_sparse_tensors = 2
  num_sparse_tensors = 1
  num_ragged_tensors = 4
  tensor_representations = {}
  for i in range(num_dense_tensors):
    tensor_representations[f"dense{i}"] = schema_pb2.TensorRepresentation(
        dense_tensor=schema_pb2.TensorRepresentation.DenseTensor())
  for i in range(num_varlen_sparse_tensors):
    tensor_representations[f"varlen{i}"] = schema_pb2.TensorRepresentation(
        varlen_sparse_tensor=schema_pb2.TensorRepresentation
        .VarLenSparseTensor())
  for i in range(num_sparse_tensors):
    tensor_representations[f"sparse{i}"] = schema_pb2.TensorRepresentation(
        sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor())
  for i in range(num_ragged_tensors):
    tensor_representations[f"ragged{i}"] = schema_pb2.TensorRepresentation(
        ragged_tensor=schema_pb2.TensorRepresentation.RaggedTensor())
  expected_counters = {
      "dense_tensor": num_dense_tensors,
      "varlen_sparse_tensor": num_varlen_sparse_tensors,
      "sparse_tensor": num_sparse_tensors,
      "ragged_tensor": num_ragged_tensors,
  }

  with beam.Pipeline(**test_helpers.make_test_beam_pipeline_kwargs()) as p:
    _ = (
        p
        | beam.Create([tensor_representations])
        | collection.TrackTensorRepresentations(
            counter_namespace="TestNamespace"))

  pipeline_result = p.run()
  result_metrics = pipeline_result.metrics()
  for kind, expected_count in expected_counters.items():
    actual_counter = result_metrics.query(
        beam.metrics.metric.MetricsFilter().with_name(kind))["counters"]
    self.assertLen(
        actual_counter,
        1,
        msg=f"Actual and expected lengths of {kind} counter are different.")
    self.assertEqual(
        actual_counter[0].committed,
        expected_count,
        msg=f"Actual and expected values for {kind} counter are different.")
def test_simple(self, attach_raw_records):
  raw_record_column_name = "_raw_records" if attach_raw_records else None
  tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
      self._input_path,
      self._decoder_path,
      _TELEMETRY_DESCRIPTORS,
      raw_record_column_name=raw_record_column_name)
  expected_fields = [
      pa.field("st1", pa.list_(pa.binary())),
      pa.field("st2", pa.list_(pa.binary())),
  ]
  if attach_raw_records:
    raw_record_column_type = (
        pa.large_list(pa.large_binary())
        if tfxio._can_produce_large_types else pa.list_(pa.binary()))
    expected_fields.append(
        pa.field(raw_record_column_name, raw_record_column_type))
  self.assertTrue(
      tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
      tfxio.ArrowSchema())
  self.assertEqual(
      tfxio.TensorRepresentations(), {
          "st1":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "st1" }""",
                  schema_pb2.TensorRepresentation()),
          "st2":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "st2" }""",
                  schema_pb2.TensorRepresentation())
      })

  tensor_adapter = tfxio.TensorAdapter()
  self.assertEqual(tensor_adapter.TypeSpecs(),
                   _DecoderForTesting().output_type_specs())

  def _assert_fn(list_of_rb):
    self.assertLen(list_of_rb, 1)
    rb = list_of_rb[0]
    self.assertTrue(rb.schema.equals(tfxio.ArrowSchema()))
    tensors = tensor_adapter.ToBatchTensors(rb)
    self.assertLen(tensors, 2)
    for tensor_name in ("st1", "st2"):
      self.assertIn(tensor_name, tensors)
      st = tensors[tensor_name]
      self.assertAllEqual(st.values, _RECORDS)
      self.assertAllEqual(st.indices, [[0, 0], [1, 0]])
      self.assertAllEqual(st.dense_shape, [2, 1])

  p = beam.Pipeline()
  rb_pcoll = p | tfxio.BeamSource(batch_size=len(_RECORDS))
  beam_testing_util.assert_that(rb_pcoll, _assert_fn)
  pipeline_result = p.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      _TELEMETRY_DESCRIPTORS, "tensor",
                                      "tfrecords_gzip")
def _InferTensorRepresentationFromSchema(
    schema: schema_pb2.Schema) -> Dict[str, schema_pb2.TensorRepresentation]:
  """Translates the Features in a Schema into TensorRepresentation protos.

  We apply the following rules:
    1. If the feature has a fixed shape (set through the Feature.shape field),
       then the feature must always be present
       (Feature.presence.min_fraction == 1.0), and a DenseTensor
       representation will be produced for it.
    2. Otherwise, a VarLenSparseTensor representation will be produced for it.

  Args:
    schema: a schema_pb2.Schema.

  Returns:
    A Dict mapping tensor names to their TensorRepresentations.

  Raises:
    ValueError: if the feature has a fixed shape but is not always present.
  """
  result = {}
  columns_remaining = {f.name: f for f in schema.feature}

  sparse_tensor_representations, columns_remaining = (
      _InferSparseTensorRepresentationsFromSchema(schema, columns_remaining))
  result.update(sparse_tensor_representations)

  for feature in columns_remaining.values():
    if not _ShouldIncludeFeature(feature):
      continue
    if feature.HasField("shape"):
      if feature.presence.min_fraction != 1:
        raise ValueError(
            "Feature {} had shape {} set but min_fraction {} != 1. Use"
            " value_count not shape field when min_fraction != 1.".format(
                feature.name, feature.shape, feature.presence.min_fraction))
      logging.info("Feature %s has a shape %s. Setting to DenseTensor.",
                   feature.name, feature.shape)
      result[feature.name] = schema_pb2.TensorRepresentation(
          dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
              column_name=feature.name, shape=feature.shape))
    else:
      logging.info("Feature %s has no shape. Setting to VarLenSparseTensor.",
                   feature.name)
      result[feature.name] = schema_pb2.TensorRepresentation(
          varlen_sparse_tensor=schema_pb2.TensorRepresentation
          .VarLenSparseTensor(column_name=feature.name))

  return result
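The two rules above can be exercised through the public inference entry point used by the tests further down (tensor_representation_util.InferTensorRepresentationsFromSchema). The snippet below is an illustrative sketch only: the feature names are hypothetical and the import path for tensor_representation_util is assumed to be tfx_bsl.tfxio.

from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2
from tfx_bsl.tfxio import tensor_representation_util

# Hypothetical schema: one always-present fixed-shape feature and one
# variable-length feature without a fixed shape.
schema = text_format.Parse(
    """
    feature {
      name: "fixed_len"
      type: INT
      shape { dim { size: 2 } }
      presence { min_fraction: 1.0 }
    }
    feature {
      name: "var_len"
      type: BYTES
      presence { min_fraction: 0.5 }
    }
    """, schema_pb2.Schema())

representations = (
    tensor_representation_util.InferTensorRepresentationsFromSchema(schema))
# Rule 1: fixed shape + min_fraction == 1.0 -> DenseTensor representation.
assert representations["fixed_len"].HasField("dense_tensor")
# Rule 2: no fixed shape -> VarLenSparseTensor representation.
assert representations["var_len"].HasField("varlen_sparse_tensor")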
def tensor_representation(self) -> schema_pb2.TensorRepresentation:
  result = schema_pb2.TensorRepresentation()
  for d in self._unbatched_shape:
    result.sparse_tensor.dense_shape.dim.add().size = d
  result.sparse_tensor.value_column_name = self._value_column_name
  result.sparse_tensor.index_column_names.extend(self._index_column_names)
  return result
def testGetSourceValueColumnFromTensorRepresentation(self, pbtxt, expected):
  self.assertEqual(
      path.ColumnPath(expected),
      tensor_representation_util.GetSourceValueColumnFromTensorRepresentation(
          text_format.Parse(pbtxt, schema_pb2.TensorRepresentation())))
def testCreateTfExampleParserConfig(self, tensor_representation, feature_type,
                                    tf_example, expected_feature,
                                    expected_parsed_results):
  tensor_representation = text_format.Parse(
      tensor_representation, schema_pb2.TensorRepresentation())
  feature = tensor_representation_util.CreateTfExampleParserConfig(
      tensor_representation, feature_type)

  # Checks that the parser configs are correct.
  for actual_arg, expected_arg in zip(feature, expected_feature):
    self.assertAllEqual(actual_arg, expected_arg)

  # Checks that the parser configs can be used with
  # tf.io.parse_single_example().
  actual_tensors = tf.io.parse_single_example(tf_example, {'feat': feature})
  actual = actual_tensors['feat']
  if isinstance(actual, (tf.SparseTensor, tf.compat.v1.SparseTensorValue)):
    self.assertAllEqual(actual.values, expected_parsed_results.values)
    self.assertAllEqual(actual.indices, expected_parsed_results.indices)
    self.assertAllEqual(actual.dense_shape,
                        expected_parsed_results.dense_shape)
  else:
    self.assertAllEqual(actual, expected_parsed_results)
def __setstate__(self, t):
  tensor_representations = {}
  for k, v in t[1].items():
    r = schema_pb2.TensorRepresentation()
    r.ParseFromString(v)
    tensor_representations[k] = r
  self.__init__(t[0], tensor_representations, t[2])
def testPickleTensorAdapterConfig(self):
  config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=pa.schema([pa.field("column1", pa.list_(pa.int32()))]),
      tensor_representations={
          "column1":
              text_format.Parse(
                  """
                  dense_tensor {
                    column_name: "column1"
                    shape { dim { size: 1 } }
                  }""", schema_pb2.TensorRepresentation())
      },
      original_type_specs={
          "column1": tf.TensorSpec(dtype=tf.int32, shape=[None, 1]),
          "column2": tf.TensorSpec(dtype=tf.int32, shape=[None, 1])
      })
  unpickled_config = pickle.loads(pickle.dumps(config))
  self.assertEqual(config.arrow_schema, unpickled_config.arrow_schema)
  self.assertEqual(config.tensor_representations,
                   unpickled_config.tensor_representations)
  self.assertEqual(config.original_type_specs,
                   unpickled_config.original_type_specs)
def testInferTensorRepresentationsFromSchema(self,
                                             ascii_proto,
                                             expected,
                                             generate_legacy_feature_spec=False,
                                             schema_is_mixed=False):
  if not _IS_LEGACY_SCHEMA and generate_legacy_feature_spec:
    raise self.skipTest('This test exercises legacy inference logic, but the '
                        'schema is not legacy schema.')
  schema = text_format.Parse(ascii_proto, schema_pb2.Schema())
  if _IS_LEGACY_SCHEMA:
    schema.generate_legacy_feature_spec = generate_legacy_feature_spec
  expected_protos = {
      k: text_format.Parse(pbtxt, schema_pb2.TensorRepresentation())
      for k, pbtxt in expected.items()
  }
  if not schema_is_mixed:
    self.assertEqual(
        expected_protos,
        tensor_representation_util.InferTensorRepresentationsFromSchema(
            schema))
  self.assertEqual(
      expected_protos,
      tensor_representation_util.InferTensorRepresentationsFromMixedSchema(
          schema))
def TensorRepresentations(self) -> tensor_adapter.TensorRepresentations: return { self.raw_record_column_name: schema_pb2.TensorRepresentation( dense_tensor=schema_pb2.TensorRepresentation.DenseTensor( column_name=self.raw_record_column_name, shape=schema_pb2.FixedShape(), # scalar )) }
def testImplicitTensorRepresentations(self, use_beam_record_csv_tfxio):
  """Tests inferring of tensor representation."""
  tfxio = self._MakeTFXIO(
      _COLUMN_NAMES,
      schema=_SCHEMA,
      make_beam_record_tfxio=use_beam_record_csv_tfxio)
  self.assertEqual(
      {
          "int_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "int_feature"}""",
                  schema_pb2.TensorRepresentation()),
          "float_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "float_feature"}""",
                  schema_pb2.TensorRepresentation()),
          "string_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "string_feature" }""",
                  schema_pb2.TensorRepresentation()),
      }, tfxio.TensorRepresentations())

  def _AssertFn(record_batch_list):
    self.assertLen(record_batch_list, 1)
    record_batch = record_batch_list[0]
    self._ValidateRecordBatch(tfxio, record_batch)
    self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
    tensor_adapter = tfxio.TensorAdapter()
    dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
    self.assertLen(dict_of_tensors, 3)
    self.assertIn("int_feature", dict_of_tensors)
    self.assertIn("float_feature", dict_of_tensors)
    self.assertIn("string_feature", dict_of_tensors)

  p = beam.Pipeline()
  record_batch_pcoll = (
      self._MakePipelineInputs(p, use_beam_record_csv_tfxio)
      | tfxio.BeamSource(batch_size=len(_ROWS)))
  beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
  pipeline_result = p.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      _TELEMETRY_DESCRIPTORS, "csv",
                                      _EXPECTED_PHYSICAL_FORMAT)
def testImplicitTensorRepresentations(self): """Tests inferring of tensor representation.""" tfxio = ParquetTFXIO(file_pattern=self._example_file, column_names=_COLUMN_NAMES, schema=_UNORDERED_SCHEMA, telemetry_descriptors=_TELEMETRY_DESCRIPTORS) self.assertEqual( { "int_feature": text_format.Parse( """varlen_sparse_tensor { column_name: "int_feature"}""", schema_pb2.TensorRepresentation()), "float_feature": text_format.Parse( """varlen_sparse_tensor { column_name: "float_feature"}""", schema_pb2.TensorRepresentation()), "string_feature": text_format.Parse( """varlen_sparse_tensor { column_name: "string_feature" }""", schema_pb2.TensorRepresentation()), }, tfxio.TensorRepresentations()) def _AssertFn(record_batch_list): self.assertLen(record_batch_list, 1) record_batch = record_batch_list[0] self._ValidateRecordBatch(record_batch, _EXPECTED_ARROW_SCHEMA) self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema())) tensor_adapter = tfxio.TensorAdapter() dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch) self.assertLen(dict_of_tensors, 3) self.assertIn("int_feature", dict_of_tensors) self.assertIn("float_feature", dict_of_tensors) self.assertIn("string_feature", dict_of_tensors) pipeline = beam.Pipeline() record_batch_pcoll = (pipeline | tfxio.BeamSource(batch_size=_NUM_ROWS)) beam_testing_util.assert_that(record_batch_pcoll, _AssertFn) pipeline_result = pipeline.run() pipeline_result.wait_until_finish() telemetry_test_util.ValidateMetrics(self, pipeline_result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def tensor_representation(self) -> schema_pb2.TensorRepresentation:
  result = schema_pb2.TensorRepresentation()
  result.ragged_tensor.feature_path.step.append(self._tensor_name)
  row_partition_dtype = (
      schema_pb2.TensorRepresentation.RowPartitionDType.INT32
      if self._row_partition_dtype == tf.int32 else
      schema_pb2.TensorRepresentation.RowPartitionDType.INT64)
  result.ragged_tensor.row_partition_dtype = row_partition_dtype
  return result
def testRaiseOnInvalidSparseTensorRepresentation(
    self, tensor_representation_textpb, arrow_schema):
  tensor_representation = text_format.Parse(
      tensor_representation_textpb, schema_pb2.TensorRepresentation())
  with self.assertRaisesRegex(ValueError, "Unable to handle tensor"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            pa.schema([pa.field(k, v) for k, v in arrow_schema.items()]),
            {"tensor": tensor_representation}))
def testImplicitTensorRepresentations(self):
  tfxio = self._MakeTFXIO(_SCHEMA)
  self.assertTrue(tfxio.ArrowSchema().equals(
      pa.schema([
          pa.field("int_feature", pa.list_(pa.int64())),
          pa.field("float_feature", pa.list_(pa.float32())),
          pa.field("string_feature", pa.list_(pa.binary())),
      ])))
  self.assertEqual(
      {
          "int_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "int_feature" }""",
                  schema_pb2.TensorRepresentation()),
          "float_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "float_feature" }""",
                  schema_pb2.TensorRepresentation()),
          "string_feature":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "string_feature" }""",
                  schema_pb2.TensorRepresentation()),
      }, tfxio.TensorRepresentations())

  def _AssertFn(record_batch_list):
    self.assertLen(record_batch_list, 1)
    record_batch = record_batch_list[0]
    self.ValidateRecordBatch(record_batch)
    self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
    tensor_adapter = tfxio.TensorAdapter()
    dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
    self.assertLen(dict_of_tensors, 3)
    self.assertIn("int_feature", dict_of_tensors)
    self.assertIn("float_feature", dict_of_tensors)
    self.assertIn("string_feature", dict_of_tensors)

  with beam.Pipeline() as p:
    # Setting the batch_size to make sure only one batch is generated.
    record_batch_pcoll = p | tfxio.BeamSource(batch_size=len(_EXAMPLES))
    beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
def testCreateTfExampleParserConfigRagged(self):
  feature_type = schema_pb2.INT
  tensor_representation = text_format.Parse(
      """
      ragged_tensor {
        feature_path {
          step: "ragged_feature"
        }
      }""", schema_pb2.TensorRepresentation())
  with self.assertRaisesRegex(NotImplementedError,
                              'TensorRepresentation: .* is not supported.'):
    tensor_representation_util.CreateTfExampleParserConfig(
        tensor_representation, feature_type)
def TensorRepresentations(self):
  return {  # pylint: disable=g-complex-comprehension
      c: text_format.Parse(
          """
          dense_tensor {
            column_name: "%s"
            shape {
              dim {
                size: 1
              }
            }
          }""" % c, schema_pb2.TensorRepresentation())
      for c in self._columns
  }
def testInputSpecsToTensorRepresentations(self):
  tensor_representations = model_util.input_specs_to_tensor_representations({
      'input_1': tf.TensorSpec(shape=(None, 2), dtype=tf.int64),
      'input_2': tf.SparseTensorSpec(shape=(None, 1), dtype=tf.float32),
      'input_3': tf.RaggedTensorSpec(shape=(None, None), dtype=tf.float32),
  })
  dense_tensor_representation = text_format.Parse(
      """
      dense_tensor {
        column_name: "input_1"
        shape { dim { size: 2 } }
      }
      """, schema_pb2.TensorRepresentation())
  sparse_tensor_representation = text_format.Parse(
      """
      varlen_sparse_tensor {
        column_name: "input_2"
      }
      """, schema_pb2.TensorRepresentation())
  ragged_tensor_representation = text_format.Parse(
      """
      ragged_tensor {
        feature_path {
          step: "input_3"
        }
      }
      """, schema_pb2.TensorRepresentation())
  self.assertEqual(
      {
          'input_1': dense_tensor_representation,
          'input_2': sparse_tensor_representation,
          'input_3': ragged_tensor_representation
      }, tensor_representations)
def _ragged_tensor_representation_from_feature_spec(
    spec: common_types.RaggedFeature, name: str,
    domains: Dict[str, common_types.DomainType]
) -> Tuple[schema_pb2.Feature, List[schema_pb2.Feature],
           schema_pb2.TensorRepresentation]:
  """Returns representation of a RaggedTensor from a feature spec.

  Args:
    spec: A tf.io.RaggedFeature feature spec.
    name: Feature name.
    domains: A dict whose keys are feature names and values are one of
      schema_pb2.IntDomain, schema_pb2.StringDomain or schema_pb2.FloatDomain.

  Returns:
    A tuple (value_feature, partitions_features, ragged_tensor_rep), where
    value_feature represents the RaggedTensor values, partitions_features
    represent the row length partitions, and ragged_tensor_rep is the ragged
    TensorRepresentation.

  Raises:
    ValueError: If the feature spec contains partition types different from
      UniformRowLength and RowLengths.
  """
  value_feature = schema_pb2.Feature(name=spec.value_key or name)
  _set_type(name, value_feature, spec.dtype)
  _set_domain(name, value_feature, domains.get(name))

  ragged_tensor = schema_pb2.TensorRepresentation.RaggedTensor(
      feature_path=path_pb2.Path(step=[spec.value_key or name]))

  partitions_features = []
  for partition in spec.partitions:
    if isinstance(partition, tf.io.RaggedFeature.UniformRowLength):  # pytype: disable=attribute-error
      ragged_tensor.partition.append(
          schema_pb2.TensorRepresentation.RaggedTensor.Partition(
              uniform_row_length=partition.length))
    elif isinstance(partition, tf.io.RaggedFeature.RowLengths):  # pytype: disable=attribute-error
      ragged_tensor.partition.append(
          schema_pb2.TensorRepresentation.RaggedTensor.Partition(
              row_length=partition.key))
      partitions_features.append(
          schema_pb2.Feature(name=partition.key, type=schema_pb2.INT))
    else:
      raise ValueError(
          'RaggedFeature can only be created with UniformRowLength and '
          'RowLengths partitions.')

  return value_feature, partitions_features, schema_pb2.TensorRepresentation(
      ragged_tensor=ragged_tensor)
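A hypothetical usage sketch for the helper above (the feature names, spec values, and call site are invented for illustration; it assumes TensorFlow is imported as tf and the helper is called from the same module):

import tensorflow as tf

# Hypothetical ragged feature spec: values stored under "values", outer row
# lengths stored under "row_lengths", innermost dimension of uniform size 3.
spec = tf.io.RaggedFeature(
    dtype=tf.float32,
    value_key="values",
    partitions=[
        tf.io.RaggedFeature.RowLengths("row_lengths"),
        tf.io.RaggedFeature.UniformRowLength(3),
    ])

value_feature, partition_features, ragged_rep = (
    _ragged_tensor_representation_from_feature_spec(spec, "my_ragged", {}))

# Per the function above: value_feature is named "values", partition_features
# holds one INT feature named "row_lengths", and ragged_rep.ragged_tensor
# carries a row_length partition followed by a uniform_row_length partition.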
def testCreateTfExampleParserConfigRagged(self):
  feature_type = schema_pb2.INT
  tensor_representation = text_format.Parse(
      """
      ragged_tensor {
        feature_path {
          step: "foo"
          step: "ragged_feature"
        }
      }""", schema_pb2.TensorRepresentation())
  with self.assertRaisesRegex(
      ValueError,
      ('Parsing spec from a RaggedTensor with multiple steps in '
       'feature_path is not implemented.')):
    tensor_representation_util.CreateTfExampleParserConfig(
        tensor_representation, feature_type)
def tensor_representation(self) -> schema_pb2.TensorRepresentation:
  result = schema_pb2.TensorRepresentation()
  result.ragged_tensor.feature_path.step.append(self._tensor_name)
  row_partition_dtype = (
      schema_pb2.TensorRepresentation.RowPartitionDType.INT32
      if self._row_partition_dtype == tf.int32 else
      schema_pb2.TensorRepresentation.RowPartitionDType.INT64)
  result.ragged_tensor.row_partition_dtype = row_partition_dtype

  for dim in self._unbatched_shape:
    # Create uniform_row_length partitions only.
    if dim is not None:
      result.ragged_tensor.partition.append(
          schema_pb2.TensorRepresentation.RaggedTensor.Partition(
              uniform_row_length=dim))
  return result
def testRecordBatchToTensorValuesWithTensorRepresentation(self):
  record_batch = pa.record_batch(
      [pa.array([[1, 2], [2, 3], [3, 4]]),
       pa.array([[0], [1], [1]])], ['feature_1', 'feature_2'])
  tensor_representation = schema_pb2.TensorRepresentation()
  tensor_representation.dense_tensor.column_name = 'feature_1'
  tensor_representation.dense_tensor.shape.dim.append(
      schema_pb2.FixedShape.Dim(size=2))
  actual = util.record_batch_to_tensor_values(
      record_batch, {'feature_1': tensor_representation})
  expected = {
      'feature_1': np.array([[1, 2], [2, 3], [3, 4]]),
      'feature_2': np.array([0, 1, 1])
  }
  self.assertAllClose(actual, expected)
def testRaiseOnInvalidDefaultValue(self, value_type, default_value_pbtxt,
                                   exception_regexp):
  tensor_representation = text_format.Parse(
      """
      dense_tensor {
        column_name: "column"
        shape {}
      }""", schema_pb2.TensorRepresentation())
  tensor_representation.dense_tensor.default_value.CopyFrom(
      text_format.Parse(default_value_pbtxt,
                        schema_pb2.TensorRepresentation.DefaultValue()))
  with self.assertRaisesRegex(ValueError, exception_regexp):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            pa.schema([pa.field("column", pa.list_(value_type))]),
            {"tensor": tensor_representation}))
def test2DSparseTensor(self):
  tensor_representation = text_format.Parse(
      """
      sparse_tensor {
        value_column_name: "values"
        index_column_names: ["d0", "d1"]
        dense_shape {
          dim { size: 10 }
          dim { size: 20 }
        }
      }
      """, schema_pb2.TensorRepresentation())
  record_batch = pa.RecordBatch.from_arrays(
      [
          pa.array([[1], None, [2], [3, 4, 5], []],
                   type=pa.list_(pa.int64())),
          # Also test that the index column can be of an integral type other
          # than int64.
          pa.array([[9], None, [9], [7, 8, 9], []],
                   type=pa.list_(pa.uint32())),
          pa.array([[0], None, [0], [0, 1, 2], []],
                   type=pa.list_(pa.int64()))
      ], ["values", "d0", "d1"])
  adapter = tensor_adapter.TensorAdapter(
      tensor_adapter.TensorAdapterConfig(record_batch.schema,
                                         {"output": tensor_representation}))
  converted = adapter.ToBatchTensors(record_batch)
  self.assertLen(converted, 1)
  self.assertIn("output", converted)
  actual_output = converted["output"]
  self.assertIsInstance(actual_output,
                        (tf.SparseTensor, tf.compat.v1.SparseTensorValue))
  self.assertSparseAllEqual(
      tf.compat.v1.SparseTensorValue(
          dense_shape=[5, 10, 20],
          indices=[[0, 9, 0], [2, 9, 0], [3, 7, 0], [3, 8, 1], [3, 9, 2]],
          values=tf.convert_to_tensor([1, 2, 3, 4, 5], dtype=tf.int64)),
      actual_output)

  self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
def testOriginalTypeSpecs(self):
  arrow_schema = pa.schema([pa.field("column1", pa.list_(pa.int32()))])
  tensor_representations = {
      "column1":
          text_format.Parse(
              """
              dense_tensor {
                column_name: "column1"
                shape { dim { size: 1 } }
              }""", schema_pb2.TensorRepresentation())
  }
  adapter = tensor_adapter.TensorAdapter(
      tensor_adapter.TensorAdapterConfig(arrow_schema,
                                         tensor_representations))
  self.assertLen(adapter.TypeSpecs(), 1)
  self.assertEqual(adapter.TypeSpecs(), adapter.OriginalTypeSpecs())

  adapter = tensor_adapter.TensorAdapter(
      tensor_adapter.TensorAdapterConfig(
          arrow_schema,
          tensor_representations,
          original_type_specs={
              "column1": tf.TensorSpec(dtype=tf.int32, shape=[None, 1]),
              "column2": tf.TensorSpec(dtype=tf.int32, shape=[None, 1])
          }))
  self.assertLen(adapter.TypeSpecs(), 1)
  self.assertLen(adapter.OriginalTypeSpecs(), 2)

  with self.assertRaisesRegex(ValueError,
                              "original_type_specs must be a superset"):
    adapter = tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            arrow_schema,
            tensor_representations,
            original_type_specs={
                # mismatch spec of column1
                "column1": tf.TensorSpec(dtype=tf.int64, shape=[None, 1]),
                "column2": tf.TensorSpec(dtype=tf.int32, shape=[None, 1])
            }))
def testInferTensorRepresentationsFromSchema(self,
                                             ascii_proto,
                                             expected,
                                             generate_legacy_feature_spec=False):
  # Skip a test if it's testing legacy logic but the schema is not the
  # legacy schema.
  if not _IS_LEGACY_SCHEMA and generate_legacy_feature_spec:
    print('Skipping test case: ', self.id(), file=sys.stderr)
    return

  schema = text_format.Parse(ascii_proto, schema_pb2.Schema())
  if _IS_LEGACY_SCHEMA:
    schema.generate_legacy_feature_spec = generate_legacy_feature_spec
  expected_protos = {
      k: text_format.Parse(pbtxt, schema_pb2.TensorRepresentation())
      for k, pbtxt in expected.items()
  }
  self.assertEqual(
      expected_protos,
      tensor_representation_util.InferTensorRepresentationsFromSchema(schema))
def testCreateTfExampleParserConfigInvalidDefaultValue(self):
  tensor_representation = text_format.Parse(
      """
      dense_tensor {
        column_name: "dense_column"
        shape { dim { size: 1 } }
        default_value { int_value: -1 }
      }""", schema_pb2.TensorRepresentation())
  feature_type = schema_pb2.FLOAT
  with self.assertRaisesRegex(
      ValueError, 'FeatureType:.* is incompatible with default_value:.*'):
    tensor_representation_util.CreateTfExampleParserConfig(
        tensor_representation, feature_type)