def _ProjectTfmdSchema(self, tensor_names: List[Text]) -> schema_pb2.Schema:
  """Projects self._schema by the given tensor names.

  Args:
    tensor_names: Names of tensors to keep. Each must have an entry in
      self.TensorRepresentations().

  Returns:
    A new Schema containing only the features that are source columns of the
    requested tensors, with its TensorRepresentations restricted to
    `tensor_names`.

  Raises:
    ValueError: If any requested tensor name has no TensorRepresentation.
  """
  tensor_representations = self.TensorRepresentations()
  tensor_names = set(tensor_names)
  if not tensor_names.issubset(tensor_representations):
    raise ValueError(
        "Unable to project {} because they were not in the original "
        "TensorRepresentations.".format(
            # BUG FIX: `tensor_names - tensor_representations` raised
            # TypeError because the set `-` operator does not accept a dict
            # operand; set.difference accepts any iterable of keys.
            tensor_names.difference(tensor_representations)))
  # Collect the source columns backing all requested tensors.
  paths = set()
  for tensor_name in tensor_names:
    paths.update(
        tensor_rep_util.GetSourceColumnsFromTensorRepresentation(
            tensor_representations[tensor_name]))
  result = schema_pb2.Schema()
  # Note: We only copy projected features into the new schema because the
  # coder, and ArrowSchema() only care about Schema.feature. If they start
  # depending on other Schema fields then those fields must also be projected.
  for f in self._schema.feature:
    if path.ColumnPath(f.name) in paths:
      result.feature.add().CopyFrom(f)
  tensor_rep_util.SetTensorRepresentationsInSchema(
      result,
      {k: v for k, v in tensor_representations.items() if k in tensor_names})
  return result
def testProjection(self):
  """Tests that Project() narrows a TFXIO to the requested tensors."""
  base_schema = schema_pb2.Schema()
  base_schema.CopyFrom(_UNORDERED_SCHEMA)
  representations = {
      "string_tensor":
          schema_pb2.TensorRepresentation(
              dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                  column_name="string_feature")),
      "float_tensor":
          schema_pb2.TensorRepresentation(
              sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                  dense_shape=schema_pb2.FixedShape(
                      dim=[schema_pb2.FixedShape.Dim(size=10)]),
                  index_column_names=["int_feature"],
                  value_column_name="float_feature")),
  }
  tensor_representation_util.SetTensorRepresentationsInSchema(
      base_schema, representations)

  source_tfxio = ParquetTFXIO(
      file_pattern=self._example_file,
      column_names=_COLUMN_NAMES,
      schema=base_schema,
      telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
  projected = source_tfxio.Project(["float_tensor"])

  # The projected TFXIO must expose the narrowed Arrow schema.
  self.assertTrue(
      projected.ArrowSchema().equals(_EXPECTED_PROJECTED_ARROW_SCHEMA))

  def _CheckRecordBatches(batches):
    # Exactly one batch is expected; it must match the projected schema
    # both structurally and by column names/types.
    self.assertLen(batches, 1)
    batch = batches[0]
    self._ValidateRecordBatch(batch, _EXPECTED_PROJECTED_ARROW_SCHEMA)
    want_schema = projected.ArrowSchema()
    self.assertListEqual(
        batch.schema.names, want_schema.names,
        "actual: {}; expected: {}".format(batch.schema.names,
                                          want_schema.names))
    self.assertListEqual(
        batch.schema.types, want_schema.types,
        "actual: {}; expected: {}".format(batch.schema.types,
                                          want_schema.types))
    # Only the projected tensor should be produced by the adapter.
    adapter = projected.TensorAdapter()
    tensors = adapter.ToBatchTensors(batch)
    self.assertLen(tensors, 1)
    self.assertIn("float_tensor", tensors)

  pipeline = beam.Pipeline()
  batches_pcoll = pipeline | projected.BeamSource(batch_size=_NUM_ROWS)
  beam_testing_util.assert_that(batches_pcoll, _CheckRecordBatches)
  run_result = pipeline.run()
  run_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, run_result,
                                      _TELEMETRY_DESCRIPTORS, "parquet",
                                      "parquet")
def _ProjectTfmdSchema(self, tensor_names: List[Text]) -> schema_pb2.Schema:
  """Projects self._schema by the given tensor names.

  Args:
    tensor_names: Names of tensors to keep. Each must have an entry in
      self.TensorRepresentations().

  Returns:
    A new Schema containing only the features that are source columns of the
    requested tensors (including, for the sequence STRUCT feature, only the
    projected child features), with TensorRepresentations restricted to
    `tensor_names`.

  Raises:
    ValueError: If any requested tensor name has no TensorRepresentation, or
      if the sequence column is not of type STRUCT.
  """
  tensor_representations = self.TensorRepresentations()
  tensor_names = set(tensor_names)
  if not tensor_names.issubset(tensor_representations):
    raise ValueError(
        "Unable to project {} because they were not in the original "
        "TensorRepresentations.".format(
            # BUG FIX: `tensor_names - tensor_representations` raised
            # TypeError because the set `-` operator does not accept a dict
            # operand; set.difference accepts any iterable of keys.
            tensor_names.difference(tensor_representations)))
  # Collect the source columns backing all requested tensors.
  used_paths = set()
  for tensor_name in tensor_names:
    used_paths.update(
        tensor_representation_util.GetSourceColumnsFromTensorRepresentation(
            tensor_representations[tensor_name]))
  result = schema_pb2.Schema()
  # Note: We only copy projected features into the new schema because the
  # coder, and ArrowSchema() only care about Schema.feature. If they start
  # depending on other Schema fields then those fields must also be projected.
  for f in self._schema.feature:
    p = path.ColumnPath(f.name)
    if f.name == _SEQUENCE_COLUMN_NAME:
      # The sequence column is a STRUCT whose children hold the actual
      # sequence features; keep only the projected children.
      if f.type != schema_pb2.STRUCT:
        raise ValueError(
            "Feature {} was expected to be of type STRUCT, but got {}"
            .format(f.name, f))
      result_sequence_struct = schema_pb2.Feature()
      result_sequence_struct.CopyFrom(f)
      result_sequence_struct.ClearField("struct_domain")
      any_sequence_feature_projected = False
      for sf in f.struct_domain.feature:
        sequence_feature_path = p.child(sf.name)
        if sequence_feature_path in used_paths:
          any_sequence_feature_projected = True
          result_sequence_struct.struct_domain.feature.add().CopyFrom(sf)
      # Drop the sequence struct entirely when none of its children are used.
      if any_sequence_feature_projected:
        result.feature.add().CopyFrom(result_sequence_struct)
    elif p in used_paths:
      result.feature.add().CopyFrom(f)
  tensor_representation_util.SetTensorRepresentationsInSchema(
      result,
      {k: v for k, v in tensor_representations.items() if k in tensor_names})
  return result
def _ProjectTfmdSchemaTensorRepresentation(
    self, tensor_names: List[Text]) -> schema_pb2.Schema:
  """Builds a schema whose TensorRepresentations cover only `tensor_names`.

  The feature list is copied unchanged because the csv decoder must decode
  every column regardless of the projection; only the TensorRepresentations
  are narrowed, so only the selected tensors are produced as output.
  """
  all_representations = self.TensorRepresentations()
  selected = set(tensor_names)
  projected = schema_pb2.Schema()
  projected.CopyFrom(self._schema)
  kept_representations = {
      name: rep
      for name, rep in all_representations.items()
      if name in selected
  }
  tensor_representation_util.SetTensorRepresentationsInSchema(
      projected, kept_representations)
  return projected