def expand(self, transform_fn):
  saved_model_dir_pcoll, metadata = transform_fn
  # Write metadata in non-deferred manner.  Once metadata contains deferred
  # components, the deferred components will be written in a deferred manner
  # while the non-deferred components will be written in a non-deferred
  # manner.
  def safe_copy_tree(source, dest):
    if source == dest:
      raise ValueError('Cannot write a TransformFn to its current location.')
    fileio.ChannelFactory.copytree(source, dest)

  _ = metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
      os.path.join(self._path, 'transformed_metadata'),
      pipeline=saved_model_dir_pcoll.pipeline)

  return saved_model_dir_pcoll | 'WriteTransformFn' >> beam.Map(
      safe_copy_tree, os.path.join(self._path, 'transform_fn'))
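As a point of reference, here is a hedged usage sketch of this PTransform. It assumes the enclosing WriteTransformFn class takes the output path in its constructor and stores it as self._path, as the os.path.join calls above suggest; transform_fn is the (saved_model_dir, metadata) pair produced by beam_impl.AnalyzeDataset or beam_impl.AnalyzeAndTransformDataset.

import os
import tempfile

# Sketch only: `transform_fn` is assumed to come from an earlier
# AnalyzeDataset / AnalyzeAndTransformDataset step in the same pipeline.
output_dir = tempfile.mkdtemp()
_ = transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
    output_dir)
# After the pipeline runs, the layout follows the os.path.join calls above:
#   <output_dir>/transform_fn/            copied SavedModel directory
#   <output_dir>/transformed_metadata/    metadata written by WriteMetadata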
def testTransformFnExportAndImportRoundtrip(self):
  transform_fn_dir = os.path.join(self.get_temp_dir(), 'export_transform_fn')
  metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

  with beam.Pipeline() as p:
    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    metadata = self.toMetadata({'x': tf.FixedLenFeature((), tf.float32, 0)})
    columns = p | 'CreateTrainingData' >> beam.Create(
        [{'x': v} for v in [4, 1, 5, 2]])
    _, transform_fn = (
        (columns, metadata)
        | 'Analyze and Transform' >> beam_impl.AnalyzeAndTransformDataset(
            preprocessing_fn,
            os.path.join(self.get_temp_dir(), 'no_automaterialize')))

    _ = transform_fn | transform_fn_io.WriteTransformFn(transform_fn_dir)
    _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir, pipeline=p)

  with beam.Pipeline() as p:
    transform_fn = p | transform_fn_io.ReadTransformFn(transform_fn_dir)
    metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
    # Run transform_fn on some eval dataset.
    eval_data = p | 'CreateEvalData' >> beam.Create(
        [{'x': v} for v in [6, 3]])
    transformed_eval_data, _ = (
        ((eval_data, metadata), transform_fn)
        | 'Transform' >> beam_impl.TransformDataset())
    expected_transformed_eval_data = [{'x_scaled': v} for v in [1.25, 0.5]]
    beam_test_util.assert_that(
        transformed_eval_data,
        beam_test_util.equal_to(expected_transformed_eval_data))
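A quick sanity check of the expected values above, written as plain Python (not part of the test): scale_to_0_1 rescales using the minimum and maximum computed during analysis of the training data [4, 1, 5, 2], so eval values outside that range can legitimately fall outside [0, 1].

# Min/max come from the training data [4, 1, 5, 2] analyzed above.
train_min, train_max = 1.0, 5.0

def scale_to_0_1_like(x):
  # Mirrors the rescaling tft.scale_to_0_1 applies at transform time.
  return (x - train_min) / (train_max - train_min)

assert [scale_to_0_1_like(v) for v in [6, 3]] == [1.25, 0.5]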
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
  """Transform the cleaned data and write out as a TFRecord of Example protos.

  Read in the cleaned data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 value indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written.
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses this
    # to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # Update outputs of both kinds to convert from shape (batch,), i.e. a batch
    # of scalars, to shape (batch, 1), i.e. a batch of vectors of length 1.
    # This is needed so the output can be easily wrapped in `FeatureColumn`s.
    for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
      outputs[key] = tft.map(lambda x: tf.expand_dims(x, -1), outputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)
    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as p:
    # Create a coder to read the census data with the schema.  To do this we
    # need to list all columns in order since the schema doesn't specify the
    # order of columns in the csv.
    ordered_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'label'
    ]
    converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

    # Read in raw data and convert using CSV converter.  Note that we apply
    # some Beam transformations here, which will not be encoded in the TF graph
    # since we don't do them from within tf.Transform's methods
    # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
    # to get data into a format that the CSV converter can read, in particular
    # removing empty lines and removing spaces after commas.
    raw_data = (
        p
        | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
        | 'FilterTrainData' >> beam.Filter(lambda line: line)
        | 'FixCommasTrainData' >> beam.Map(
            lambda line: line.replace(', ', ','))
        | 'DecodeTrainData' >> beam.Map(converter.decode))

    # Combine data and schema into a dataset tuple.  Note that we already used
    # the schema to read the CSV data, but we also need it to interpret
    # raw_data.
    raw_dataset = (raw_data, raw_data_metadata)
    transformed_dataset, transform_fn = (
        raw_dataset | beam_impl.AnalyzeAndTransformDataset(
            preprocessing_fn, output_dir=tempfile.mkdtemp()))
    transformed_data, transformed_metadata = transformed_dataset

    _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
        transformed_train_filebase,
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    # Now apply transform function to test data.  In this case we also remove
    # the header line from the CSV file and the trailing period at the end of
    # each line.
    raw_test_data = (
        p
        | 'ReadTestData' >> textio.ReadFromText(test_data_file)
        | 'FilterTestData' >> beam.Filter(
            lambda line: line and line != '|1x3 Cross validator')
        | 'FixCommasTestData' >> beam.Map(
            lambda line: line.replace(', ', ','))
        | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
        | 'DecodeTestData' >> beam.Map(converter.decode))

    raw_test_dataset = (raw_test_data, raw_data_metadata)

    transformed_test_dataset = (
        (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
    # Don't need transformed data schema, it's the same as before.
    transformed_test_data, _ = transformed_test_dataset

    _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
        transformed_test_filebase,
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    _ = (
        transformed_metadata
        | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
            transformed_metadata_dir, pipeline=p))
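Finally, a minimal driver sketch for transform_data. The paths below are hypothetical placeholders (the census CSV files are not created by this snippet), shown only to make the expected arguments concrete.

import os
import tempfile

# Hypothetical invocation of transform_data; point the first two arguments at
# wherever the census CSV files actually live.
working_dir = tempfile.mkdtemp()
transform_data(
    train_data_file=os.path.join(working_dir, 'adult.data'),
    test_data_file=os.path.join(working_dir, 'adult.test'),
    transformed_train_filebase=os.path.join(working_dir, 'train_transformed'),
    transformed_test_filebase=os.path.join(working_dir, 'test_transformed'),
    transformed_metadata_dir=os.path.join(working_dir, 'transformed_metadata'))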