def testWriteMetadataIsRetryable(self):
  tft_test_case.skip_if_external_environment(
      'Retries are currently not available on this environment.')
  original_write_metadata = beam_metadata_io.metadata_io.write_metadata
  write_metadata_called_list = []

  def mock_write_metadata(metadata, path):
    """Mocks metadata_io.write_metadata to fail the first time it is called
    by this test, thus forcing a retry which should succeed."""
    if not write_metadata_called_list:
      write_metadata_called_list.append(True)
      original_write_metadata(metadata, path)
      raise ArithmeticError('Some error')
    return original_write_metadata(metadata, path)

  # Write metadata to disk using WriteMetadata PTransform.
  with mock.patch(
      'tensorflow_transform.tf_metadata.metadata_io.write_metadata',
      mock_write_metadata):
    with self._makeTestPipeline() as pipeline:
      path = self.get_temp_dir()
      _ = (test_metadata.COMPLETE_METADATA
           | beam_metadata_io.WriteMetadata(path, pipeline))

    # Load from disk and check that it is as expected.
    metadata = metadata_io.read_metadata(path)
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
def testWriteMetadataNonDeferred(self):
  # Write properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    _ = (_TEST_METADATA_COMPLETE
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA_COMPLETE)
def testWriteMetadataDeferredProperties(self):
  # Write deferred properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    deferred_metadata = pipeline | beam.Create([_FUTURES_DICT])
    _ = ((_TEST_METADATA_WITH_FUTURES, deferred_metadata)
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA)
def testWriteMetadataNonDeferredEmptyDict(self):
  # Write properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    property_pcoll = pipeline | beam.Create([{}])
    _ = ((_TEST_METADATA, property_pcoll)
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA)
def testWriteMetadataNonDeferred(self):
  # Write metadata to disk using WriteMetadata PTransform.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    _ = (test_metadata.COMPLETE_METADATA
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
def expand(self, transform_fn):
  saved_model_dir_pcoll, metadata = transform_fn
  # Write metadata in a non-deferred manner. Once metadata contains deferred
  # components, the deferred components will be written in a deferred manner
  # while the non-deferred components will be written in a non-deferred
  # manner.
  _ = metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
      os.path.join(self._path, 'transformed_metadata'),
      pipeline=saved_model_dir_pcoll.pipeline)

  return saved_model_dir_pcoll | 'WriteTransformFn' >> beam.Map(
      _copy_tree, os.path.join(self._path, 'transform_fn'))
def testWriteMetadataDeferred(self):
  # Write metadata to disk using WriteMetadata PTransform, combining
  # incomplete metadata with (deferred) complete metadata.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)
    _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
def expand(self, transform_fn):
  saved_model_dir, metadata = transform_fn
  pipeline = saved_model_dir.pipeline

  # Using a temp dir within `path` ensures that the source and destination
  # paths for the rename below are in the same file system.
  base_temp_dir = os.path.join(self._path, 'transform_tmp')
  temp_metadata_path = (
      metadata | 'WriteMetadataToTemp' >> beam_metadata_io.WriteMetadata(
          base_temp_dir, pipeline, write_to_unique_subdirectory=True))

  temp_transform_fn_path = (
      saved_model_dir | 'WriteTransformFnToTemp' >> beam.Map(
          _copy_tree_to_unique_temp_dir, base_temp_dir))

  metadata_path = os.path.join(
      self._path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
  transform_fn_path = os.path.join(
      self._path, tft.TFTransformOutput.TRANSFORM_FN_DIR)

  def publish_outputs(unused_element, metadata_source_path,
                      transform_fn_source_path):
    import tensorflow as tf  # pylint: disable=g-import-not-at-top
    if not tf.io.gfile.exists(self._path):
      tf.io.gfile.makedirs(self._path)

    tf.io.gfile.rename(metadata_source_path, metadata_path, overwrite=True)
    tf.io.gfile.rename(transform_fn_source_path, transform_fn_path,
                       overwrite=True)

    # TODO(b/211615643): Remove the exists check once importing TFIO in S3
    # addresses NotFoundError.
    if tf.io.gfile.exists(base_temp_dir):
      tf.io.gfile.rmtree(base_temp_dir)

  # TODO(KesterTong): Move this "must follows" logic into a tfx_bsl helper
  # function or into Beam.
  return (pipeline
          | 'CreateSole' >> beam.Create([None])
          | 'PublishMetadataAndTransformFn' >> beam.Map(
              publish_outputs,
              metadata_source_path=beam.pvalue.AsSingleton(
                  temp_metadata_path),
              transform_fn_source_path=beam.pvalue.AsSingleton(
                  temp_transform_fn_path)))
def testWriteMetadataDeferredProperties(self):
  # Write deferred properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()

    # Combine _TEST_METADATA with the complete (deferred) metadata.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [_TEST_METADATA_COMPLETE])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA, deferred_metadata)
    _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA_COMPLETE)
def testWriteMetadataDeferredProperties(self):
  # Write deferred properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()

    # Combine test metadata with a dict of PCollections resolving futures.
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA_WITH_FUTURES, {
            'a': pipeline | 'CreateA' >> beam.Create([3]),
            'b': pipeline | 'CreateB' >> beam.Create([5])
        })
    _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA)
def expand(self, transform_fn):
  saved_model_dir, properties = transform_fn
  metadata_path = os.path.join(self._path, 'transformed_metadata')
  pipeline = saved_model_dir.pipeline
  write_metadata_done = (
      properties
      | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
          metadata_path, pipeline))

  transform_fn_path = os.path.join(self._path, 'transform_fn')
  write_transform_fn_done = (
      saved_model_dir
      | 'WriteTransformFn' >> beam.Map(_copy_tree, transform_fn_path))

  return (write_transform_fn_done
          | 'WaitOnWriteMetadataDone' >> beam.Map(
              lambda x, dummy: x,
              dummy=beam.pvalue.AsSingleton(write_metadata_done)))
def expand(self, transform_fn):
  saved_model_dir, metadata = transform_fn
  metadata_path = os.path.join(self._path, TRANSFORMED_METADATA_DIR)
  pipeline = saved_model_dir.pipeline
  write_metadata_done = (
      metadata
      | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
          metadata_path, pipeline))

  transform_fn_path = os.path.join(self._path, TRANSFORM_FN_DIR)
  write_transform_fn_done = (
      saved_model_dir
      | 'WriteTransformFn' >> beam.Map(_copy_tree, transform_fn_path))

  return (write_transform_fn_done
          | 'WaitOnWriteMetadataDone' >> beam.Map(
              lambda x, dummy: x,
              dummy=beam.pvalue.AsSingleton(write_metadata_done)))
def testTransformFnExportAndImportRoundtrip(self):
  transform_fn_dir = os.path.join(self.get_temp_dir(), 'export_transform_fn')
  metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

  with beam.Pipeline() as p:

    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    metadata = self.toMetadata({'x': tf.FixedLenFeature((), tf.float32, 0)})
    columns = p | 'CreateTrainingData' >> beam.Create(
        [{'x': v} for v in [4, 1, 5, 2]])
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      _, transform_fn = (
          (columns, metadata)
          | 'Analyze and Transform'
          >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

      _ = transform_fn | transform_fn_io.WriteTransformFn(transform_fn_dir)
      _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir, pipeline=p)

  with beam.Pipeline() as p:
    transform_fn = p | transform_fn_io.ReadTransformFn(transform_fn_dir)
    metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)

    # Run transform_columns on some eval dataset.
    eval_data = p | 'CreateEvalData' >> beam.Create(
        [{'x': v} for v in [6, 3]])
    transformed_eval_data, _ = (
        ((eval_data, metadata), transform_fn)
        | 'Transform' >> beam_impl.TransformDataset())
    expected_transformed_eval_data = [{'x_scaled': v} for v in [1.25, 0.5]]
    beam_test_util.assert_that(
        transformed_eval_data,
        beam_test_util.equal_to(expected_transformed_eval_data))
def testWriteMetadataDeferred(self):
  # Write metadata to disk using WriteMetadata PTransform, combining
  # incomplete metadata with (deferred) complete metadata.
  expected_asset_map = {'key': 'value'}
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata,
        expected_asset_map)
    _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
  with tf.io.gfile.GFile(
      os.path.join(path, output_wrapper.TFTransformOutput.ASSET_MAP)) as f:
    asset_map = json.loads(f.read())
  self.assertDictEqual(asset_map, expected_asset_map)
def expand(self, transform_fn):
  saved_model_dir, metadata = transform_fn
  metadata_path = os.path.join(
      self._path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
  pipeline = saved_model_dir.pipeline
  write_metadata_done = (
      metadata
      | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
          metadata_path, pipeline))

  transform_fn_path = os.path.join(
      self._path, tft.TFTransformOutput.TRANSFORM_FN_DIR)
  write_transform_fn_done = (
      saved_model_dir
      | 'WriteTransformFn' >> beam.Map(_copy_tree, transform_fn_path))

  # TODO(KesterTong): Move this "must follows" logic into a TFT wide helper
  # function or into Beam.
  return (write_transform_fn_done
          | 'WaitOnWriteMetadataDone' >> beam.Map(
              lambda x, dummy: x,
              dummy=beam.pvalue.AsSingleton(write_metadata_done)))
def transform_data(train_neg_filepattern, train_pos_filepattern, test_neg_filepattern, test_pos_filepattern, transformed_train_filebase, transformed_test_filebase, transformed_metadata_dir): """Transform the data and write out as a TFRecord of Example protos. Read in the data from the positive and negative examples on disk, and transform it using a preprocessing pipeline that removes punctuation, tokenizes and maps tokens to int64 values indices. Args: train_neg_filepattern: Filepattern for training data negative examples train_pos_filepattern: Filepattern for training data positive examples test_neg_filepattern: Filepattern for test data negative examples test_pos_filepattern: Filepattern for test data positive examples transformed_train_filebase: Base filename for transformed training data shards transformed_test_filebase: Base filename for transformed test data shards transformed_metadata_dir: Directory where metadata for transformed data should be written """ with beam.Pipeline() as pipeline: with beam_impl.Context(temp_dir=tempfile.mkdtemp()): # pylint: disable=no-value-for-parameter train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData( (train_neg_filepattern, train_pos_filepattern)) # pylint: disable=no-value-for-parameter test_data = pipeline | 'ReadTest' >> ReadAndShuffleData( (test_neg_filepattern, test_pos_filepattern)) metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({ REVIEW_COLUMN: dataset_schema.ColumnSchema( tf.string, [], dataset_schema.FixedColumnRepresentation()), LABEL_COLUMN: dataset_schema.ColumnSchema( tf.int64, [], dataset_schema.FixedColumnRepresentation()), })) def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" review = inputs[REVIEW_COLUMN] def remove_character(s, char): """Remove a character from a string. Args: s: A SparseTensor of rank 1 of type tf.string char: A string of length 1 Returns: The string `s` with the given character removed (i.e. replaced by '') """ # Hacky implementation where we split and rejoin. split = tf.string_split(s, char) rejoined = tf.reduce_join( tf.sparse_to_dense( split.indices, split.dense_shape, split.values, ''), 1) return rejoined def remove_punctuation(s): """Remove puncuation from a string. Args: s: A SparseTensor of rank 1 of type tf.string Returns: The string `s` with punctuation removed. """ for char in PUNCTUATION_CHARACTERS: s = remove_character(s, char) return s cleaned_review = tft.map(remove_punctuation, review) review_tokens = tft.map(tf.string_split, cleaned_review) review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE) return { REVIEW_COLUMN: review_indices, LABEL_COLUMN: inputs[LABEL_COLUMN] } (transformed_train_data, transformed_metadata), transform_fn = ( (train_data, metadata) | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset( preprocessing_fn)) transformed_test_data, _ = ( ((test_data, metadata), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) _ = ( transformed_train_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( transformed_train_filebase, coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema))) _ = ( transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord( transformed_test_filebase, coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema))) _ = ( transformed_metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata( transformed_metadata_dir, pipeline=pipeline))
def preprocess_data(train_neg_file_pattern, train_pos_file_pattern,
                    test_neg_file_pattern, test_pos_file_pattern,
                    transformed_train_file_pattern,
                    transformed_test_file_pattern, transformed_metadata_dir,
                    raw_metadata_dir, transform_func_dir, temp_dir,
                    vocab_size, delimiters):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    train_neg_file_pattern: Filepattern for training data negative examples
    train_pos_file_pattern: Filepattern for training data positive examples
    test_neg_file_pattern: Filepattern for test data negative examples
    test_pos_file_pattern: Filepattern for test data positive examples
    transformed_train_file_pattern: Base filename for transformed training
        data shards
    transformed_test_file_pattern: Base filename for transformed test data
        shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
    raw_metadata_dir: Directory where metadata for raw data should be written
    transform_func_dir: Directory where the transform function should be
        written
    temp_dir: Directory for temporary files
    vocab_size: Vocabulary size to use for tokenization
    delimiters: Delimiters to use for tokenization

  The raw metadata (const.RAW_METADATA) is expected to look like:

    raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
        REVIEW_COLUMN: dataset_schema.ColumnSchema(
            tf.string, [], dataset_schema.FixedColumnRepresentation()),
        LABEL_COLUMN: dataset_schema.ColumnSchema(
            tf.int64, [], dataset_schema.FixedColumnRepresentation()),
    }))
  """
  pipeline_name = 'DataflowRunner'
  options = {
      'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(
          datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
      'temp_location': temp_dir,
      'project': 'stone-outpost-636',
      'max_num_workers': 8
  }
  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)

  # with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline:
  #   with beam_impl.Context(temp_dir=temp_dir):
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_file_pattern, train_pos_file_pattern))
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_file_pattern, test_pos_file_pattern))
      preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters)

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, const.RAW_METADATA)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
               transform_func_dir))

      transformed_test_data, _ = (
          ((test_data, const.RAW_METADATA), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (transformed_train_data
           | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
               transformed_train_file_pattern,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_test_data
           | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
               transformed_test_file_pattern,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_metadata
           | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))

      _ = (const.RAW_METADATA
           | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(
               raw_metadata_dir, pipeline=pipeline))
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses this
    # to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)

    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

      # Read in raw data and convert using CSV converter. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are
      # just to get data into a format that the CSV converter can read, in
      # particular removing empty lines and removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          transformed_train_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Now apply transform function to test data. In this case we also remove
      # the header line from the CSV file and the trailing period at the end
      # of each line.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(
              lambda line: line and line != '|1x3 Cross validator')
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (transformed_test_data
           | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
               transformed_test_filebase,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_metadata
           | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))
def transform_data(train_neg_filepattern, train_pos_filepattern, test_neg_filepattern, test_pos_filepattern, transformed_train_filebase, transformed_test_filebase, transformed_metadata_dir): """Transform the data and write out as a TFRecord of Example protos. Read in the data from the positive and negative examples on disk, and transform it using a preprocessing pipeline that removes punctuation, tokenizes and maps tokens to int64 values indices. Args: train_neg_filepattern: Filepattern for training data negative examples train_pos_filepattern: Filepattern for training data positive examples test_neg_filepattern: Filepattern for test data negative examples test_pos_filepattern: Filepattern for test data positive examples transformed_train_filebase: Base filename for transformed training data shards transformed_test_filebase: Base filename for transformed test data shards transformed_metadata_dir: Directory where metadata for transformed data should be written """ with beam.Pipeline() as pipeline: with beam_impl.Context(temp_dir=tempfile.mkdtemp()): # pylint: disable=no-value-for-parameter train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData( (train_neg_filepattern, train_pos_filepattern)) # pylint: disable=no-value-for-parameter test_data = pipeline | 'ReadTest' >> ReadAndShuffleData( (test_neg_filepattern, test_pos_filepattern)) metadata = dataset_metadata.DatasetMetadata( dataset_schema.Schema({ REVIEW_COLUMN: dataset_schema.ColumnSchema( tf.string, [], dataset_schema.FixedColumnRepresentation()), LABEL_COLUMN: dataset_schema.ColumnSchema( tf.int64, [], dataset_schema.FixedColumnRepresentation()), })) def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" review = inputs[REVIEW_COLUMN] review_tokens = tf.string_split(review, DELIMITERS) review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE) # Add one for the oov bucket created by string_to_int. review_bow_indices, review_weight = tft.tfidf( review_indices, VOCAB_SIZE + 1) return { REVIEW_COLUMN: review_bow_indices, REVIEW_WEIGHT: review_weight, LABEL_COLUMN: inputs[LABEL_COLUMN] } (transformed_train_data, transformed_metadata), transform_fn = ( (train_data, metadata) | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_test_data, _ = ( ((test_data, metadata), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) _ = (transformed_train_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( transformed_train_filebase, coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema))) _ = (transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord( transformed_test_filebase, coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema))) _ = (transformed_metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata( transformed_metadata_dir, pipeline=pipeline))