def testWriteTransformFn(self):
  path = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA_WITH_FUTURES,
        {
            'a': pipeline | 'CreateA' >> beam.Create([3]),
        })

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(path))

  transformed_metadata_dir = os.path.join(
      path, transform_fn_io.TRANSFORMED_METADATA_DIR)
  metadata = metadata_io.read_metadata(transformed_metadata_dir)
  self.assertEqual(metadata, _TEST_METADATA)

  transform_fn_dir = os.path.join(path, transform_fn_io.TRANSFORM_FN_DIR)
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
def main(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--cloud', type=str,
                      help="Pass 'y' to run the pipeline on Cloud Dataflow.")
  args = parser.parse_args(argv)  # Parse the arguments.

  if args.cloud == "y":
    pipeline_options = get_cloud_pipeline_options()
  else:
    pipeline_options = beam.pipeline.PipelineOptions(
        flags=[], **{'project': "iotpubsub-1536350750202"})

  with beam_impl.Context(temp_dir="gs://relation_extraction/beam"):
    p = beam.Pipeline(options=pipeline_options)

    train_data, test_data = (p | "Read from bigquery" >> ReadBigQuery())
    (test_data | "test it" >> beam.Map(printy))

    train_data = (train_data, train_metadata)
    train_dataset, transform_fn = (
        train_data
        | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
            preprocessing_fn))

    test_data = (test_data, train_metadata)
    test_data, _ = (
        (test_data, transform_fn)
        | 'Transform test data' >> beam_impl.TransformDataset())

    train_data, transformed_metadata = train_dataset
    transformed_data_coder = tft.coders.ExampleProtoCoder(
        transformed_metadata.schema)

    _ = (train_data
         | 'Encode train data to save it' >> beam.Map(
             transformed_data_coder.encode)
         | 'Write the train data to tfrecords' >> tfrecordio.WriteToTFRecord(
             os.path.join("gs://relation_extraction/beam/Train", "TRAIN")))

    _ = (test_data
         | 'Encode test data to save it' >> beam.Map(
             transformed_data_coder.encode)
         | 'Write the test data to tfrecords' >> tfrecordio.WriteToTFRecord(
             os.path.join("gs://relation_extraction/beam/Test", "TEST")))

    _ = (transform_fn
         | "WriteTransformFn" >> transform_fn_io.WriteTransformFn(
             "gs://relation_extraction/beam/"))

    p.run().wait_until_finish()
def testWriteTransformFn(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Combine test metadata with a dict of PCollections resolving futures.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Test reading with TFTransformOutput
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  metadata = tf_transform_output.transformed_metadata
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

  transform_fn_dir = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
def testWriteTransformFn(self):
  path = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    metadata = _TEST_METADATA
    deferred_metadata = (
        pipeline | 'CreateEmptyProperties' >> beam.Create([_FUTURES_DICT]))

    _ = ((saved_model_dir_pcoll, (metadata, deferred_metadata))
         | transform_fn_io.WriteTransformFn(path))

  transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
  metadata = metadata_io.read_metadata(transformed_metadata_dir)
  self.assertEqual(metadata, _TEST_METADATA)

  transform_fn_dir = os.path.join(path, 'transform_fn')
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
def testWriteTransformFn(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA_WITH_FUTURES,
        {
            'a': pipeline | 'CreateA' >> beam.Create([3]),
        })

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Test reading with TFTransformOutput
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  metadata = tf_transform_output.transformed_metadata
  self.assertEqual(metadata, _TEST_METADATA)

  transform_fn_dir = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
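The tests above read the written output back through tft.TFTransformOutput. As a complement, here is a minimal sketch (not taken from any of the examples in this collection) of how a downstream training job typically consumes the artifacts written by WriteTransformFn; the directory name transform_output_dir and the vocabulary name 'vocab_payment_type' are assumed placeholders.

import tensorflow_transform as tft

tf_transform_output = tft.TFTransformOutput(transform_output_dir)

# Feature spec for parsing the transformed TFRecords written by the pipeline.
feature_spec = tf_transform_output.transformed_feature_spec()

# Path to a vocabulary asset created by tft.vocabulary /
# tft.compute_and_apply_vocabulary, e.g. for sizing embedding layers.
vocab_path = tf_transform_output.vocabulary_file_by_name('vocab_payment_type')

# The SavedModel that re-applies the same transformations at serving time.
savedmodel_dir = tf_transform_output.transform_savedmodel_dir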
def run(pipeline_options, known_args):
  pipeline = beam.Pipeline(options=pipeline_options)

  with impl.Context(known_args.transform_temp_dir):
    articles = (
        pipeline
        | 'Get Paths' >> beam.Create(get_paths(known_args.file_pattern))
        | 'Get Articles' >> beam.Map(get_articles)
        | 'Get Article' >> beam.FlatMap(lambda x: x))

    dataset = (articles, get_metadata())

    transform_fn = (
        dataset
        | 'Analyse dataset' >> impl.AnalyzeDataset(preprocess_fn))

    transformed_data_with_meta = (
        (dataset, transform_fn)
        | 'Transform dataset' >> impl.TransformDataset())

    transformed_data, transformed_metadata = transformed_data_with_meta

    transform_fn | 'Export Transform Fn' >> transform_fn_io.WriteTransformFn(
        known_args.transform_export_dir)

    (transformed_data
     | 'Convert to Insertable data' >> beam.Map(to_bq_row)
     | 'Write to BigQuery table' >> beam.io.WriteToBigQuery(
         project=known_args.bq_project,
         dataset=known_args.bq_dataset,
         table=known_args.bq_table,
         schema=get_bigquery_schema(),
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    if known_args.enable_tfrecord:
      transformed_data | 'Write TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
          file_path_prefix='{0}/{1}'.format(known_args.tfrecord_export_dir, 'reuter'),
          file_name_suffix='.tfrecords',
          coder=tft_coders.example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

    if known_args.enable_debug:
      transformed_data | 'Debug Output' >> beam.io.textio.WriteToText(
          file_path_prefix=known_args.debug_output_prefix,
          file_name_suffix='.txt')

  job = pipeline.run()
  if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
    job.wait_until_finish()
def main(argv=None):
  """Run preprocessing as a Dataflow pipeline.

  Args:
    argv (list): list of arguments
  """
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.cloud:
    pipeline_options = get_cloud_pipeline_options()
  else:
    pipeline_options = None

  p = beam.Pipeline(options=pipeline_options)

  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    # read data and join by key
    raw_data_input = (
        p
        | 'ReadInputData' >> beam.io.ReadFromText(
            TRAIN_INPUT_DATA, skip_header_lines=1)
        | 'ParseInputCSV' >> beam.Map(converter_input.decode)
        | 'ExtractBatchKeyIn' >> beam.Map(extract_batchkey))

    raw_data_output = (
        p
        | 'ReadOutputData' >> beam.io.ReadFromText(
            TRAIN_OUTPUT_DATA, skip_header_lines=1)
        | 'ParseOutputCSV' >> beam.Map(converter_output.decode)
        | 'ExtractBatchKeyOut' >> beam.Map(extract_batchkey))

    raw_data = (
        (raw_data_input, raw_data_output)
        | 'JoinData' >> beam.CoGroupByKey()
        | 'RemoveKeys' >> beam.FlatMap(remove_keys))

    # analyse and transform dataset
    raw_dataset = (raw_data, input_metadata)
    transformed_dataset, transform_fn = (
        raw_dataset
        | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
            preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset

    # save data and serialize TransformFn
    transformed_data_coder = tft.coders.ExampleProtoCoder(
        transformed_metadata.schema)
    _ = (transformed_data
         | 'EncodeData' >> beam.Map(transformed_data_coder.encode)
         | 'WriteData' >> tfrecordio.WriteToTFRecord(
             os.path.join(TFRECORD_DIR, 'records')))
    _ = (transform_fn
         | "WriteTransformFn" >> transform_fn_io.WriteTransformFn(MODEL_DIR))

  p.run().wait_until_finish()
def build_graph(self):
  # Move percentage of train data to .`PPGRAPH_EXT` files, used for graph building.
  # num_lines = 0
  # for i in range(DATASET_NUM_SHARDS):
  #   _fname = '{}-{:05}-of-{:05}'.format(
  #       self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
  #   num_lines += sum(1 for _ in open(_fname))
  #   _fname_marked = '{}-{:05}-of-{:05}.{}'.format(
  #       self.train_fname_out, i, self.config.DATASET_NUM_SHARDS, PPGRAPH_EXT)
  #   shutil.move(_fname, _fname_marked)
  #   if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
  #     break

  # Set up the preprocessing pipeline for analyzing the dataset. The analyze
  # call is not combined with the transform call because we will parallelize
  # the transform call later. We had the issue that this process runs on a
  # single core and tends to cause OOM issues.
  pipeline = beam.Pipeline(runner=DirectRunner())

  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    # todo: maybe, I should only use train data (or percentage of train data)
    # to build the graph
    raw_train_data = (
        pipeline
        | 'ReadTrainDataFile' >> textio.ReadFromText(
            'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
        | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                self.data_formatter.get_ordered_columns(),
                self.data_formatter.get_raw_data_metadata().schema).decode))

    # Combine data and schema into a dataset tuple. Note that we already used
    # the schema to read the CSV data, but we also need it to interpret
    # raw_data.
    # That is when to use vocabulary, scale_to_0_1 or sparse_to_dense ...
    transform_fn = (
        (raw_train_data, self.data_formatter.get_raw_data_metadata())
        | beam_impl.AnalyzeDataset(
            PreprocessingFunction().transform_to_tfrecord))

    # Write SavedModel and metadata to two subdirectories of working_dir, given
    # by `transform_fn_io.TRANSFORM_FN_DIR` and
    # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
    _ = (transform_fn
         | 'WriteTransformGraph' >>
         transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

  # Run the Beam preprocessing pipeline.
  st = time.time()
  result = pipeline.run()
  result.wait_until_finish()
  self.logger.info(
      'Transformation graph built and written in {:.2f} sec'.format(
          time.time() - st))
def run_tft_pipeline(args):
  """Process the data available in our database and transform it into
  TensorFlow tfrecords for later training and testing.

  The code runs in a distributed manner automatically in the engine chosen by
  the `runner` input argument.
  """
  pipeline_options = build_pipeline_options(args)
  temp_tft_folder = (
      tempfile.mkdtemp(dir='/tmp/') if not args.tft_temp else args.tft_temp)
  tft_transform_folder = (
      tempfile.mkdtemp(dir='/tmp/') if not args.tft_transform
      else args.tft_transform)

  with beam.Pipeline(options=pipeline_options) as pipeline:
    with beam_impl.Context(temp_dir=temp_tft_folder):
      train_data = read_input_data(args, pipeline, 'train')

      write_total_distinct_keys_to_file(train_data, args.nitems_filename, 'sku')

      train_dataset = (train_data, metadata.RAW_DATA_METADATA)

      (train_data, transformed_train_metadata), transform_fn = (
          train_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))

      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
               tft_transform_folder))

      train_data = aggregate_transformed_data(train_data, 'train')

      write_tfrecords(train_data, metadata.OUTPUT_TRAIN_SCHEMA,
                      args.output_train_filename, 'output train')

      test_data = read_input_data(args, pipeline, 'test')
      test_dataset = (test_data, metadata.RAW_DATA_METADATA)

      (test_data, _) = (
          (test_dataset, transform_fn) | beam_impl.TransformDataset())

      test_data = aggregate_transformed_data(test_data, 'test')
      test_data = aggregate_final_test_data(train_data, test_data)

      write_tfrecords(test_data, metadata.OUTPUT_TEST_SCHEMA,
                      args.output_test_filename, 'output test')
def testWriteTransformFnIsRetryable(self):
  tft.test_case.skip_if_external_environment(
      'Retries are currently not available on this environment.')
  original_copy_tree_to_unique_temp_dir = (
      transform_fn_io._copy_tree_to_unique_temp_dir)

  def mock_copy_tree_to_unique_temp_dir(source, base_temp_dir_path):
    """Mocks transform_fn_io._copy_tree to fail the first time it is called
    by this test, thus forcing a retry which should succeed."""
    global _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED
    if not _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED:
      _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED = True
      original_copy_tree_to_unique_temp_dir(source, base_temp_dir_path)
      raise ArithmeticError('Some error')
    return original_copy_tree_to_unique_temp_dir(source, base_temp_dir_path)

  with self._makeTestPipeline() as pipeline:
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_path = os.path.join(saved_model_dir, 'saved_model')
    with file_io.FileIO(saved_model_path, mode='w') as f:
      f.write('some content')
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Combine test metadata with a dict of PCollections resolving futures.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)
    with mock.patch.object(transform_fn_io, '_copy_tree_to_unique_temp_dir',
                           mock_copy_tree_to_unique_temp_dir):
      _ = ((saved_model_dir_pcoll, metadata)
           | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Test reading with TFTransformOutput
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  metadata = tf_transform_output.transformed_metadata
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

  transform_fn_dir = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))

  # Check temp directory created by failed run was cleaned up.
  self.assertEqual(2, len(file_io.list_directory(transform_output_dir)))
def testWriteTransformFnIsIdempotent(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  def mock_write_metadata_expand(unused_self, unused_metadata):
    raise ArithmeticError('Some error')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

    with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                           'expand', mock_write_metadata_expand):
      with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
        _ = ((saved_model_dir_pcoll, object())
             | transform_fn_io.WriteTransformFn(transform_output_dir))

  self.assertFalse(file_io.file_exists(transform_output_dir))
def testTransformFnExportAndImportRoundtrip(self):
  tranform_fn_dir = os.path.join(self.get_temp_dir(), 'export_transform_fn')
  metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

  with beam.Pipeline() as p:
    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    metadata = self.toMetadata({'x': tf.FixedLenFeature((), tf.float32, 0)})
    columns = p | 'CreateTrainingData' >> beam.Create(
        [{'x': v} for v in [4, 1, 5, 2]])
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      _, transform_fn = (
          (columns, metadata)
          | 'Analyze and Transform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      _ = transform_fn | transform_fn_io.WriteTransformFn(tranform_fn_dir)
      _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir, pipeline=p)

  with beam.Pipeline() as p:
    transform_fn = p | transform_fn_io.ReadTransformFn(tranform_fn_dir)
    metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
    # Run transform_columns on some eval dataset.
    eval_data = p | 'CreateEvalData' >> beam.Create(
        [{'x': v} for v in [6, 3]])
    transformed_eval_data, _ = (
        ((eval_data, metadata), transform_fn)
        | 'Transform' >> beam_impl.TransformDataset())
    expected_transformed_eval_data = [{'x_scaled': v} for v in [1.25, 0.5]]
    beam_test_util.assert_that(
        transformed_eval_data,
        beam_test_util.equal_to(expected_transformed_eval_data))
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                        HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
  import os
  import shutil
  from kale.utils import pod_utils
  from kale.marshal import resource_save as _kale_resource_save
  from kale.marshal import resource_load as _kale_resource_load

  _kale_data_directory = "/marshal"

  if not os.path.isdir(_kale_data_directory):
    os.makedirs(_kale_data_directory, exist_ok=True)

  # -----------------------DATA LOADING START--------------------------------
  _kale_directory_file_names = [
      os.path.splitext(f)[0]
      for f in os.listdir(_kale_data_directory)
      if os.path.isfile(os.path.join(_kale_data_directory, f))
  ]

  if "column_names" not in _kale_directory_file_names:
    raise ValueError("column_names" + " does not exists in directory")

  _kale_load_file_name = [
      f for f in os.listdir(_kale_data_directory)
      if os.path.isfile(os.path.join(_kale_data_directory, f)) and
      os.path.splitext(f)[0] == "column_names"
  ]
  if len(_kale_load_file_name) > 1:
    raise ValueError("Found multiple files with name " + "column_names" +
                     ": " + str(_kale_load_file_name))
  _kale_load_file_name = _kale_load_file_name[0]
  column_names = _kale_resource_load(
      os.path.join(_kale_data_directory, _kale_load_file_name))

  if "schema" not in _kale_directory_file_names:
    raise ValueError("schema" + " does not exists in directory")

  _kale_load_file_name = [
      f for f in os.listdir(_kale_data_directory)
      if os.path.isfile(os.path.join(_kale_data_directory, f)) and
      os.path.splitext(f)[0] == "schema"
  ]
  if len(_kale_load_file_name) > 1:
    raise ValueError("Found multiple files with name " + "schema" + ": " +
                     str(_kale_load_file_name))
  _kale_load_file_name = _kale_load_file_name[0]
  schema = _kale_resource_load(
      os.path.join(_kale_data_directory, _kale_load_file_name))
  # -----------------------DATA LOADING END----------------------------------

  import os
  import shutil
  import logging
  import apache_beam as beam
  import tensorflow as tf
  import tensorflow_transform as tft
  import tensorflow_model_analysis as tfma
  import tensorflow_data_validation as tfdv
  from apache_beam.io import textio
  from apache_beam.io import tfrecordio
  from tensorflow_transform.beam import impl as beam_impl
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io
  from tensorflow_transform.coders.csv_coder import CsvCoder
  from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import metadata_io

  DATA_DIR = 'data/'
  TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
  EVALUATION_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/eval.csv')

  # Categorical features are assumed to each have a maximum value in the dataset.
  MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
  CATEGORICAL_FEATURE_KEYS = ['trip_start_hour', 'trip_start_day',
                              'trip_start_month']

  DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

  # Number of buckets used by tf.transform for encoding each feature.
  FEATURE_BUCKET_COUNT = 10

  BUCKET_FEATURE_KEYS = ['pickup_latitude', 'pickup_longitude',
                         'dropoff_latitude', 'dropoff_longitude']

  # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
  VOCAB_SIZE = 1000

  # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
  OOV_SIZE = 10

  VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract',
                        'payment_type', 'company', 'pickup_community_area',
                        'dropoff_community_area']

  # allow nan values in these features.
  OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude',
                       'pickup_census_tract', 'dropoff_census_tract',
                       'company', 'trip_seconds', 'dropoff_community_area']

  LABEL_KEY = 'tips'
  FARE_KEY = 'fare'
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  # tf.get_logger().setLevel(logging.ERROR)

  def to_dense(tensor):
    """Takes as input a SparseTensor and return a Tensor with correct default value

    Args:
      tensor: tf.SparseTensor

    Returns:
      tf.Tensor with default value
    """
    if not isinstance(tensor, tf.sparse.SparseTensor):
      return tensor
    if tensor.dtype == tf.string:
      default_value = ''
    elif tensor.dtype == tf.float32:
      default_value = 0.0
    elif tensor.dtype == tf.int32:
      default_value = 0
    else:
      raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

    return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                         [tensor.dense_shape[0], 1],
                                         tensor.values,
                                         default_value=default_value), axis=1)
    # TODO: Update to below version
    # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

  def preprocess_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

    for key in VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      if inputs[key].dtype == tf.string:
        vocab_tensor = to_dense(inputs[key])
      else:
        vocab_tensor = tf.as_string(to_dense(inputs[key]))
      outputs[key] = tft.compute_and_apply_vocabulary(
          vocab_tensor, vocab_filename='vocab_' + key,
          top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

    for key in BUCKET_FEATURE_KEYS:
      outputs[key] = tft.bucketize(to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

    for key in CATEGORICAL_FEATURE_KEYS:
      outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

    taxi_fare = to_dense(inputs[FARE_KEY])
    taxi_tip = to_dense(inputs[LABEL_KEY])
    # Test if the tip was > 20% of the fare.
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    outputs[LABEL_KEY] = tf.logical_and(
        tf.logical_not(tf.math.is_nan(taxi_fare)),
        tf.greater(taxi_tip, tip_threshold))

    for key in outputs:
      if outputs[key].dtype == tf.bool:
        outputs[key] = tft.compute_and_apply_vocabulary(
            tf.as_string(outputs[key]), vocab_filename='vocab_' + key)

    return outputs

  trns_output = os.path.join(DATA_DIR, "transformed")
  if os.path.exists(trns_output):
    shutil.rmtree(trns_output)

  tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

  runner = 'DirectRunner'
  with beam.Pipeline(runner, options=None) as p:
    with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
      converter = CsvCoder(column_names, tft_input_metadata.schema)

      # READ TRAIN DATA
      train_data = (
          p
          | 'ReadTrainData' >> textio.ReadFromText(
              TRAIN_DATA, skip_header_lines=1)
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # TRANSFORM TRAIN DATA (and get transform_fn function)
      transformed_dataset, transform_fn = (
          (train_data, tft_input_metadata)
          | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
      transformed_data, transformed_metadata = transformed_dataset

      # SAVE TRANSFORMED TRAIN DATA
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(trns_output, 'train'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

      # READ EVAL DATA
      eval_data = (
          p
          | 'ReadEvalData' >> textio.ReadFromText(
              EVALUATION_DATA, skip_header_lines=1)
          | 'DecodeEvalData' >> beam.Map(converter.decode))

      # TRANSFORM EVAL DATA (using previously created transform_fn function)
      eval_dataset = (eval_data, tft_input_metadata)
      transformed_eval_data, transformed_metadata = (
          (eval_dataset, transform_fn) | beam_impl.TransformDataset())

      # SAVE EVAL DATA
      _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
          os.path.join(trns_output, 'eval'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

      # SAVE transform_fn FUNCTION FOR LATER USE
      # TODO: check out what is the transform function (transform_fn) that came from previous step
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(trns_output))

      # SAVE TRANSFORMED METADATA
      metadata_io.write_metadata(
          metadata=tft_input_metadata,
          path=os.path.join(trns_output, 'metadata'))

  # -----------------------DATA SAVING START---------------------------------
  if "trns_output" in locals():
    _kale_resource_save(trns_output, os.path.join(
        _kale_data_directory, "trns_output"))
  else:
    print("_kale_resource_save: `trns_output` not found.")
def transform_data(working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed
        data and metadata to.
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
      train_data = (
          pipeline
          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
          | 'DecodeTrain' >> beam.Map(coder.decode))

      test_data = (
          pipeline
          | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'))
          | 'DecodeTest' >> beam.Map(coder.decode))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, RAW_DATA_METADATA)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      transformed_test_data, _ = (
          ((test_data, RAW_DATA_METADATA), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
def tftransform(pipeline_args,           # type: List[str]
                temp_location,           # type: str
                schema_file,             # type: str
                output_dir,              # type: str
                preprocessing_fn,        # type: Any
                training_data=None,      # type: Union[None, str]
                evaluation_data=None,    # type: Union[None, str]
                transform_fn_dir=None,   # type: Union[None, str]
                compression_type=None    # type: str
               ):
  # type: (...) -> PipelineState
  """
  Generic tf.transform pipeline that takes tf.{example, record} training and
  evaluation datasets and outputs transformed data together with the transform
  function Saved Model.

  :param pipeline_args: un-parsed Dataflow arguments
  :param temp_location: temporary location for dataflow job working dir
  :param schema_file: path to the raw feature schema text file
  :param output_dir: output dir for transformed data and function
  :param preprocessing_fn: tf.transform preprocessing function
  :param training_data: path to the training data
  :param evaluation_data: path to the evaluation data
  :param transform_fn_dir: dir to previously saved transformation function to apply
  :param compression_type: compression type for writing of tf.records
  :return: final state of the Beam pipeline
  """
  assert_not_empty_string(temp_location)
  assert_not_empty_string(schema_file)
  assert_not_empty_string(output_dir)
  assert_not_none(preprocessing_fn)

  if compression_type is None:
    compression_type = CompressionTypes.AUTO

  raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
  raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

  transformed_train_output_dir = os.path.join(output_dir, "training")
  transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

  if not any(i.startswith("--job_name") for i in pipeline_args):
    pipeline_args.append("--job_name=tf-transform-{}-{}".format(
        getpass.getuser(), int(time.time())))

  pipeline = beam.Pipeline(argv=pipeline_args)
  with beam_impl.Context(temp_dir=temp_location):
    if training_data is not None:
      # if training data is provided, transform_fn_dir will be ignored
      if transform_fn_dir is not None:
        warnings.warn(
            "transform_fn_dir is ignored because training_data is provided")

      transform_fn_output = os.path.join(output_dir, "transform_fn",
                                         "saved_model.pb")
      if FileSystems.exists(transform_fn_output):
        raise ValueError("Transform fn already exists at %s!"
                         % transform_fn_output)

      # compute the transform_fn and apply to the training data
      raw_train_data = (
          pipeline
          | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
              training_data, coder=raw_data_coder))

      ((transformed_train_data, transformed_train_metadata), transform_fn) = (
          (raw_train_data, raw_data_metadata)
          | "AnalyzeAndTransformTrainData" >>
          beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

      _ = (
          transform_fn
          | "WriteTransformFn" >> transform_fn_io.WriteTransformFn(output_dir))

      transformed_train_coder = ExampleProtoCoder(
          transformed_train_metadata.schema)
      _ = (
          transformed_train_data
          | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
              os.path.join(transformed_train_output_dir, "part"),
              coder=transformed_train_coder,
              compression_type=compression_type,
              file_name_suffix=".tfrecords"))
    else:
      if transform_fn_dir is None:
        raise ValueError(
            "Either training_data or transform_fn_dir needs to be provided")
      # load the transform_fn
      transform_fn = pipeline | transform_fn_io.ReadTransformFn(
          transform_fn_dir)

    if evaluation_data is not None:
      # if evaluation_data exists, apply the transform_fn to the evaluation data
      raw_eval_data = (
          pipeline
          | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
              evaluation_data, coder=raw_data_coder))

      (transformed_eval_data, transformed_eval_metadata) = (
          ((raw_eval_data, raw_data_metadata), transform_fn)
          | "TransformEvalData" >> beam_impl.TransformDataset())

      transformed_eval_coder = ExampleProtoCoder(
          transformed_eval_metadata.schema)
      _ = (
          transformed_eval_data
          | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
              os.path.join(transformed_eval_output_dir, "part"),
              coder=transformed_eval_coder,
              compression_type=compression_type,
              file_name_suffix=".tfrecords"))

  result = pipeline.run().wait_until_finish()
  return result
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transform_fn_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data shards
    transformed_test_filebase: Base filename for transformed test data shards
    transform_fn_dir: Directory where metadata for transform function should be written
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))
      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_COLUMN: review_bow_indices,
            REVIEW_WEIGHT: review_weight,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transform_fn
          | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
              transform_fn_dir))
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(
          key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with beam_impl.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema)
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            | 'CleanData' >> beam.Map(
                taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec))

      if transform_dir is None:
        transform_fn = (
            (raw_data, raw_data_metadata)
            | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
        _ = (
            transform_fn
            | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
                working_dir))
      else:
        transform_fn = pipeline | transform_fn_io.ReadTransformFn(
            transform_dir)

      # Shuffling the data before materialization will improve training
      # effectiveness downstream.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      (transformed_data, transformed_metadata) = (
          ((shuffled_data, raw_data_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix),
              file_name_suffix='.gz'))
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     only_check_core_metadata=False,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that transformed data and transformed metadata match
  expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a dict
      of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    only_check_core_metadata: A boolean to indicate if all elements in the
      transformed metadata are asserted to be equal to expected metadata. If
      True, only transformed feature names, dtypes and representations are
      asserted.
    expected_vocab_file_contents: (optional) A dictionary from vocab filenames
      to their expected content as a list of text lines. Values should be the
      expected result of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.logging.warn('expected_asset_file_contents is deprecated, use '
                    'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code, the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = self.get_temp_dir()
  with beam_impl.Context(
      temp_dir=temp_dir, desired_batch_size=desired_batch_size):
    if test_data is None:
      (transformed_data, transformed_metadata), transform_fn = (
          (input_data, input_metadata)
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
    else:
      transform_fn = ((input_data, input_metadata)
                      | beam_impl.AnalyzeDataset(preprocessing_fn))
      transformed_data, transformed_metadata = (
          ((test_data, input_metadata), transform_fn)
          | beam_impl.TransformDataset())

    # Write transform_fn so we can test its assets
    if expected_vocab_file_contents:
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

  if expected_data is not None:
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  if expected_metadata:
    # Now that the pipeline has run, transformed_metadata.deferred_metadata
    # should be a list containing a single DatasetMetadata with the full
    # metadata.
    assert len(transformed_metadata.deferred_metadata) == 1
    transformed_metadata = transformed_metadata.deferred_metadata[0]

    if only_check_core_metadata:
      # preprocessing_fn may add metadata to column schema only relevant to
      # internal implementation such as vocabulary_file. As such, only check
      # feature names, dtypes and representations are as expected.
      self.assertSameElements(
          transformed_metadata.schema.column_schemas.keys(),
          expected_metadata.schema.column_schemas.keys())
      for k, v in transformed_metadata.schema.column_schemas.iteritems():
        expected_schema = expected_metadata.schema.column_schemas[k]
        self.assertEqual(expected_schema.representation, v.representation,
                         "representation doesn't match for feature '%s'" % k)
        self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                         "dtype doesn't match for feature '%s'" % k)
    else:
      # Check the entire DatasetMetadata is as expected.
      # Use extra assertEqual for schemas, since full metadata assertEqual
      # error message is not conducive to debugging.
      self.assertEqual(expected_metadata.schema.column_schemas,
                       transformed_metadata.schema.column_schemas)
      self.assertEqual(expected_metadata, transformed_metadata)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    with tf.gfile.Open(full_filename) as f:
      self.assertEqual(f.readlines(), file_contents)
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     only_check_core_metadata=False,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that transformed data and transformed metadata match
  expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a dict
      of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    only_check_core_metadata: A boolean to indicate if all elements in the
      transformed metadata are asserted to be equal to expected metadata. If
      True, only transformed feature names, dtypes and representations are
      asserted.
    expected_vocab_file_contents: (optional) A dictionary from vocab filenames
      to their expected content as a list of text lines or a list of tuples of
      frequency and text. Values should be the expected result of calling
      f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.logging.warn('expected_asset_file_contents is deprecated, use '
                    'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code, the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = tempfile.mkdtemp(prefix=self._testMethodName,
                              dir=self.get_temp_dir())
  with beam_pipeline or beam.Pipeline(runner=self._makeRunner()) as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  if expected_data is not None:
    examples = tf.python_io.tf_record_iterator(path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    transformed_metadata = tf_transform_output.transformed_metadata

    if only_check_core_metadata:
      # preprocessing_fn may add metadata to column schema only relevant to
      # internal implementation such as vocabulary_file. As such, only check
      # feature names, dtypes and representations are as expected.
      self.assertSameElements(
          transformed_metadata.schema.column_schemas.keys(),
          expected_metadata.schema.column_schemas.keys())

      for k, v in transformed_metadata.schema.column_schemas.iteritems():
        expected_schema = expected_metadata.schema.column_schemas[k]
        self.assertEqual(expected_schema.representation, v.representation,
                         "representation doesn't match for feature '%s'" % k)
        self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                         "dtype doesn't match for feature '%s'" % k)
    else:
      # Check the entire DatasetMetadata is as expected.
      # Use extra assertEqual for schemas, since full metadata assertEqual
      # error message is not conducive to debugging.
      self.assertEqual(expected_metadata.schema.column_schemas,
                       transformed_metadata.schema.column_schemas)
      self.assertEqual(expected_metadata, transformed_metadata)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    with tf.gfile.Open(full_filename) as f:
      file_lines = f.readlines()

      # Store frequency case.
      if isinstance(file_contents[0], tuple):
        word_and_frequency_list = []
        for content in file_lines:
          frequency, word = content.split(' ', 1)
          word_and_frequency_list.append(
              (word.strip('\n'), float(frequency.strip('\n'))))
        self.assertAllEqual(
            zip(*word_and_frequency_list)[0], zip(*file_contents)[0])
        np.testing.assert_almost_equal(
            zip(*word_and_frequency_list)[1], zip(*file_contents)[1])
      else:
        file_lines = [content.strip('\n') for content in file_lines]
        self.assertAllEqual(file_lines, file_contents)
def preprocess(in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = ('preprocess-taxi-features' + '-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000

  options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'max_num_workers': 24,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
  })
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.int64, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'hourofday,passengers'.split(',')
  })
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(raw_data_schema))

  # run Beam
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (raw_data_metadata
           | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
               os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
               pipeline=p))

      # analyze and transform training
      raw_data = (
          p
          | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(
              query=create_query(1, EVERY_N), use_standard_sql=True))
          | 'train_filter' >> beam.Filter(is_valid))
      raw_dataset = (raw_data, raw_data_metadata)

      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # transform eval data
      raw_test_data = (
          p
          | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(
              query=create_query(2, EVERY_N), use_standard_sql=True))
          | 'eval_filter' >> beam.Filter(is_valid))
      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
               os.path.join(OUTPUT_DIR, 'metadata')))
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
  """Runs the whole preprocessing step.

  This runs the feature extraction PTransform, validates that the data
  conforms to the schema provided, normalizes the features, and splits the
  dataset into a training and evaluation dataset.
  """

  # Populate optional arguments
  if not feature_scaling:
    feature_scaling = lambda inputs: inputs

  # Type checking
  if not isinstance(labels, list):
    raise ValueError('`labels` must be list(str). '
                     'Given: {} {}'.format(labels, type(labels)))

  if not isinstance(feature_extraction, beam.PTransform):
    raise ValueError('`feature_extraction` must be {}. '
                     'Given: {} {}'.format(beam.PTransform, feature_extraction,
                                           type(feature_extraction)))

  if not callable(feature_scaling):
    raise ValueError('`feature_scaling` must be callable. '
                     'Given: {} {}'.format(feature_scaling,
                                           type(feature_scaling)))

  if beam_options and not isinstance(beam_options, PipelineOptions):
    raise ValueError('`beam_options` must be {}. '
                     'Given: {} {}'.format(PipelineOptions, beam_options,
                                           type(beam_options)))

  if not work_dir:
    work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

  tft_temp_dir = os.path.join(work_dir, 'tft-temp')
  train_dataset_dir = os.path.join(work_dir, 'train-dataset')
  eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

  transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
  if tf.io.gfile.exists(transform_fn_dir):
    tf.io.gfile.rmtree(transform_fn_dir)

  # [START dataflow_molecules_create_pipeline]
  # Build and run a Beam Pipeline
  with beam.Pipeline(options=beam_options) as p, \
       beam_impl.Context(temp_dir=tft_temp_dir):
    # [END dataflow_molecules_create_pipeline]

    # [START dataflow_molecules_feature_extraction]
    # Transform and validate the input data matches the input schema
    dataset = (
        p
        | 'Feature extraction' >> feature_extraction
        # [END dataflow_molecules_feature_extraction]
        # [START dataflow_molecules_validate_inputs]
        | 'Validate inputs' >> beam.ParDo(
            ValidateInputData(input_feature_spec)))
    # [END dataflow_molecules_validate_inputs]

    # [START dataflow_molecules_analyze_and_transform_dataset]
    # Apply the tf.Transform preprocessing_fn
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(input_feature_spec))

    dataset_and_metadata, transform_fn = (
        (dataset, input_metadata)
        | 'Feature scaling' >> beam_impl.AnalyzeAndTransformDataset(
            feature_scaling))
    dataset, metadata = dataset_and_metadata
    # [END dataflow_molecules_analyze_and_transform_dataset]

    # [START dataflow_molecules_split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
    train_dataset, eval_dataset = (
        dataset
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END dataflow_molecules_split_to_train_and_eval_datasets]

    # [START dataflow_molecules_write_tfrecords]
    # Write the datasets as TFRecords
    coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

    train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
    _ = (
        train_dataset
        | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
            train_dataset_prefix, coder))

    eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
    _ = (
        eval_dataset
        | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
            eval_dataset_prefix, coder))

    # Write the transform_fn
    _ = (
        transform_fn
        | 'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
    # [END dataflow_molecules_write_tfrecords]

  return PreprocessData(
      input_feature_spec,
      labels,
      train_dataset_prefix + '*',
      eval_dataset_prefix + '*')
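A minimal sketch of how the artifacts written by WriteTransformFn(work_dir) might be consumed at training time; `work_dir` and `train_pattern` are assumed to be the directory passed to run() above and the returned training-file pattern, and the helper name is illustrative only.

import tensorflow as tf
import tensorflow_transform as tft


def make_training_input_fn(work_dir, train_pattern, batch_size=64):
    """Builds an input_fn that parses the transformed TFRecords."""
    # Load the transform output written by WriteTransformFn above.
    tf_transform_output = tft.TFTransformOutput(work_dir)
    feature_spec = tf_transform_output.transformed_feature_spec()

    def input_fn():
        # Parse the transformed examples into batched feature dicts.
        return tf.data.experimental.make_batched_features_dataset(
            file_pattern=train_pattern,
            batch_size=batch_size,
            features=feature_spec,
            shuffle=True)

    return input_fn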
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(outputs[key])

    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature. This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the
    # feature from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
      tft.vocabulary(inputs[key], vocab_filename=key)

    # For the label column we provide the mapping from string to index.
    table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K'])
    outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.coders.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

      # Read in raw data and convert using CSV converter. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are just
      # to get data into a format that the CSV converter can read, in
      # particular removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors in
      # converter.decode, which should only occur for the trailing blank line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already used
      # the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply transform function to test data. In this case we remove the
      # trailing period at the end of each line, and also ignore the header
      # line that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(
              test_data_file, skip_header_lines=1)
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
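A sketch of how a trainer might pick up the vocabularies generated by the tft.vocabulary calls above; `working_dir` and `CATEGORICAL_FEATURE_KEYS` refer to the names used in the snippet, and the feature-column approach shown here is one option, not the only one.

import tensorflow as tf
import tensorflow_transform as tft

tf_transform_output = tft.TFTransformOutput(working_dir)

# One categorical column per vocabulary written by tft.vocabulary above;
# vocab_filename=key means the lookup by name uses the feature key itself.
categorical_columns = [
    tf.feature_column.categorical_column_with_vocabulary_file(
        key=key,
        vocabulary_file=tf_transform_output.vocabulary_file_by_name(key))
    for key in CATEGORICAL_FEATURE_KEYS
]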
        # write validation dataset
        _ = (
            test
            | 'Encode & write test -> TFRecords' >> tfrecordio.WriteToTFRecord(
                file_path_prefix=os.path.join(
                    args.data_dir, 'tfrecords', args.output_dir,
                    EVAL_FILES_PATTERN),
                coder=transformed_data_coder,
                file_name_suffix='.gz',
                num_shards=1,
                compression_type=beam.io.filesystem.CompressionTypes.GZIP))

        # write the transform_fn
        _ = (
            transform_fn
            | 'Write transformFn' >> transform_fn_io.WriteTransformFn(
                os.path.join(args.data_dir, 'tfrecords', args.output_dir)))
    else:
        predictions = (
            prepared_samples
            | 'Predict' >> beam.ParDo(
                Predict(model_dir=os.path.join(
                    args.data_dir, 'models', args.model_dir))))
        _ = predictions | 'Print predictions' >> beam.Map(print)
        '''
        _ = (
            predictions
            | 'Write to BQ' >> beam.io.WriteToBigQuery(
                table=PRED_TABLE,
                schema={
                    'fields': [
                        {'name': 'item_number', 'type': 'INTEGER', 'mode': 'REQUIRED'},
                        {'name': 'pred_date', 'type': 'DATE', 'mode': 'REQUIRED'},
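An assumed illustration of reading back the gzipped eval TFRecords written at the top of the fragment above; the path mirrors the file_path_prefix and '.gz' suffix used in the write, and `args`/`EVAL_FILES_PATTERN` are the same names the fragment relies on.

import os
import tensorflow as tf

# Glob the sharded, gzip-compressed eval files and decompress while reading.
eval_files = os.path.join(args.data_dir, 'tfrecords', args.output_dir,
                          EVAL_FILES_PATTERN + '*.gz')
dataset = tf.data.TFRecordDataset(
    tf.io.gfile.glob(eval_files), compression_type='GZIP')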
def run_transform(output_dir, schema, train_data_file, eval_data_file,
                  project, mode, preprocessing_fn=None):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder.
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: 'local' or 'cloud', whether the job should run locally or on
      Dataflow.
    preprocessing_fn: a function used to preprocess the raw data. If not
      specified, a function will be automatically inferred from the schema.
  """
  tft_input_metadata = make_tft_input_metadata(schema)
  temp_dir = os.path.join(output_dir, 'tmp')
  preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

  if mode == 'local':
    pipeline_options = None
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'pipeline-tft-' +
                    datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
        'temp_location': temp_dir,
        'project': project,
        'extra_packages': [
            'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
        ]
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DataflowRunner'
  else:
    raise ValueError('Invalid mode %s.' % mode)

  with beam.Pipeline(runner, options=pipeline_options) as p:
    with beam_impl.Context(temp_dir=temp_dir):
      names = [x['name'] for x in schema]
      converter = CsvCoder(names, tft_input_metadata.schema)
      train_data = (
          p
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      train_dataset = (train_data, tft_input_metadata)
      transformed_dataset, transform_fn = (
          train_dataset
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      # Writes transformed_metadata and transform_fn folders
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(output_dir))

      # Write the raw_metadata
      metadata_io.write_metadata(
          metadata=tft_input_metadata,
          path=os.path.join(output_dir, 'metadata'))

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'train'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

      eval_data = (
          p
          | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
          | 'DecodeEvalData' >> beam.Map(converter.decode))

      eval_dataset = (eval_data, tft_input_metadata)
      transformed_eval_dataset = (
          (eval_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_eval_data, transformed_metadata = transformed_eval_dataset

      _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'eval'),
          coder=ExampleProtoCoder(transformed_metadata.schema))
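A hypothetical invocation of run_transform() in local mode. The schema entry format (a list of dicts with a 'name' key) is inferred from the `[x['name'] for x in schema]` usage above; the 'type' values and file paths shown here are placeholders, not values from the original pipeline.

# Placeholder schema and paths, for illustration only.
schema = [
    {'name': 'age', 'type': 'NUMBER'},
    {'name': 'workclass', 'type': 'CATEGORY'},
    {'name': 'label', 'type': 'CATEGORY'},
]

run_transform(
    output_dir='/tmp/tft-output',
    schema=schema,
    train_data_file='/tmp/train*.csv',
    eval_data_file='/tmp/eval*.csv',
    project=None,   # only needed for mode='cloud'
    mode='local')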
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None,
                                     force_tf_compat_v1=True):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that the transformed data and transformed metadata match
  expected_data and expected_metadata.

  Args:
    input_data: Input data formatted in one of two ways:
      * A sequence of dicts whose values are one of: strings, lists of
        strings, numeric types or a pair of those. Must have at least one key
        so that we can infer the batch size, or
      * A sequence of pa.RecordBatch.
    input_metadata: One of -
      * DatasetMetadata describing input_data if `input_data` are dicts.
      * TensorAdapterConfig otherwise.
    preprocessing_fn: A function taking a dict of tensors and returning a dict
      of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab filenames
      to their expected content as a list of text lines or a list of tuples of
      frequency and text. Values should be the expected result of calling
      f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data. Note
      that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.
    force_tf_compat_v1: A `Boolean`. If `True`, TFT's public APIs use
      Tensorflow in compat.v1 mode.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn(
        'expected_asset_file_contents is deprecated, use '
        'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two transforms.
  # If in future versions of the code, the implementation differs, we should
  # also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(
      prefix=self._testMethodName, dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size,
                           force_tf_compat_v1=force_tf_compat_v1):
      input_data = pipeline | 'CreateInput' >> beam.Create(
          input_data, reshuffle=False)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    shapes = {
        f.name: [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
        for f in transformed_metadata.schema.feature
    }
    transformed_data = [
        _format_example_as_numpy_dict(e, shapes) for e in examples
    ]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(tf_transform_output.transformed_metadata.schema)
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(expected_metadata.schema, transformed_schema)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
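A minimal, self-contained illustration of how this helper might be invoked from a test case derived from the same test class; the feature name 'x', its values, and the expected results are invented for the example.

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils


def preprocessing_fn(inputs):
    # Rescale 'x' to the [0, 1] range using a full-pass analyzer.
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}


input_data = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
input_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))
# min=1, max=3, so the scaled values are 0.0, 0.5 and 1.0.
expected_data = [{'x_scaled': 0.0}, {'x_scaled': 0.5}, {'x_scaled': 1.0}]

self.assertAnalyzeAndTransformResults(
    input_data, input_metadata, preprocessing_fn, expected_data)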
def run(input_schema,
        labels,
        preprocessing_ptransform,
        full_pass_preprocessing_fn=None,
        eval_percent=20.0,
        beam_options=None,
        temp_dir=None,
        tft_temp_dir=None,
        train_dataset_dir=None,
        eval_dataset_dir=None):
  """Runs the whole preprocessing step.

  This runs the preprocessing PTransform, validates that the data conforms to
  the schema provided, does the full-pass preprocessing step and generates the
  input functions needed to train and evaluate the TensorFlow model.
  """

  # Populate optional arguments
  if not full_pass_preprocessing_fn:
    full_pass_preprocessing_fn = lambda inputs: inputs

  if not temp_dir:
    temp_dir = tempfile.mkdtemp(prefix='tensorflow_model')

  if not tft_temp_dir:
    tft_temp_dir = os.path.join(temp_dir, 'tft_temp')

  if not train_dataset_dir:
    train_dataset_dir = os.path.join(temp_dir, 'train_dataset')

  if not eval_dataset_dir:
    eval_dataset_dir = os.path.join(temp_dir, 'eval_dataset')

  # Type checking
  if not isinstance(labels, list):
    raise ValueError('`labels` must be list(str). '
                     'Given: {} {}'.format(labels, type(labels)))

  if not isinstance(preprocessing_ptransform, beam.PTransform):
    raise ValueError('`preprocessing_ptransform` must be {}. '
                     'Given: {} {}'.format(beam.PTransform,
                                           preprocessing_ptransform,
                                           type(preprocessing_ptransform)))

  if not callable(full_pass_preprocessing_fn):
    raise ValueError('`full_pass_preprocessing_fn` must be callable. '
                     'Given: {} {}'.format(full_pass_preprocessing_fn,
                                           type(full_pass_preprocessing_fn)))

  if beam_options and not isinstance(beam_options, PipelineOptions):
    raise ValueError('`beam_options` must be {}. '
                     'Given: {} {}'.format(PipelineOptions, beam_options,
                                           type(beam_options)))

  if tf.gfile.Exists(temp_dir):
    tf.gfile.DeleteRecursively(temp_dir)

  # Build and run a Beam Pipeline
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(input_schema))

  with beam.Pipeline(options=beam_options) as p, \
       beam_impl.Context(temp_dir=tft_temp_dir):

    # Transform and validate the input data matches the input schema
    dataset = (
        p
        | 'Preprocessing' >> preprocessing_ptransform
        | 'ValidateInputData' >> beam.ParDo(ValidateInputData(input_metadata)))

    # Apply the tf.Transform preprocessing_fn
    dataset_and_metadata, transform_fn = (
        (dataset, input_metadata)
        | 'FullPassPreprocessing' >> beam_impl.AnalyzeAndTransformDataset(
            full_pass_preprocessing_fn))
    dataset, metadata = dataset_and_metadata

    # Split the dataset into a training set and an evaluation set
    assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
    train_dataset, eval_dataset = (
        dataset
        | 'SplitDataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))

    # Write the datasets as TFRecords
    coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

    train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
    _ = (
        train_dataset
        | 'WriteTrainDataset' >> tfrecordio.WriteToTFRecord(
            train_dataset_prefix, coder))

    eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
    _ = (
        eval_dataset
        | 'WriteEvalDataset' >> tfrecordio.WriteToTFRecord(
            eval_dataset_prefix, coder))

    # Write the transform_fn
    _ = (
        transform_fn
        | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(temp_dir))

  return PreprocessData(
      labels,
      input_metadata.schema.as_feature_spec(),
      metadata.schema.as_feature_spec(),
      os.path.join(temp_dir, transform_fn_io.TRANSFORM_FN_DIR),
      train_dataset_prefix + '*',
      eval_dataset_prefix + '*')
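A hedged sketch of applying the transform graph written to `temp_dir` above at serving time; `raw_feature_spec` stands in for input_metadata.schema.as_feature_spec(), and the receiver-fn pattern shown here is one common TF1/Estimator-style option rather than code from the original project.

import tensorflow as tf
import tensorflow_transform as tft


def example_serving_receiver_fn(temp_dir, raw_feature_spec):
    # Load the SavedModel/metadata written by WriteTransformFn(temp_dir).
    tf_transform_output = tft.TFTransformOutput(temp_dir)

    # Accept serialized tf.Example protos, parse with the raw feature spec,
    # then apply the saved transformation to get model-ready features.
    serialized_examples = tf.compat.v1.placeholder(tf.string, shape=[None])
    raw_features = tf.io.parse_example(serialized_examples, raw_feature_spec)
    transformed_features = tf_transform_output.transform_raw_features(
        raw_features)

    return tf.estimator.export.ServingInputReceiver(
        transformed_features, {'examples': serialized_examples})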
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that the transformed data and transformed metadata match
  expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a dict
      of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab filenames
      to their expected content as a list of text lines or a list of tuples of
      frequency and text. Values should be the expected result of calling
      f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data. Note
      that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn(
        'expected_asset_file_contents is deprecated, use '
        'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two transforms.
  # If in future versions of the code, the implementation differs, we should
  # also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(
      prefix=self._testMethodName, dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    self.assertEqual(expected_metadata,
                     tf_transform_output.transformed_metadata)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    with tf.io.gfile.GFile(full_filename, 'rb') as f:
      file_lines = f.readlines()

    # Store frequency case.
    if isinstance(file_contents[0], tuple):
      word_and_frequency_list = []
      for content in file_lines:
        frequency, word = content.split(b' ', 1)
        word_and_frequency_list.append(
            (word.strip(b'\n'), float(frequency.strip(b'\n'))))
      expected_words, expected_frequency = zip(*word_and_frequency_list)
      actual_words, actual_frequency = zip(*file_contents)
      self.assertAllEqual(expected_words, actual_words)
      np.testing.assert_almost_equal(expected_frequency, actual_frequency)
    else:
      file_lines = [content.strip(b'\n') for content in file_lines]
      self.assertAllEqual(file_lines, file_contents)
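An illustrative invocation of this helper that exercises the vocabulary check at the end; the feature name 'color', its values, and the vocab filename are invented for the example, and the expected ordering assumes the usual descending-frequency layout of vocabularies written by tft.vocabulary.

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils


def preprocessing_fn(inputs):
    # Compute a vocabulary asset; the feature itself is passed through.
    tft.vocabulary(inputs['color'], vocab_filename='color_vocab')
    return {'color': inputs['color']}


input_data = [{'color': 'red'}, {'color': 'blue'}, {'color': 'red'}]
input_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'color': tf.io.FixedLenFeature([], tf.string)}))

# 'red' occurs twice, 'blue' once, so it comes first in the vocab file.
self.assertAnalyzeAndTransformResults(
    input_data, input_metadata, preprocessing_fn,
    expected_vocab_file_contents={'color_vocab': [b'red', b'blue']})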
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None,
                                     use_tfxio=False,
                                     input_data_is_tfxio_format=False):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that the transformed data and transformed metadata match
  expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a dict
      of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab filenames
      to their expected content as a list of text lines or a list of tuples of
      frequency and text. Values should be the expected result of calling
      f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data. Note
      that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.
    use_tfxio: If True, invoke AnalyzeAndTransformDataset using the new API
      that accepts standardized inputs (Arrow `RecordBatch`es). Otherwise use
      the old API that accepts Dicts.
    input_data_is_tfxio_format: If True, `input_data` and `test_data` are
      Arrow `RecordBatch`es and the `input_metadata` is
      `tfxio.tensor_adapter.TensorAdapterConfig`. Otherwise the input data is
      a list of Dicts and input_metadata is a `DatasetMetadata`.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn(
        'expected_asset_file_contents is deprecated, use '
        'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  if not use_tfxio and input_data_is_tfxio_format:
    raise ValueError('Unable to feed TFXIO input format to the old, '
                     'non-TFXIO API.')
  compatibility_tfxio_needed = use_tfxio and not input_data_is_tfxio_format

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two transforms.
  # If in future versions of the code, the implementation differs, we should
  # also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(
      prefix=self._testMethodName, dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size,
                           use_tfxio=use_tfxio):
      input_data = pipeline | 'CreateInput' >> beam.Create(
          input_data, reshuffle=False)
      if compatibility_tfxio_needed:
        legacy_input_metadata = input_metadata
        input_data, input_metadata = self.convert_to_tfxio_api_inputs(
            input_data, input_metadata, label='input_data')
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        if compatibility_tfxio_needed:
          test_data, _ = self.convert_to_tfxio_api_inputs(
              test_data, legacy_input_metadata, label='test_data')
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(tf_transform_output.transformed_metadata.schema)
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(expected_metadata.schema, transformed_schema)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
    _ = norm_ts_windows_eval_data | 'Write TFrecords - eval' >> beam.io.tfrecordio.WriteToTFRecord(
        file_path_prefix=eval_tfrecord_path,
        file_name_suffix=".tfrecords",
        coder=example_proto_coder.ExampleProtoCoder(
            norm_ts_windows_eval_metadata.schema))

    # Dump raw eval set for further tensorflow model analysis
    _ = ts_windows_eval | 'Write TFrecords - eval raw' >> beam.io.tfrecordio.WriteToTFRecord(
        file_path_prefix=eval_raw_tfrecord_path,
        file_name_suffix=".tfrecords",
        coder=example_proto_coder.ExampleProtoCoder(
            ts_windows_schema.schema))

    # Dump transformation graph
    _ = transform_fn | 'Dump Transform Function Graph' >> transform_fn_io.WriteTransformFn(
        known_args.tft_artifacts_dir)

# Dump parameters to be forwarded to the next pipeline step
with open("/train_tfrecord_path.txt", "w") as f:
    f.write(train_tfrecord_path + '-*')

with open("/eval_tfrecord_path.txt", "w") as f:
    f.write(eval_tfrecord_path + '-*')

with open("/eval_raw_tfrecord_path.txt", "w") as f:
    f.write(eval_raw_tfrecord_path + '*')

with open("/znorm_stats.txt", "w") as f:
    json.dump(znorm_stats, f)

with open("/n_areas.txt", "w") as f:
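A small sketch, not taken from the original pipeline, of how a downstream component could read the parameters dumped to the local files above; the file names match those used in the writes.

import json

with open('/train_tfrecord_path.txt') as f:
    train_pattern = f.read().strip()

with open('/eval_tfrecord_path.txt') as f:
    eval_pattern = f.read().strip()

with open('/znorm_stats.txt') as f:
    znorm_stats = json.load(f)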
def preprocess(query, in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = ('preprocess-babyweight-features' + '-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/babyweight/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())

  options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'max_num_workers': 24,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for colname in
      'key,is_male,mother_race,mother_married,cigarette_use,alcohol_use'.split(',')
  }
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for colname in
      'weight_pounds,mother_age,plurality,gestation_weeks'.split(',')
  })
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(raw_data_schema))

  def read_rawdata(p, step, test_mode):
    if step == 'train':
      selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) < 3'.format(query)
    else:
      selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) = 3'.format(query)
    if in_test_mode:
      selquery = selquery + ' LIMIT 100'
    # print('Processing {} data from {}'.format(step, selquery))
    return (p
            | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(
                query=selquery, use_standard_sql=True))
            | '{}_cleanup'.format(step) >> beam.FlatMap(cleanup))

  # run Beam
  p = beam.Pipeline(RUNNER, options=opts)
  with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
    # analyze and transform training
    raw_data = read_rawdata(p, 'train', in_test_mode)
    raw_dataset = (raw_data, raw_data_metadata)
    transformed_dataset, transform_fn = (
        raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
    transformed_data, transformed_metadata = transformed_dataset
    _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
        os.path.join(OUTPUT_DIR, 'train'),
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    # transform eval data
    raw_test_data = read_rawdata(p, 'eval', in_test_mode)
    raw_test_dataset = (raw_test_data, raw_data_metadata)
    transformed_test_dataset = (
        (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
    transformed_test_data, _ = transformed_test_dataset
    _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
        os.path.join(OUTPUT_DIR, 'eval'),
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    _ = (transform_fn
         | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
             os.path.join(OUTPUT_DIR, 'metadata')))

  job = p.run()
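The preprocess_tft function passed to AnalyzeAndTransformDataset above is not shown in this snippet; the following is a hypothetical sketch of what such a function could look like for the raw schema defined above (strings plus float features). The specific transforms chosen here are illustrative, not the original ones, and tft.compute_and_apply_vocabulary was called tft.string_to_int in older tf.Transform releases.

import tensorflow_transform as tft


def preprocess_tft(inputs):
    # Hypothetical preprocessing_fn; feature names come from raw_data_schema.
    outputs = {}

    # Pass the label and the row key through unchanged.
    outputs['weight_pounds'] = inputs['weight_pounds']
    outputs['key'] = inputs['key']

    # Scale numeric features to [0, 1] with a full-pass analyzer.
    for key in ['mother_age', 'plurality', 'gestation_weeks']:
        outputs[key] = tft.scale_to_0_1(inputs[key])

    # Map string features to integer ids via generated vocabularies.
    for key in ['is_male', 'mother_race', 'mother_married',
                'cigarette_use', 'alcohol_use']:
        outputs[key] = tft.compute_and_apply_vocabulary(inputs[key])

    return outputs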