def run_transformation_pipeline(args, options):
  options = beam.pipeline.PipelineOptions(flags=[], **options)

  print("Sink transformed data files: {}".format(args.transform_test_prefix))
  print("Sink transform artefacts directory: {}".format(
      params.TRANSFORM_ARTIFACTS_DIR))
  print("Temporary directory: {}".format(params.TEMP_DIR))
  print("")

  with beam.Pipeline(runner, options=options) as pipeline:
    with impl.Context(params.TEMP_DIR):
      raw_metadata = featurizer.create_raw_metadata()
      converter = tft_coders.csv_coder.CsvCoder(
          column_names=metadata.RAW_FEATURE_NAMES,
          delimiter=params.RAW_DATA_DELIMITER,
          schema=raw_metadata.schema)

      ###### analyze & transform train ######################################
      if runner == 'DirectRunner':
        print("Transform training data....")

      step = 'train'
      # Read raw train data from csv files.
      raw_train_data = (
          pipeline
          | '{} - Read Raw Data'.format(step) >>
          beam.io.textio.ReadFromText(args.raw_train_file)
          | '{} - Remove Empty Rows'.format(step) >>
          beam.Filter(lambda line: line)
          | '{} - Fix Commas & Filter Third Column'.format(step) >>
          beam.Map(fix_comma_and_filter_third_column)
          | '{} - Decode CSV Data'.format(step) >>
          MapAndFilterErrors(converter.decode))

      # Create a train dataset from the data and schema.
      raw_train_dataset = (raw_train_data, raw_metadata)

      # Analyze and transform raw_train_dataset to produce
      # transformed_train_dataset and transform_fn.
      transformed_train_dataset, transform_fn = (
          raw_train_dataset
          | '{} - Analyze & Transform'.format(step) >>
          impl.AnalyzeAndTransformDataset(preprocess))

      # Get data and schema separately from the transformed_train_dataset.
      transformed_train_data, transformed_metadata = transformed_train_dataset

      ###### transform test #################################################
      if runner == 'DirectRunner':
        print("Transform test data....")

      step = 'test'
      raw_test_data = (
          pipeline
          | '{} - Read Raw Data'.format(step) >>
          beam.io.textio.ReadFromText(args.raw_test_file)
          | '{} - Remove Empty Lines'.format(step) >>
          beam.Filter(lambda line: line)
          | '{} - Fix Commas & Filter Third Column'.format(step) >>
          beam.Map(fix_comma_and_filter_third_column)
          | '{} - Decode CSV Data'.format(step) >>
          MapAndFilterErrors(converter.decode))

      # Create a test dataset from the data and schema.
      raw_test_dataset = (raw_test_data, raw_metadata)

      # Transform the test data based on the transform_fn produced by
      # analyzing the train data.
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn)
          | '{} - Transform'.format(step) >> impl.TransformDataset())

      # Get data from the transformed_test_dataset.
      transformed_test_data, _ = transformed_test_dataset

      # Write transformed test data to sink.
      _ = (
          transformed_test_data
          | '{} - Write Transformed Data'.format(step) >>
          beam.io.tfrecordio.WriteToTFRecord(
              file_path_prefix=args.transform_test_prefix,
              file_name_suffix=".tfrecords",
              coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

  if runner == 'DataflowRunner':
    pipeline.run()
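
# MapAndFilterErrors is used above (and in later snippets) but never defined
# in these excerpts. Below is a minimal sketch, modeled on the tf.Transform
# census example's helper of the same name: a beam.Map variant that drops
# elements whose map_fn raises, while counting the bad elements. Treat it as
# an assumption about the helper's behavior, not the original code.
import apache_beam as beam


class MapAndFilterErrors(beam.PTransform):
  """Like beam.Map but filters out errors raised by the map_fn."""

  class _MapAndFilterErrorsDoFn(beam.DoFn):
    """Counts and drops elements for which fn raises an exception."""

    def __init__(self, fn):
      self._fn = fn
      self._bad_elements_counter = beam.metrics.Metrics.counter(
          'example', 'bad_elements')

    def process(self, element):
      try:
        yield self._fn(element)
      except Exception:  # pylint: disable=broad-except
        # Swallow the error and keep a metric of how many elements failed.
        self._bad_elements_counter.inc(1)

  def __init__(self, fn):
    super().__init__()
    self._fn = fn

  def expand(self, pcoll):
    return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))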
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline.
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  # 1) The schema can be either defined in-memory or read from a configuration
  # file; in this case we are creating the schema in-memory.
  input_schema = criteo.make_input_schema()

  # 2) Configure the coder to map the source file column names to a dictionary
  # of key -> tensor_proto with the appropriate type derived from the
  # input_schema.
  coder = criteo.make_tsv_coder(input_schema)

  # 3) Read from text using the coder.
  train_data = (
      pipeline
      | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
      | 'ParseTrainingCsv' >> beam.Map(coder.decode))

  evaluate_data = (
      pipeline
      | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
      | 'ParseEvalCsv' >> beam.Map(coder.decode))

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  # TODO(b/34231369) Remember to eventually also save the statistics.

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  _ = (train_dataset
       | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
       | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
       | 'WriteTraining' >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  _ = (evaluate_dataset
       | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
       | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
       | 'WriteEval' >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = criteo.make_input_schema(mode=predict_mode)
    tsv_coder = criteo.make_tsv_coder(predict_schema, mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
        | 'ParsePredictCsv' >> beam.Map(tsv_coder.decode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
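
# The helpers _Shuffle and _encode_as_b64_json are used above but not defined
# in this excerpt. Minimal sketches follow, written in the Criteo sample's
# conventions; treat both as assumptions about the helpers, not the original
# code.
import base64
import json
import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by grouping elements on random keys."""
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))


def _encode_as_b64_json(serialized_example):
  """Wraps a serialized tf.Example in the {'b64': ...} JSON format that
  Cloud ML Engine batch prediction expects for binary inputs."""
  return json.dumps(
      {'b64': base64.b64encode(serialized_example).decode('utf-8')})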
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE,
      or path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples.
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    max_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[key] = transform.scale_to_z_score(inputs[key])

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[key] = transform.string_to_int(
          inputs[key], top_k=taxi.VOCAB_SIZE, num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[key] = transform.bucketize(inputs[key],
                                         taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[key] = inputs[key]

    # Was this passenger a big tipper?
    def convert_label(label):
      taxi_fare = inputs[taxi.FARE_KEY]
      return tf.where(
          tf.is_nan(taxi_fare),
          tf.cast(tf.zeros_like(taxi_fare), tf.int64),
          # Test if the tip was > 20% of the fare.
          tf.cast(
              tf.greater(label, tf.multiply(taxi_fare, tf.constant(0.2))),
              tf.int64))

    outputs[taxi.LABEL_KEY] = transform.apply_function(
        convert_label, inputs[taxi.LABEL_KEY])
    return outputs

  raw_feature_spec = taxi.get_raw_feature_spec()
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with beam_impl.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder()
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

      raw_data |= 'CleanData' >> beam.Map(taxi.clean_raw_data_dict)

      transform_fn = (
          (raw_data, raw_data_metadata)
          | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))

      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(working_dir))

      # Shuffling the data before materialization will improve training
      # effectiveness downstream.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      (transformed_data, transformed_metadata) = (
          ((shuffled_data, raw_data_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (transformed_data
           | 'SerializeExamples' >> beam.Map(coder.encode)
           | 'WriteExamples' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, outfile_prefix),
               compression_type=beam.io.filesystem.CompressionTypes.GZIP))
def preprocess(query, in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = ('preprocess-babyweight-features' + '-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/babyweight/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())

  options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'max_num_workers': 24,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # Set up metadata.
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for colname in
      'key,is_male,mother_race,mother_married,cigarette_use,alcohol_use'.split(',')
  }
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for colname in
      'weight_pounds,mother_age,plurality,gestation_weeks'.split(',')
  })
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(raw_data_schema))

  def read_rawdata(p, step, test_mode):
    if step == 'train':
      selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) < 3'.format(
          query)
    else:
      selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) = 3'.format(
          query)
    if in_test_mode:
      selquery = selquery + ' LIMIT 100'
    # print('Processing {} data from {}'.format(step, selquery))
    return (p
            | '{}_read'.format(step) >> beam.io.Read(
                beam.io.BigQuerySource(query=selquery, use_standard_sql=True))
            | '{}_cleanup'.format(step) >> beam.FlatMap(cleanup))

  # Run Beam.
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # Analyze and transform training data.
      raw_data = read_rawdata(p, 'train', in_test_mode)
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Transform eval data.
      raw_test_data = read_rawdata(p, 'eval', in_test_mode)
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
               os.path.join(OUTPUT_DIR, 'metadata')))

  job = p.run()
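
# Hypothetical invocation sketch. PROJECT and BUCKET globals plus the
# preprocess_tft() and cleanup() helpers are assumed to be defined elsewhere
# in the notebook; the query below is a placeholder shaped the way
# read_rawdata() expects it (it must expose a hashmonth column, used above
# for the train/eval split).
query = """
SELECT
  weight_pounds, is_male, mother_age, mother_race, plurality,
  gestation_weeks, mother_married, cigarette_use, alcohol_use,
  ABS(FARM_FINGERPRINT(CONCAT(CAST(year AS STRING),
                              CAST(month AS STRING)))) AS hashmonth
FROM
  `publicdata.samples.natality`
WHERE
  year > 2000
"""
preprocess(query, in_test_mode=True)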
def test_single_phase_run_twice(self):
  cache_location = self._make_cache_location('input_cache_1',
                                             'output_cache_1')

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    _ = tft.vocabulary(inputs['s'])
    _ = tft.bucketize(inputs['x'], 2, name='bucketize')
    return {
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
    }

  input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.FixedLenFeature([], tf.float32),
          'y': tf.FixedLenFeature([], tf.float32),
          's': tf.FixedLenFeature([], tf.string),
      }))
  input_data_dict = {
      span_0_key: [{
          'x': -2,
          'y': 1,
          's': 'a',
      }, {
          'x': 4,
          'y': -4,
          's': 'a',
      }],
      span_1_key: input_data,
  }
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()
    transform_fn = ((flat_data, input_data_dict, input_metadata)
                    | (beam_impl.AnalyzeDatasetWithCache(
                        preprocessing_fn, cache_location)))

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | beam_impl.TransformDataset())

  transformed_data, unused_transformed_metadata = transformed_dataset

  expected_transformed_data = [
      {
          'x_mean': 6.0,
          'x_min': -2.0,
          'y_mean': -0.25,
          'y_min': -4.0,
      },
      {
          'x_mean': 6.0,
          'x_min': -2.0,
          'y_mean': -0.25,
          'y_min': -4.0,
      },
  ]
  self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

  transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
  _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

  for key in input_data_dict:
    key_cache_dir = os.path.join(cache_location.output_cache_dir, key)
    self.assertTrue(tf.gfile.IsDirectory(key_cache_dir))
    self.assertEqual(len(tf.gfile.ListDirectory(key_cache_dir)), 6)

  cache_location = self._make_cache_location('output_cache_1',
                                             'output_cache_2')

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()
    transform_fn = ((flat_data, input_data_dict, input_metadata)
                    | (beam_impl.AnalyzeDatasetWithCache(
                        preprocessing_fn, cache_location)))

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | beam_impl.TransformDataset())

  transformed_data, unused_transformed_metadata = transformed_dataset
  self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)
  self.assertFalse(tf.gfile.IsDirectory(cache_location.output_cache_dir))
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None,
                                     force_tf_compat_v1=True,
                                     output_record_batches=False):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that transformed data and transformed metadata match
  expected_data and expected_metadata.

  Args:
    input_data: Input data formatted in one of two ways:
      * A sequence of dicts whose values are one of: strings, lists of
        strings, numeric types or a pair of those. Must have at least one key
        so that we can infer the batch size, or
      * A sequence of pa.RecordBatch.
    input_metadata: One of -
      * DatasetMetadata describing input_data if `input_data` are dicts.
      * TensorAdapterConfig otherwise.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines or a list
      of tuples of frequency and text. Values should be the expected result
      of calling f.readlines() on the given asset files.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.
    force_tf_compat_v1: A bool. If `True`, TFT's public APIs use Tensorflow
      in compat.v1 mode.
    output_record_batches: (optional) A bool. If `True`, `TransformDataset`
      and `AnalyzeAndTransformDataset` output `pyarrow.RecordBatch`es;
      otherwise, they output instance dicts.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
  """
  expected_vocab_file_contents = expected_vocab_file_contents or {}

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(
      prefix=self._testMethodName, dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(
        temp_dir=temp_dir,
        desired_batch_size=desired_batch_size,
        force_tf_compat_v1=force_tf_compat_v1):
      input_data = pipeline | 'CreateInput' >> beam.Create(
          input_data, reshuffle=False)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn,
                output_record_batches=output_record_batches))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset(
                output_record_batches=output_record_batches))

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      transformed_data_path = os.path.join(temp_dir, 'transformed_data')
      if expected_data is not None:
        if output_record_batches:

          def record_batch_to_examples(data_batch):
            # Ignore unary pass-through features.
            record_batch, _ = data_batch
            return example_coder.RecordBatchToExamples(record_batch)

          encode_ptransform = beam.FlatMap(record_batch_to_examples)
        else:
          transformed_data_coder = tft.coders.ExampleProtoCoder(
              transformed_metadata.schema)
          encode_ptransform = beam.Map(transformed_data_coder.encode)

        _ = (transformed_data
             | encode_ptransform
             | beam.io.tfrecordio.WriteToTFRecord(
                 transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    shapes = {
        f.name: [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
        for f in transformed_metadata.schema.feature
    }
    transformed_data = [
        _format_example_as_numpy_dict(e, shapes) for e in examples
    ]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(
        tf_transform_output.transformed_metadata.schema)
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    # assertProtoEqual has a size limit on the length of the
    # serialized-as-text strings. Therefore, we first try to use
    # assertProtoEqual; if that fails we try to use assertEqual; if that
    # fails as well, we raise the exception from assertProtoEqual.
    try:
      compare.assertProtoEqual(self, expected_metadata.schema,
                               transformed_schema)
    except AssertionError as compare_exception:
      try:
        self.assertEqual(expected_metadata.schema, transformed_schema)
      except AssertionError:
        raise compare_exception

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
def transform_data(working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed
      data and metadata to.
  """
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
      train_data = (
          pipeline
          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
          | 'DecodeTrain' >> beam.Map(coder.decode))

      test_data = (
          pipeline
          | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'))
          | 'DecodeTest' >> beam.Map(coder.decode))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]
        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.compute_and_apply_vocabulary(
            review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by compute_and_apply_vocabulary.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, RAW_DATA_METADATA)
          | 'AnalyzeAndTransform' >>
          beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      transformed_test_data, _ = (
          ((test_data, RAW_DATA_METADATA), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (transformed_train_data
           | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
           | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
               os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      _ = (transformed_test_data
           | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
           | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
               os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(working_dir))
def preprocess(in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = ('preprocess-taxi-features' + '-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000

  options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'max_num_workers': 24,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # Set up metadata.
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for colname in
      'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
  })
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.int64, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'hourofday,passengers'.split(',')
  })
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(raw_data_schema))

  # Run Beam.
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # Save the raw data metadata.
      _ = (raw_data_metadata
           | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
               os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
               pipeline=p))

      # Analyze and transform training data.
      raw_data = (
          p
          | 'train_read' >> beam.io.Read(
              beam.io.BigQuerySource(
                  query=create_query(1, EVERY_N), use_standard_sql=True))
          | 'train_filter' >> beam.Filter(is_valid))
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Transform eval data.
      raw_test_data = (
          p
          | 'eval_read' >> beam.io.Read(
              beam.io.BigQuerySource(
                  query=create_query(2, EVERY_N), use_standard_sql=True))
          | 'eval_filter' >> beam.Filter(is_valid))
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
               os.path.join(OUTPUT_DIR, 'metadata')))
def write_to_tfrecord(args):
  """This function is supposed to be called as a script."""
  # Decode arguments.
  (current_index, num_shards, train_split_fname_out, eval_split_fname_out,
   exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord,
   working_dir, data_formatter_module_path) = args

  # num_shards = "32"
  current_index, num_shards = int(current_index), int(num_shards)

  split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
      train_split_fname_out, current_index, num_shards) + '*'
  split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
      eval_split_fname_out, current_index, num_shards)

  log.info('exp_log_data_file_train_tfrecord {}'.format(
      exp_log_data_file_train_tfrecord))
  log.info('exp_log_data_file_eval_tfrecord {}'.format(
      exp_log_data_file_eval_tfrecord))
  log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
  log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

  data_formatter = import_from_uri(data_formatter_module_path).DataFormatter()

  # Set up the preprocessing pipeline.
  pipeline = beam.Pipeline(runner=DirectRunner())

  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data files: CSV format ordered according to the
    # `data_formatter`, then converted into a cleaned-up format.
    raw_train_data = (
        pipeline
        | 'ReadTrainDataFile' >> textio.ReadFromText(
            split_train_file_pattern, skip_header_lines=0)
        | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                data_formatter.get_features_and_targets(),
                data_formatter.get_features_metadata().schema).decode))

    raw_eval_data = (
        pipeline
        | 'ReadEvalDataFile' >> textio.ReadFromText(
            split_eval_file_pattern, skip_header_lines=0)
        | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                data_formatter.get_features_and_targets(),
                data_formatter.get_features_metadata().schema).decode))

    # Examples in tf-example format (for model analysis purposes).
    # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
    # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
    #
    # _ = (
    #     raw_eval_data
    #     | 'ToSerializedTFExample' >> beam.Map(coder.encode)
    #     | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
    #         '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
    #         shard_name_template='', num_shards=1)
    # )

    # Read the SavedModel and metadata previously written to the two
    # subdirectories of working_dir given by
    # `transform_fn_io.TRANSFORM_FN_DIR` and
    # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
    transform_fn = (
        pipeline
        | 'ReadTransformGraph' >> transform_fn_io.ReadTransformFn(working_dir))

    # Apply the transformation `transform_fn` to the raw train dataset.
    (transformed_train_data, transformed_metadata) = (
        ((raw_train_data, data_formatter.get_features_metadata()),
         transform_fn)
        | 'TransformTrainData' >> beam_impl.TransformDataset())

    # Apply the transformation `transform_fn` to the raw eval dataset.
    (transformed_eval_data, transformed_metadata) = (
        ((raw_eval_data, data_formatter.get_features_metadata()),
         transform_fn)
        | 'TransformEvalData' >> beam_impl.TransformDataset())

    # The data schema of the transformed data gets used to build a signature
    # to create a TFRecord (tf binary data format). This signature is a
    # wrapper function used to encode transformed data.
    transformed_data_coder = tft.coders.ExampleProtoCoder(
        transformed_metadata.schema)

    _ = (transformed_train_data
         | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
             transformed_data_coder.encode)
         | 'WriteTrainDataTFRecord' >> tfrecordio.WriteToTFRecord(
             '{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_train_tfrecord, current_index, num_shards),
             shard_name_template='',
             num_shards=1))

    _ = (transformed_eval_data
         | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
             transformed_data_coder.encode)
         | 'WriteEvalDataTFRecord' >> tfrecordio.WriteToTFRecord(
             '{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_eval_tfrecord, current_index, num_shards),
             shard_name_template='',
             num_shards=1))

  result = pipeline.run()
  result.wait_until_finish()
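
# Hypothetical invocation sketch: write_to_tfrecord takes a single tuple
# whose order must match the unpacking at the top of the function. All
# values below are placeholders, not paths from the original project.
write_to_tfrecord((
    '0',                          # current_index
    '32',                         # num_shards
    'train_split',                # train_split_fname_out
    'eval_split',                 # eval_split_fname_out
    'train_logs.tfrecord',        # exp_log_data_file_train_tfrecord
    'eval_logs.tfrecord',         # exp_log_data_file_eval_tfrecord
    '/tmp/working_dir',           # working_dir (must hold a transform_fn)
    'my_package.data_formatter',  # data_formatter_module_path
))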
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     only_check_core_metadata=False):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that transformed data and transformed metadata match
  expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    only_check_core_metadata: A boolean indicating whether all elements in
      the transformed metadata are asserted to be equal to the expected
      metadata. If True, only transformed feature names, dtypes and
      representations are asserted.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
  """
  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  #
  # Also, the dataset_metadata that is returned along with
  # `transformed_data` is incomplete as it does not contain the deferred
  # components, so we instead inspect the metadata returned along with the
  # transform function.
  temp_dir = self.get_temp_dir()
  with beam_impl.Context(temp_dir=temp_dir):
    transform_fn, transformed_metadata = (
        (input_data, input_metadata)
        | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
    transformed_data, _ = (
        ((input_data, input_metadata), (transform_fn, transformed_metadata))
        | 'TransformDataset' >> beam_impl.TransformDataset())

  if expected_data:
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  if not expected_metadata:
    return

  transformed_metadata = self._resolveDeferredMetadata(transformed_metadata)

  if only_check_core_metadata:
    # preprocessing_fn may add metadata to the column schema that is only
    # relevant to internal implementation, such as vocabulary_file. As such,
    # only check that feature names, dtypes and representations are as
    # expected.
    self.assertSameElements(
        transformed_metadata.schema.column_schemas.keys(),
        expected_metadata.schema.column_schemas.keys())
    for k, v in transformed_metadata.schema.column_schemas.iteritems():
      expected_schema = expected_metadata.schema.column_schemas[k]
      self.assertEqual(
          expected_schema.representation, v.representation,
          "representation doesn't match for feature '%s'" % k)
      self.assertEqual(
          expected_schema.domain.dtype, v.domain.dtype,
          "dtype doesn't match for feature '%s'" % k)
  else:
    # Check the entire DatasetMetadata is as expected.
    # Use an extra assertEqual for schemas, since the full metadata
    # assertEqual error message is not conducive to debugging.
    self.assertEqual(expected_metadata.schema.column_schemas,
                     transformed_metadata.schema.column_schemas)
    self.assertEqual(expected_metadata, transformed_metadata)
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                        HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exist in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "schema" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exist in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "column_names" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv
    from apache_beam.io import textio
    from apache_beam.io import tfrecordio
    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR,
                                   'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the
    # dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = [
        'trip_start_hour', 'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10
    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by
    # tf.transform.
    VOCAB_SIZE = 1000
    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are
    # hashed.
    OOV_SIZE = 10
    VOCAB_FEATURE_KEYS = [
        'pickup_census_tract', 'dropoff_census_tract', 'payment_type',
        'company', 'pickup_community_area', 'dropoff_community_area']

    # Allow nan values in these features.
    OPTIONAL_FEATURES = [
        'dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract',
        'dropoff_census_tract', 'company', 'trip_seconds',
        'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes a SparseTensor and returns a Tensor with a correct default
        value.

        Args:
          tensor: tf.SparseTensor

        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(
            tf.sparse_to_dense(
                tensor.indices,
                [tensor.dense_shape[0], 1],
                tensor.values,
                default_value=default_value),
            axis=1)
        # TODO: Update to the non-deprecated version below.
        # return tf.squeeze(
        #     tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the
            # mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor,
                vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE,
                num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(
                    tf.as_string(outputs[key]),
                    vocab_filename='vocab_' + key)

        return outputs

    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(
                    TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get the transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(
                    EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using the previously created transform_fn)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check out what the transform function (transform_fn) from
            # the previous step looks like.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(
            trns_output, os.path.join(_kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
def run(flags, pipeline_args):
  """Run Apache Beam pipeline to generate TFRecords for Survival Analysis."""
  options = PipelineOptions(flags=[], **pipeline_args)
  options.view_as(WorkerOptions).machine_type = flags.machine_type

  temp_dir = os.path.join(flags.output_dir, 'tmp')
  runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

  files = tf.gfile.Glob(flags.input_dir + "*")
  if not flags.cloud:
    # If running locally for testing, process fewer files.
    files = files[0:20]

  logging.warning("Number of files: " + str(len(files)))
  labels = get_labels_array(
      "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv")

  with beam.Pipeline(runner, options=options) as p:
    with tft_beam.Context(temp_dir=temp_dir):
      input_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

      filenames = (p | 'Create filenames' >> beam.Create(files))
      nii = (filenames | 'Read NII' >> beam.Map(read_nii))
      nii_with_labels = (
          nii | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

      raw_train, raw_eval, raw_test = (
          nii_with_labels
          | 'RandomlySplitData' >> randomly_split(
              train_size=.7, validation_size=.15, test_size=.15))

      raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(lambda x: x[1])
      raw_eval = (raw_eval | 'FlattenEval' >> beam.FlatMap(lambda x: x[1]))
      raw_test = (raw_test | 'FlattenTest' >> beam.FlatMap(lambda x: x[1]))

      raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

      dataset_and_metadata, transform_fn = (
          (raw_train, input_metadata)
          | 'TransformData' >> tft_beam.AnalyzeAndTransformDataset(
              features.preprocess))
      transform_fn = (
          (raw_train, input_metadata)
          | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
               flags.output_dir))

      for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                    ('Predict', raw_test)]:
        transform_label = 'Transform{}'.format(dataset_type)
        t, metadata = (
            ((dataset, input_metadata), transform_fn)
            | transform_label >> tft_beam.TransformDataset())
        if dataset_type == 'Train':
          _ = (metadata
               | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                   os.path.join(flags.output_dir, 'transformed_metadata'),
                   pipeline=p))
        write_label = 'Write{}TFRecord'.format(dataset_type)
        _ = t | write_label >> WriteTFRecord(dataset_type, flags.output_dir,
                                             metadata)
def preprocess(pipeline, args):
  input_metadata = metadata_io.read_metadata(
      os.path.join(args.analyze_output_dir, RAW_METADATA_DIR))

  schema = json.loads(
      file_io.read_file_to_string(
          os.path.join(args.analyze_output_dir, SCHEMA_FILE)).decode())
  features = json.loads(
      file_io.read_file_to_string(
          os.path.join(args.analyze_output_dir, FEATURES_FILE)).decode())

  column_names = [col['name'] for col in schema]

  exclude_outputs = None
  if not args.target:
    for name, transform in six.iteritems(features):
      if transform['transform'] == TARGET_TRANSFORM:
        target_name = name
        column_names.remove(target_name)
        exclude_outputs = [target_name]
        del input_metadata.schema.column_schemas[target_name]
        break

  if args.csv_file_pattern:
    coder = coders.CsvCoder(column_names, input_metadata.schema,
                            delimiter=',')
    raw_data = (
        pipeline
        | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
        | 'ParseCsvData' >> beam.Map(coder.decode))
  else:
    columns = ', '.join(column_names)
    query = 'SELECT {columns} FROM `{table}`'.format(
        columns=columns, table=args.bigquery_table)
    raw_data = (
        pipeline
        | 'ReadBiqQueryData' >> beam.io.Read(
            beam.io.BigQuerySource(query=query, use_standard_sql=True)))

  # Note that prepare_image_transforms does not make embeddings; it just
  # reads the image files and converts them to byte strings.
  # tft.TransformDataset() will apply the saved model that makes the image
  # embeddings.
  image_columns = image_transform_columns(features)

  raw_data = (
      raw_data
      | 'PreprocessTransferredLearningTransformations' >> beam.Map(
          prepare_image_transforms, image_columns))

  if args.shuffle:
    raw_data = raw_data | 'ShuffleData' >> shuffle()

  transform_fn = (
      pipeline
      | 'ReadTransformFn' >> tft_beam_io.ReadTransformFn(
          args.analyze_output_dir))

  (transformed_data, transform_metadata) = (
      ((raw_data, input_metadata), transform_fn)
      | 'ApplyTensorflowPreprocessingGraph' >> tft.TransformDataset(
          exclude_outputs))

  tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
  _ = (transformed_data
       | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
       | 'WriteExamples' >> beam.io.WriteToTFRecord(
           os.path.join(args.output_dir, args.output_filename_prefix),
           file_name_suffix='.tfrecord.gz'))
def tftransform(
    pipeline_args,          # type: List[str]
    temp_location,          # type: str
    schema_file,            # type: str
    output_dir,             # type: str
    preprocessing_fn,       # type: Any
    training_data=None,     # type: Union[None, str]
    evaluation_data=None,   # type: Union[None, str]
    transform_fn_dir=None,  # type: Union[None, str]
    compression_type=None   # type: str
):
  # type: (...) -> PipelineState
  """
  Generic tf.transform pipeline that takes tf.{example, record} training and
  evaluation datasets and outputs transformed data together with the
  transform function Saved Model.

  :param pipeline_args: un-parsed Dataflow arguments
  :param temp_location: temporary location for dataflow job working dir
  :param schema_file: path to the raw feature schema text file
  :param output_dir: output dir for transformed data and function
  :param preprocessing_fn: tf.transform preprocessing function
  :param training_data: path to the training data
  :param evaluation_data: path to the evaluation data
  :param transform_fn_dir: dir to previously saved transformation function
                           to apply
  :param compression_type: compression type for writing of tf.records
  :return: final state of the Beam pipeline
  """
  assert_not_empty_string(temp_location)
  assert_not_empty_string(schema_file)
  assert_not_empty_string(output_dir)
  assert_not_none(preprocessing_fn)

  if compression_type is None:
    compression_type = CompressionTypes.AUTO

  raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
  raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

  transformed_train_output_dir = os.path.join(output_dir, "training")
  transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

  if not any(i.startswith("--job_name") for i in pipeline_args):
    pipeline_args.append("--job_name=tf-transform-{}-{}".format(
        getpass.getuser(), int(time.time())))

  pipeline = beam.Pipeline(argv=pipeline_args)
  with beam_impl.Context(temp_dir=temp_location):
    if training_data is not None:
      # If training data is provided, transform_fn_dir will be ignored.
      if transform_fn_dir is not None:
        warnings.warn(
            "transform_fn_dir is ignored because training_data is provided")

      transform_fn_output = os.path.join(output_dir, "transform_fn",
                                         "saved_model.pb")
      if FileSystems.exists(transform_fn_output):
        raise ValueError("Transform fn already exists at %s!" %
                         transform_fn_output)

      # Compute the transform_fn and apply it to the training data.
      raw_train_data = (
          pipeline
          | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
              training_data, coder=raw_data_coder))

      ((transformed_train_data, transformed_train_metadata),
       transform_fn) = (
           (raw_train_data, raw_data_metadata)
           | "AnalyzeAndTransformTrainData" >>
           beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

      _ = (transform_fn
           | "WriteTransformFn" >>
           transform_fn_io.WriteTransformFn(output_dir))

      transformed_train_coder = ExampleProtoCoder(
          transformed_train_metadata.schema)
      _ = (transformed_train_data
           | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
               os.path.join(transformed_train_output_dir, "part"),
               coder=transformed_train_coder,
               compression_type=compression_type,
               file_name_suffix=".tfrecords"))
    else:
      if transform_fn_dir is None:
        raise ValueError(
            "Either training_data or transform_fn_dir needs to be provided")
      # Load the previously saved transform_fn.
      transform_fn = pipeline | transform_fn_io.ReadTransformFn(
          transform_fn_dir)

    if evaluation_data is not None:
      # If evaluation_data exists, apply the transform_fn to the evaluation
      # data.
      raw_eval_data = (
          pipeline
          | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
              evaluation_data, coder=raw_data_coder))

      (transformed_eval_data, transformed_eval_metadata) = (
          ((raw_eval_data, raw_data_metadata), transform_fn)
          | "TransformEvalData" >> beam_impl.TransformDataset())

      transformed_eval_coder = ExampleProtoCoder(
          transformed_eval_metadata.schema)
      _ = (transformed_eval_data
           | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
               os.path.join(transformed_eval_output_dir, "part"),
               coder=transformed_eval_coder,
               compression_type=compression_type,
               file_name_suffix=".tfrecords"))

  result = pipeline.run().wait_until_finish()
  return result
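
# Hypothetical invocation sketch: all paths, the schema file, and
# my_preprocessing_fn below are placeholders, not values from the original
# project.
def my_preprocessing_fn(inputs):
  # Identity preprocessing; a real pipeline would apply tft analyzers here.
  return dict(inputs)


state = tftransform(
    pipeline_args=['--runner=DirectRunner'],
    temp_location='/tmp/tft_tmp',
    schema_file='schema.txt',
    output_dir='/tmp/tft_output',
    preprocessing_fn=my_preprocessing_fn,
    training_data='gs://my-bucket/train/part-*.tfrecords',
    evaluation_data='gs://my-bucket/eval/part-*.tfrecords')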
def test_caching_vocab_for_integer_categorical(self):
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    return {
        'x_vocab': tft.compute_and_apply_vocabulary(
            inputs['x'], frequency_threshold=2)
    }

  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.FixedLenFeature([], tf.int64),
      }))
  input_data_dict = {
      span_0_key: [{
          'x': -2,
      }, {
          'x': -4,
      }, {
          'x': -1,
      }, {
          'x': 4,
      }],
      span_1_key: [{
          'x': -2,
      }, {
          'x': -1,
      }, {
          'x': 6,
      }, {
          'x': 7,
      }],
  }
  expected_transformed_data = [{
      'x_vocab': 0,
  }, {
      'x_vocab': 1,
  }, {
      'x_vocab': -1,
  }, {
      'x_vocab': -1,
  }]
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # TODO(b/37788560): Get these names programmatically.
      cache_dict = {
          span_0_key: {
              '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--':
                  p | 'CreateB' >> beam.Create(
                      [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))

      _ = (cache_output
           | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
               self._cache_dir))

      transformed_dataset = (
          ((input_data_dict[span_1_key], input_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      transformed_data, _ = transformed_dataset

      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')
def testTransformSparseColumns(self):
  # Define a transform that takes a sparse column and a varlen column, and
  # returns a combination of dense, sparse, and varlen columns.
  def preprocessing_fn(inputs):
    sparse_sum = tft.map(
        lambda x: tf.sparse_reduce_sum(x, axis=1), inputs['sparse'])
    sparse_copy = tft.map(
        lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
        inputs['sparse'])
    varlen_copy = tft.map(
        lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
        inputs['varlen'])

    sparse_copy.schema = sch.ColumnSchema(
        sch.LogicalColumnSchema(
            sch.dtype_to_domain(tf.float32),
            sch.LogicalShape([sch.Axis(10)])),
        sch.SparseColumnRepresentation(
            'val_copy', [sch.SparseIndexField('idx_copy', False)]))

    return {
        'fixed': sparse_sum,  # Schema should be inferred.
        'sparse': inputs['sparse'],  # Schema manually attached above.
        'varlen': inputs['varlen'],  # Schema should be inferred.
        'sparse_copy': sparse_copy,  # Schema should propagate from input.
        'varlen_copy': varlen_copy  # Schema should propagate from input.
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_metadata = self.toMetadata({
      'sparse': tf.SparseFeature('idx', 'val', tf.float32, 10),
      'varlen': tf.VarLenFeature(tf.float32),
  })
  input_data = [{
      'idx': [0, 1],
      'val': [0., 1.],
      'varlen': [0., 1.]
  }, {
      'idx': [2, 3],
      'val': [2., 3.],
      'varlen': [3., 4., 5.]
  }, {
      'idx': [4, 5],
      'val': [4., 5.],
      'varlen': [6., 7.]
  }]
  transformed_dataset, transform_fn = (
      (input_data, input_metadata)
      | beam_impl.AnalyzeAndTransformDataset(
          preprocessing_fn, os.path.join(self.get_temp_dir(), 'sparse')))

  expected_transformed_metadata = self.toMetadata({
      'fixed': tf.FixedLenFeature(None, tf.float32, None),
      'sparse': tf.SparseFeature('idx', 'val', tf.float32, 10),
      'varlen': tf.VarLenFeature(tf.float32),
      'sparse_copy': tf.SparseFeature('idx_copy', 'val_copy', tf.float32, 10),
      'varlen_copy': tf.VarLenFeature(tf.float32)
  })
  expected_transformed_data = [{
      'fixed': 1.0,
      'idx': [0, 1],
      'val': [0., 1.],
      'varlen': [0., 1.],
      'idx_copy': [0, 1],
      'val_copy': [0., 1.],
      'varlen_copy': [0., 1.]
  }, {
      'fixed': 5.0,
      'idx': [2, 3],
      'val': [2., 3.],
      'varlen': [3., 4., 5.],
      'idx_copy': [2, 3],
      'val_copy': [2., 3.],
      'varlen_copy': [3., 4., 5.]
  }, {
      'fixed': 9.0,
      'idx': [4, 5],
      'val': [4., 5.],
      'varlen': [6., 7.],
      'idx_copy': [4, 5],
      'val_copy': [4., 5.],
      'varlen_copy': [6., 7.]
  }]
  self.assertDatasetsEqual(
      transformed_dataset,
      (expected_transformed_data, expected_transformed_metadata))

  # Take the transform function and use TransformDataset to apply it to
  # some eval data, and compare with expected output.
  eval_data = [{
      'idx': [0],
      'val': [9.],
      'varlen': [9.]
  }, {
      'idx': [],
      'val': [],
      'varlen': []
  }, {
      'idx': [2, 4],
      'val': [8., 7.],
      'varlen': [8., 7.]
  }]
  transformed_eval_dataset = (
      ((eval_data, input_metadata), transform_fn)
      | beam_impl.TransformDataset())

  expected_transformed_eval_values = [{
      'fixed': 9.,
      'idx': [0],
      'val': [9.],
      'varlen': [9.],
      'idx_copy': [0],
      'val_copy': [9.],
      'varlen_copy': [9.]
  }, {
      'fixed': 0.,
      'idx': [],
      'val': [],
      'varlen': [],
      'idx_copy': [],
      'val_copy': [],
      'varlen_copy': []
  }, {
      'fixed': 15.,
      'idx': [2, 4],
      'val': [8., 7.],
      'varlen': [8., 7.],
      'idx_copy': [2, 4],
      'val_copy': [8., 7.],
      'varlen_copy': [8., 7.]
  }]
  self.assertDatasetsEqual(
      transformed_eval_dataset,
      (expected_transformed_eval_values, expected_transformed_metadata))
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 value indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(outputs[key])

    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature. This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the
    # feature from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
      tft.uniques(inputs[key], vocab_filename=key)

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)

    outputs[LABEL_KEY] = tft.apply_function(convert_label, outputs[LABEL_KEY])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this
      # we need to list all columns in order since the schema doesn't
      # specify the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = csv_coder.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These
      # transformations are just to get data into a format that the CSV
      # converter can read, in particular removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors
      # in converter.decode, which should only occur for the trailing blank
      # line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      transformed_data_coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply the transform function to the test data. In this case we
      # remove the trailing period at the end of each line, and also ignore
      # the header line that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(
              test_data_file, skip_header_lines=1)
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need the transformed data schema; it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
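# Hedged usage sketch: invoking transform_data above. The file names follow
# the UCI census ("adult") dataset convention this example assumes; they are
# placeholders, not files bundled here.
import tempfile

if __name__ == '__main__':
  transform_data(
      train_data_file='adult.data',
      test_data_file='adult.test',
      working_dir=tempfile.mkdtemp())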
def run_transform(output_dir, schema, train_data_file, eval_data_file,
                  project, mode, preprocessing_fn=None):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder.
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: whether the job should run locally ('local') or on Dataflow
      ('cloud').
    preprocessing_fn: a function used to preprocess the raw data. If not
      specified, a function will be automatically inferred from the schema.
  """
  tft_input_metadata = make_tft_input_metadata(schema)
  temp_dir = os.path.join(output_dir, 'tmp')
  preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

  if mode == 'local':
    pipeline_options = None
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'pipeline-tft-' +
                    datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
        'temp_location': temp_dir,
        'project': project,
        'extra_packages': [
            'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
        ]
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DataflowRunner'
  else:
    raise ValueError('Invalid mode %s.' % mode)

  with beam.Pipeline(runner, options=pipeline_options) as p:
    with beam_impl.Context(temp_dir=temp_dir):
      names = [x['name'] for x in schema]
      converter = CsvCoder(names, tft_input_metadata.schema)
      train_data = (
          p
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      train_dataset = (train_data, tft_input_metadata)
      transformed_dataset, transform_fn = (
          train_dataset
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      # Writes the transformed_metadata and transform_fn folders.
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(output_dir))
      # Write the raw metadata.
      metadata_io.write_metadata(
          metadata=tft_input_metadata,
          path=os.path.join(output_dir, 'metadata'))

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'train'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

      eval_data = (
          p
          | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
          | 'DecodeEvalData' >> beam.Map(converter.decode))

      eval_dataset = (eval_data, tft_input_metadata)
      transformed_eval_dataset = (
          (eval_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_eval_data, transformed_metadata = transformed_eval_dataset

      _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'eval'),
          coder=ExampleProtoCoder(transformed_metadata.schema))
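# Hedged usage sketch for run_transform above, running locally. The schema
# list format (dicts with at least a 'name' key) is inferred from how the
# function reads it; the 'type' key and all paths are illustrative
# assumptions, not a documented contract.
example_schema = [
    {'name': 'age', 'type': 'NUMBER'},
    {'name': 'occupation', 'type': 'CATEGORY'},
]
run_transform(
    output_dir='/tmp/tft_output',
    schema=example_schema,
    train_data_file='/tmp/train*.csv',
    eval_data_file='/tmp/eval*.csv',
    project=None,  # unused in local mode
    mode='local')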
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
      shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
  """
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))
      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]
        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_COLUMN: review_bow_indices,
            REVIEW_WEIGHT: review_weight,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (transformed_train_data
           | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
               transformed_train_filebase,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_test_data
           | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
               transformed_test_filebase,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_metadata
           | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))
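# Hedged usage sketch: calling transform_data above with an IMDB-style
# directory layout. All paths are placeholders for whatever layout the
# ReadAndShuffleData helper expects.
transform_data(
    train_neg_filepattern='aclImdb/train/neg/*.txt',
    train_pos_filepattern='aclImdb/train/pos/*.txt',
    test_neg_filepattern='aclImdb/test/neg/*.txt',
    test_pos_filepattern='aclImdb/test/pos/*.txt',
    transformed_train_filebase='/tmp/imdb/train_transformed',
    transformed_test_filebase='/tmp/imdb/test_transformed',
    transformed_metadata_dir='/tmp/imdb/transformed_metadata')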
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None,
                                     use_tfxio=False,
                                     input_data_is_tfxio_format=False):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines or a list
      of tuples of frequency and text. Values should be the expected result
      of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.
    use_tfxio: If True, invoke AnalyzeAndTransformDataset using the new API
      that accepts standardized inputs (Arrow `RecordBatch`es). Otherwise use
      the old API that accepts Dicts.
    input_data_is_tfxio_format: If True, `input_data` and `test_data` are
      Arrow `RecordBatch`es and the `input_metadata` is
      `tfxio.tensor_adapter.TensorAdapterConfig`. Otherwise the input data is
      a list of Dicts and input_metadata is a `DatasetMetadata`.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn('expected_asset_file_contents is deprecated, '
                              'use expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  if not use_tfxio and input_data_is_tfxio_format:
    raise ValueError('Unable to feed TFXIO input format to the old, '
                     'non-TFXIO API.')
  compatibility_tfxio_needed = use_tfxio and not input_data_is_tfxio_format

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code, the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(
      prefix=self._testMethodName, dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size,
                           use_tfxio=use_tfxio):
      input_data = pipeline | 'CreateInput' >> beam.Create(
          input_data, reshuffle=False)
      if compatibility_tfxio_needed:
        legacy_input_metadata = input_metadata
        input_data, input_metadata = self.convert_to_tfxio_api_inputs(
            input_data, input_metadata, label='input_data')
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        if compatibility_tfxio_needed:
          test_data, _ = self.convert_to_tfxio_api_inputs(
              test_data, legacy_input_metadata, label='test_data')
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(tf_transform_output.transformed_metadata.schema)
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')

    self.assertEqual(expected_metadata.schema, transformed_schema)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
def preprocess(p, output_dir, check_path, data_size, bq_table,
               split_data_path, project_id):
  """Main processing pipeline reading, processing and storing processed data.

  Performs the following operations:
    - reads data from BigQuery
    - adds hash key value to each row
    - scales data
    - shuffles and splits data in train / validation / test sets
    - oversamples train data
    - stores data as TFRecord
    - splits and stores test data into labels and features files

  Args:
    p: beam.Pipeline, the pipeline the preprocessing steps are attached to.
    output_dir: string, path to directory to store output.
    check_path: string, path to directory to store data checks.
    data_size: tuple of float, ratio of data going respectively to train,
      validation and test sets.
    bq_table: string, name of table to read data from.
    split_data_path: string, path to directory to store train, validation
      and test raw datasets.
    project_id: string, GCP project id.

  Raises:
    ValueError: No test dataset found in pipeline output.
  """
  train_size, validation_size, test_size = data_size

  data = (
      p | 'ReadData' >> read_data(bq_table=bq_table, project_id=project_id))

  _ = data | 'StoreData' >> beam.io.WriteToText(
      posixpath.join(output_dir, check_path, 'processed_data.txt'))

  split_data = (
      data | 'RandomlySplitData' >> randomly_split(
          train_size=train_size,
          validation_size=validation_size,
          test_size=test_size))

  for k in split_data:
    split_data[k] |= 'AddHash_{}'.format(k.name) >> beam.ParDo(
        AddHash(),
        label_column=constants.LABEL_COLUMN,
        key_column=constants.KEY_COLUMN,
        dtype=k)

  # Splits test data into features pipeline and labels pipeline.
  if DatasetType.TEST not in split_data:
    raise ValueError('No test dataset found in pipeline output.')
  test_data = (
      split_data.pop(DatasetType.TEST)
      | 'SplitFeaturesLabels' >> split_features_labels(
          constants.LABEL_COLUMN, constants.KEY_COLUMN))

  # Stores test data features and labels pipelines separately.
  for k in test_data:
    _ = (
        test_data[k]
        | 'ParseJsonToString_{}'.format(k) >> beam.Map(json.dumps)
        | 'StoreSplitData_{}'.format(k) >> beam.io.WriteToText(
            posixpath.join(
                output_dir, split_data_path,
                'split_data_{}_{}.txt'.format(DatasetType.TEST.name, k))))

  meta_data = dataset_metadata.DatasetMetadata(make_input_schema())
  transform_fn = (
      (split_data[DatasetType.TRAIN], meta_data)
      | 'AnalyzeTrainDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

  _ = (
      transform_fn
      | 'WriteTransformFn' >> tft.beam.tft_beam_io.WriteTransformFn(
          posixpath.join(output_dir, constants.PATH_INPUT_TRANSFORMATION)))

  _ = (
      meta_data
      | 'WriteInputMetadata' >> tft.beam.tft_beam_io.WriteMetadata(
          posixpath.join(output_dir, constants.PATH_INPUT_SCHEMA),
          pipeline=p))

  transformed_metadata, transformed_data = {}, {}
  for k in [DatasetType.TRAIN, DatasetType.VAL]:
    transformed_data[k], transformed_metadata[k] = (
        ((split_data[k], meta_data), transform_fn)
        | 'Transform{}'.format(k) >> beam_impl.TransformDataset())

  transformed_data[DatasetType.TRAIN] = (
      transformed_data[DatasetType.TRAIN]
      | 'OverSampleTraining' >> oversampling())

  for k in transformed_data:
    _ = (
        transformed_data[k]
        | 'ShuffleData{}'.format(k) >> shuffle_data()
        | 'StoreData{}'.format(k) >> store_transformed_data(
            schema=transformed_metadata[k],
            path=posixpath.join(output_dir,
                                constants.PATH_TRANSFORMED_DATA_SPLIT[k]),
            name=DatasetType(k).name))

  for k in transformed_data:
    _ = (
        transformed_data[k]
        | 'CheckSize{}'.format(k.name) >> check_size(
            name=DatasetType(k).name,
            path=posixpath.join(output_dir, check_path, k.name)))
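# Hedged usage sketch: an 80/10/10 split through the preprocess function
# above. Bucket, table and project names are placeholders.
with beam.Pipeline() as p:
  preprocess(
      p,
      output_dir='gs://my-bucket/preprocess-output',
      check_path='checks',
      data_size=(0.8, 0.1, 0.1),
      bq_table='my_dataset.my_table',
      split_data_path='raw-splits',
      project_id='my-gcp-project')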
def preprocess_data(train_neg_file_pattern, train_pos_file_pattern,
                    test_neg_file_pattern, test_pos_file_pattern,
                    transformed_train_file_pattern,
                    transformed_test_file_pattern,
                    transformed_metadata_dir, raw_metadata_dir,
                    transform_func_dir, temp_dir, vocab_size, delimiters):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    train_neg_file_pattern: Filepattern for training data negative examples
    train_pos_file_pattern: Filepattern for training data positive examples
    test_neg_file_pattern: Filepattern for test data negative examples
    test_pos_file_pattern: Filepattern for test data positive examples
    transformed_train_file_pattern: Base filename for transformed training
      data shards
    transformed_test_file_pattern: Base filename for transformed test data
      shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
    raw_metadata_dir: Directory where metadata for raw data should be written
    transform_func_dir: Directory where the transform function should be
      written
    temp_dir: Directory for temporary files
    vocab_size: Size of the vocabulary to generate
    delimiters: Delimiters used to tokenize the reviews
  """
  pipeline_name = 'DataflowRunner'
  options = {
      'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(
          datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
      'temp_location': temp_dir,
      'project': 'stone-outpost-636',
      'max_num_workers': 8
  }
  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)

  # with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline:
  #   with beam_impl.Context(temp_dir=temp_dir):
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_file_pattern, train_pos_file_pattern))
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_file_pattern, test_pos_file_pattern))
      preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters)

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, const.RAW_METADATA)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
               transform_func_dir))

      transformed_test_data, _ = (
          ((test_data, const.RAW_METADATA), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (transformed_train_data
           | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
               transformed_train_file_pattern,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_test_data
           | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
               transformed_test_file_pattern,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_metadata
           | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))

      _ = (const.RAW_METADATA
           | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(
               raw_metadata_dir, pipeline=pipeline))
def test_single_phase_mixed_analyzer_run_once(self):
  cache_location = self._make_cache_location()

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  _write_cache('__v0__CacheableCombineAccumulate--x_1-mean_and_var--',
               span_0_key, [2.0, 1.0, 9.0], cache_location.input_cache_dir)
  _write_cache('__v0__CacheableCombineAccumulate--x-x--',
               span_0_key, [2.0, 4.0], cache_location.input_cache_dir)
  _write_cache('__v0__CacheableCombineAccumulate--y_1-mean_and_var--',
               span_0_key, [2.0, -1.5, 6.25], cache_location.input_cache_dir)
  _write_cache('__v0__CacheableCombineAccumulate--y-y--',
               span_0_key, [4.0, 1.0], cache_location.input_cache_dir)

  def preprocessing_fn(inputs):
    integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
    _ = tft.bucketize(inputs['x'], 2, name='bucketize')
    return {
        'integerized_s': integerized_s,
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.FixedLenFeature([], tf.float32),
          'y': tf.FixedLenFeature([], tf.float32),
          's': tf.FixedLenFeature([], tf.string),
      }))
  input_data_dict = {
      span_0_key: [{'x': -2, 'y': 1, 's': 'b'}, {'x': 4, 'y': -4, 's': 'b'}],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()
    transform_fn = ((flat_data, input_data_dict, input_metadata)
                    | (beam_impl.AnalyzeDatasetWithCache(
                        preprocessing_fn, cache_location)))

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | beam_impl.TransformDataset())

  transformed_data, unused_transformed_metadata = transformed_dataset

  expected_transformed_data = [
      {
          'x_mean': 6.0,
          'x_min': -2.0,
          'y_mean': -0.25,
          'y_min': -4.0,
          'integerized_s': 0,
      },
      {
          'x_mean': 6.0,
          'x_min': -2.0,
          'y_mean': -0.25,
          'y_min': -4.0,
          'integerized_s': 0,
      },
  ]
  self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

  transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
  _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     only_check_core_metadata=False,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    only_check_core_metadata: A boolean indicating whether the full
      transformed metadata is asserted to be equal to the expected metadata.
      If True, only transformed feature names, dtypes and representations
      are asserted.
    expected_asset_file_contents: (optional) A dictionary from asset
      filenames to their expected content as a list of text lines. Values
      should be the expected result of calling f.readlines() on the given
      asset files. Asset filenames are relative to the saved model's asset
      directory.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
  """
  if expected_asset_file_contents is None:
    expected_asset_file_contents = {}
  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code, the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = self.get_temp_dir()
  with beam_impl.Context(temp_dir=temp_dir,
                         desired_batch_size=desired_batch_size):
    if test_data is None:
      (transformed_data, transformed_metadata), transform_fn = (
          (input_data, input_metadata)
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
    else:
      transform_fn = ((input_data, input_metadata)
                      | beam_impl.AnalyzeDataset(preprocessing_fn))
      transformed_data, transformed_metadata = (
          ((test_data, input_metadata), transform_fn)
          | beam_impl.TransformDataset())

    # Write transform_fn so we can test its assets.
    if expected_asset_file_contents:
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

  if expected_data is not None:
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  if expected_metadata:
    transformed_metadata = self._resolveDeferredMetadata(transformed_metadata)

    if only_check_core_metadata:
      # preprocessing_fn may add metadata to column schema only relevant to
      # internal implementation such as vocabulary_file. As such, only check
      # that feature names, dtypes and representations are as expected.
      self.assertSameElements(
          transformed_metadata.schema.column_schemas.keys(),
          expected_metadata.schema.column_schemas.keys())
      for k, v in transformed_metadata.schema.column_schemas.iteritems():
        expected_schema = expected_metadata.schema.column_schemas[k]
        self.assertEqual(
            expected_schema.representation, v.representation,
            "representation doesn't match for feature '%s'" % k)
        self.assertEqual(
            expected_schema.domain.dtype, v.domain.dtype,
            "dtype doesn't match for feature '%s'" % k)
    else:
      # Check the entire DatasetMetadata is as expected.
      # Use extra assertEqual for schemas, since full metadata assertEqual
      # error message is not conducive to debugging.
      self.assertEqual(expected_metadata.schema.column_schemas,
                       transformed_metadata.schema.column_schemas)
      self.assertEqual(expected_metadata, transformed_metadata)

  for filename, file_contents in six.iteritems(expected_asset_file_contents):
    full_filename = os.path.join(temp_dir, transform_fn_io.TRANSFORM_FN_DIR,
                                 'assets', filename)
    with tf.gfile.Open(full_filename) as f:
      self.assertEqual(f.readlines(), file_contents)
def transform_data(train_data_file, test_data_file,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 value indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
      shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses
    # this to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)

    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this
      # we need to list all columns in order since the schema doesn't
      # specify the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These
      # transformations are just to get data into a format that the CSV
      # converter can read, in particular removing empty lines and removing
      # spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          transformed_train_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Now apply the transform function to the test data. In this case we
      # also remove the header line from the CSV file and the trailing
      # period at the end of each line.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(
              lambda line: line and line != '|1x3 Cross validator')
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need the transformed data schema; it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          transformed_test_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transformed_metadata
           | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))
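# Hedged sketch: reading the transformed metadata written above back from
# disk. metadata_io is the read counterpart of beam_metadata_io.WriteMetadata
# in the same tf.Transform generation as this snippet; the path is a
# placeholder for whatever was passed as transformed_metadata_dir.
from tensorflow_transform.tf_metadata import metadata_io

transformed_metadata = metadata_io.read_metadata('/tmp/transformed_metadata')
print(transformed_metadata.schema)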
def test_single_phase_mixed_analyzer_run_once(self):
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
    _ = tft.bucketize(inputs['x'], 2, name='bucketize')
    return {
        'integerized_s': integerized_s,
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.float32),
          'y': tf.io.FixedLenFeature([], tf.float32),
          's': tf.io.FixedLenFeature([], tf.string),
      }))
  input_data_dict = {
      span_0_key: [{'x': -2, 'y': 1, 's': 'b'}, {'x': 4, 'y': -4, 's': 'b'}],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # TODO(b/37788560): Get these names programmatically.
      cache_dict = {
          span_0_key: {
              '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                  p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0]']),
              '__v0__CacheableCombineAccumulate--x-x--':
                  p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
              '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                  p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25]']),
              '__v0__CacheableCombineAccumulate--y-y--':
                  p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = (
          ((input_data_dict[span_1_key], input_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 0,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 0,
          },
      ]
      beam_test_util.assert_that(
          transformed_data, beam_test_util.equal_to(expected_transformed))

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
      _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
# TFTransform based preprocessing.
raw_metadata = dataset_metadata.DatasetMetadata(
    schema=movielens.make_examples_schema())
_ = (raw_metadata
     | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
         os.path.join(args.output_dir, 'raw_metadata'), pipeline))

preprocessing_fn = movielens.make_preprocessing_fn()
train_features_transformed, transform_fn = (
    (train_data, raw_metadata)
    | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
        preprocessing_fn))
eval_features_transformed = (
    ((eval_data, raw_metadata), transform_fn)
    | 'TransformEval' >> tft.TransformDataset())

train_dataset_transformed, train_metadata = train_features_transformed
training_coder = tft_coders.ExampleProtoCoder(train_metadata.schema)
_ = (
    train_dataset_transformed
    | 'EncodeTraining' >> beam.Map(training_coder.encode)
    | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    | 'WriteTraining' >> beam.io.WriteToTFRecord(
        os.path.join(args.output_dir, 'features_train'),
        file_name_suffix='.tfrecord.gz'))
_ = (train_metadata
     | 'WriteTransformedMetadata' >> tft_beam_io.WriteMetadata(
         os.path.join(args.output_dir, 'transformed_metadata'), pipeline))

eval_dataset_transformed, eval_metadata = eval_features_transformed
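# Hedged continuation (not in the original snippet): the eval branch would
# plausibly mirror the train write above, minus the transformed-metadata
# write that already happened. The 'features_eval' prefix is an assumption.
eval_coder = tft_coders.ExampleProtoCoder(eval_metadata.schema)
_ = (
    eval_dataset_transformed
    | 'EncodeEval' >> beam.Map(eval_coder.encode)
    | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    | 'WriteEval' >> beam.io.WriteToTFRecord(
        os.path.join(args.output_dir, 'features_eval'),
        file_name_suffix='.tfrecord.gz'))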
def test_single_phase_run_twice(self):
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    _ = tft.vocabulary(inputs['s'])
    _ = tft.bucketize(inputs['x'], 2, name='bucketize')
    return {
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
    }

  input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.float32),
          'y': tf.io.FixedLenFeature([], tf.float32),
          's': tf.io.FixedLenFeature([], tf.string),
      }))
  input_data_dict = {
      span_0_key: [{'x': -2, 'y': 1, 's': 'a'}, {'x': 4, 'y': -4, 's': 'a'}],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))
      # This is needed due to b/123895600.
      for a, b in six.iteritems(input_data_dict):
        input_data_dict[a] = p | a >> beam.Create(b)

      transform_fn, cache_output = (
          (flat_data, input_data_dict, {}, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = (
          ((input_data_dict[span_1_key], input_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed_data = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
          },
      ]
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
      _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

      for key in input_data_dict:
        self.assertIn(key, cache_output)
        self.assertEqual(6, len(cache_output[key]))

      transform_fn, second_output_cache = (
          (flat_data, input_data_dict, cache_output, input_metadata)
          | 'AnalyzeAgain' >> (
              beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_dataset = (
          ((input_data_dict[span_1_key], input_metadata), transform_fn)
          | 'TransformAgain' >> beam_impl.TransformDataset())
      transformed_data, unused_transformed_metadata = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='second')

  self.assertFalse(second_output_cache)
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None):
  """Assert that input data and metadata is transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists of
      strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines or a list
      of tuples of frequency and text. Values should be the expected result
      of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if provided)
      if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.logging.warn('expected_asset_file_contents is deprecated, use '
                    'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code, the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(
      prefix=self._testMethodName, dir=self.get_temp_dir())
  with beam_pipeline or beam.Pipeline(runner=self._makeRunner()) as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.python_io.tf_record_iterator(path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    self.assertEqual(expected_metadata,
                     tf_transform_output.transformed_metadata)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    with tf.gfile.Open(full_filename, 'rb') as f:
      file_lines = f.readlines()

    # Stored-frequency case.
    if isinstance(file_contents[0], tuple):
      word_and_frequency_list = []
      for content in file_lines:
        frequency, word = content.split(b' ', 1)
        word_and_frequency_list.append(
            (word.strip(b'\n'), float(frequency.strip(b'\n'))))
      actual_words, actual_frequency = zip(*word_and_frequency_list)
      expected_words, expected_frequency = zip(*file_contents)
      self.assertAllEqual(expected_words, actual_words)
      np.testing.assert_almost_equal(expected_frequency, actual_frequency)
    else:
      file_lines = [content.strip(b'\n') for content in file_lines]
      self.assertAllEqual(file_lines, file_contents)
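# Hedged sketch: exercising the vocabulary check in the helper above. The
# vocab name, inputs and expected contents are illustrative; vocab lines are
# ordered by descending frequency, so 'hello' (count 2) precedes 'world'.
def test_vocabulary_contents(self):
  def preprocessing_fn(inputs):
    _ = tft.vocabulary(inputs['s'], vocab_filename='my_vocab')
    return inputs

  input_data = [{'s': 'hello'}, {'s': 'world'}, {'s': 'hello'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
      }))
  self.assertAnalyzeAndTransformResults(
      input_data, input_metadata, preprocessing_fn,
      expected_vocab_file_contents={'my_vocab': [b'hello', b'world']})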
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  # 1) The schema can be either defined in-memory or read from a
  # configuration file; in this case we are creating the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  # TODO(b/33688220) should the transform functions take shuffle as an
  # optional argument?
  # TODO(b/33688275) Should the transform functions have more user friendly
  # names?
  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    # TODO(b/35653662): Simplify once tf.transform 0.1.5 is released.
    def encode_predict_data(d):
      try:
        return predict_coder.encode(d)
      except Exception:  # pylint: disable=broad-except
        # Compatibility path for tf.transform < 0.1.5
        return predict_coder.encode({
            k: v.encode('utf-8') if isinstance(v, unicode) else v
            for k, v in d.items()
        })

    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(encode_predict_data))

    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
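# Hedged usage sketch: running the reddit preprocess step above. Table names
# and the output path are placeholders.
with beam.Pipeline() as pipeline:
  preprocess(
      pipeline=pipeline,
      training_data='my_dataset.reddit_train',
      eval_data='my_dataset.reddit_eval',
      predict_data=None,
      output_dir='gs://my-bucket/reddit-output',
      frequency_threshold=5)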