def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of beam assumes that if the first arg to a ptransform_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> textio.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> textio.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = ([negative_examples, positive_examples]
                  | 'Merge' >> beam.Flatten())

  # Shuffle the data. Note that the data does in fact contain duplicate
  # reviews, for reasons that are unclear. This means that NUM_TRAIN_INSTANCES
  # and NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'RemoveDuplicates' >> beam.RemoveDuplicates()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_COLUMN: p[0], LABEL_COLUMN: p[1]})
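
# The `Shuffle` transform applied above is not defined in this snippet. Below
# is a minimal sketch of one possible implementation (an assumption, not the
# author's exact code): pair each element with a pseudo-random key, group by
# key to redistribute elements, then drop the keys. The @beam.ptransform_fn
# decorator is also what the NOTE in ReadAndShuffleData refers to.
import random


@beam.ptransform_fn
def Shuffle(pcoll):
  """Shuffles a PCollection (assumed to be deduplicated upstream)."""
  return (pcoll
          | 'PairWithRandomKey' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByKey' >> beam.GroupByKey()
          | 'DropKey' >> beam.FlatMap(lambda kv: kv[1]))
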
def build_graph(self):
    # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for
    # graph building.
    # num_lines = 0
    # for i in range(DATASET_NUM_SHARDS):
    #     _fname = '{}-{:05}-of-{:05}'.format(
    #         self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
    #     num_lines += sum(1 for _ in open(_fname))
    #     _fname_marked = '{}-{:05}-of-{:05}.{}'.format(
    #         self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
    #         PPGRAPH_EXT)
    #     shutil.move(_fname, _fname_marked)
    #     if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
    #         break

    # Set up the preprocessing pipeline for analyzing the dataset. The analyze
    # call is not combined with the transform call because we will parallelize
    # the transform call later: this analysis runs on a single core and tends
    # to cause OOM issues.
    pipeline = beam.Pipeline(runner=DirectRunner())
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # TODO: maybe use only the train data (or a percentage of it) to
        # build the graph.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                'data/features*shard*', skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    self.data_formatter.get_ordered_columns(),
                    self.data_formatter.get_raw_data_metadata().schema
                ).decode))

        # Combine data and schema into a dataset tuple. Note that we already
        # used the schema to read the CSV data, but we also need it to
        # interpret raw_train_data. This is where analyzers such as
        # vocabulary, scale_to_0_1 or sparse_to_dense are applied.
        transform_fn = (
            (raw_train_data, self.data_formatter.get_raw_data_metadata())
            | beam_impl.AnalyzeDataset(
                PreprocessingFunction().transform_to_tfrecord))

        # Write the SavedModel and metadata to two subdirectories of the
        # working dir, given by `transform_fn_io.TRANSFORM_FN_DIR` and
        # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        _ = (transform_fn
             | 'WriteTransformGraph' >>
             transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

    # Run the Beam preprocessing pipeline.
    st = time.time()
    result = pipeline.run()
    result.wait_until_finish()
    self.logger.info(
        'Transformation graph built and written in {:.2f} sec'.format(
            time.time() - st))
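
# Once WriteTransformFn has run, the graph can be loaded back without Beam.
# A minimal sketch, assuming `TARGET_DIR` is the directory passed to
# WriteTransformFn above and a tensorflow_transform version that provides
# TFTransformOutput (>= 0.8):
import tensorflow_transform as tft

tft_output = tft.TFTransformOutput(TARGET_DIR)
# Feature spec of the post-transform data; useful for parsing the TFRecords
# written later by the parallelized transform step.
transformed_feature_spec = tft_output.transformed_feature_spec()
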
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(outputs[key])

    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature. This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the
    # feature from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
      tft.vocabulary(inputs[key], vocab_filename=key)

    # For the label column we provide the mapping from string to index.
    table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K'])
    outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.coders.CsvCoder(ordered_columns,
                                      RAW_DATA_METADATA.schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These transformations
      # are just to get data into a format that the CSV converter can read,
      # in particular removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors
      # in converter.decode, which should only occur for the trailing blank
      # line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply the transform function to the test data. In this case we
      # remove the trailing period at the end of each line, and also ignore
      # the header line that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file,
                                                  skip_header_lines=1)
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
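
# `MapAndFilterErrors` is used above but not defined in this snippet. Below is
# a sketch modeled on the helper of the same name in the tf.Transform census
# example: it behaves like beam.Map, except that elements for which `fn`
# raises are dropped and counted in a Beam metric instead of failing the
# pipeline.
class MapAndFilterErrors(beam.PTransform):
  """Like beam.Map but filters out errors in the map_fn."""

  class _MapAndFilterErrorsDoFn(beam.DoFn):
    """Counts and drops elements for which `fn` raises."""

    def __init__(self, fn):
      self._fn = fn
      self._bad_elements_counter = beam.metrics.Metrics.counter(
          'census_example', 'bad_elements')

    def process(self, element):
      try:
        yield self._fn(element)
      except Exception:  # pylint: disable=broad-except
        # Only count the bad elements rather than logging them, in case they
        # contain sensitive data.
        self._bad_elements_counter.inc(1)

  def __init__(self, fn):
    super(MapAndFilterErrors, self).__init__()
    self._fn = fn

  def expand(self, pcoll):
    return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))
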
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                        HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exist in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "column_names" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exist in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "schema" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the
    # dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by
    # tf.transform.
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are
    # hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract',
                          'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # Allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude',
                         'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes a SparseTensor and returns a Tensor with the correct
        default value.

        Args:
          tensor: tf.SparseTensor

        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values,
                                             default_value=default_value),
                          axis=1)
        # TODO: update to the non-deprecated version below.
        # return tf.squeeze(
        #     tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}

        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the
            # mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor, vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test whether the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(
                    tf.as_string(outputs[key]),
                    vocab_filename='vocab_' + key)

        return outputs

    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(
                    TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get the transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(
                    EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using the previously created transform_fn)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE THE transform_fn FUNCTION FOR LATER USE
            # TODO: inspect the transform function (transform_fn) produced by
            # the previous step.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
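
# Example invocation. Kale threads the hyperparameters through every pipeline
# step, but this transform step does not consume them itself; the values below
# are illustrative placeholders, not taken from the source.
data_transformation(EPOCHS=10, STEPS=1000, BATCH_SIZE=64,
                    HIDDEN_LAYER_SIZE='128', LEARNING_RATE=0.001)
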
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses
    # this to convert the strings to indices.
    for key in CATEGORICAL_FEATURE_KEYS:
      outputs[key] = tft.string_to_int(inputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)

    outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with tft.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These transformations
      # are just to get data into a format that the CSV converter can read,
      # in particular removing empty lines and removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | tft.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
          coder=tft.ExampleProtoCoder(transformed_metadata.schema))

      # Now apply the transform function to the test data. In this case we
      # also remove the header line from the CSV file and the trailing
      # period at the end of each line.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(
              lambda line: line and line != '|1x3 Cross validator')
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | tft.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
          coder=tft.ExampleProtoCoder(transformed_metadata.schema))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by tft.TRANSFORM_FN_DIR and
      # tft.TRANSFORMED_METADATA_DIR respectively.
      _ = (transform_fn
           | 'WriteTransformFn' >> tft.WriteTransformFn(working_dir))
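
# A sketch of driving this transform end to end, assuming the census income
# CSVs have been downloaded locally. The file names below are the conventional
# UCI ones, used here only for illustration:
import tempfile

working_dir = tempfile.mkdtemp()
transform_data('adult.data', 'adult.test', working_dir)
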
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
      shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses
    # this to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)

    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These transformations
      # are just to get data into a format that the CSV converter can read,
      # in particular removing empty lines and removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          transformed_train_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Now apply the transform function to the test data. In this case we
      # also remove the header line from the CSV file and the trailing
      # period at the end of each line.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(
              lambda line: line and line != '|1x3 Cross validator')
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          transformed_test_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transformed_metadata
           | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))
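
# Illustrative call; all paths are assumptions. Beam appends shard suffixes
# such as -00000-of-00001 to the two filebases when writing.
transform_data(
    train_data_file='adult.data',
    test_data_file='adult.test',
    transformed_train_filebase='/tmp/census_out/train_transformed',
    transformed_test_filebase='/tmp/census_out/test_transformed',
    transformed_metadata_dir='/tmp/census_out/metadata')
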
def run_transform(output_dir, schema, train_data_file, eval_data_file,
                  project, mode, preprocessing_fn=None):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: whether the job runs locally ('local') or on Dataflow ('cloud').
    preprocessing_fn: a function used to preprocess the raw data. If not
      specified, a function will be automatically inferred from the schema.
  """
  tft_input_metadata = make_tft_input_metadata(schema)
  temp_dir = os.path.join(output_dir, 'tmp')
  preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

  if mode == 'local':
    pipeline_options = None
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'pipeline-tft-' +
                    datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
        'temp_location': temp_dir,
        'project': project,
        'extra_packages': [
            'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
        ]
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DataflowRunner'
  else:
    raise ValueError("Invalid mode %s." % mode)

  with beam.Pipeline(runner, options=pipeline_options) as p:
    with beam_impl.Context(temp_dir=temp_dir):
      names = [x['name'] for x in schema]
      converter = CsvCoder(names, tft_input_metadata.schema)
      train_data = (
          p
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      train_dataset = (train_data, tft_input_metadata)
      transformed_dataset, transform_fn = (
          train_dataset
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      # Writes the transformed_metadata and transform_fn folders.
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(output_dir))
      # Write the raw metadata.
      metadata_io.write_metadata(
          metadata=tft_input_metadata,
          path=os.path.join(output_dir, 'metadata'))

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'train'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

      eval_data = (
          p
          | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
          | 'DecodeEvalData' >> beam.Map(converter.decode))

      eval_dataset = (eval_data, tft_input_metadata)

      transformed_eval_dataset = (
          (eval_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_eval_data, transformed_metadata = transformed_eval_dataset

      _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'eval'),
          coder=ExampleProtoCoder(transformed_metadata.schema))
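
# Local-mode sketch. The schema is the list-of-dicts format that
# make_tft_input_metadata expects; the two entries below and the file paths
# are assumptions for illustration only.
schema = [
    {'name': 'age', 'type': 'NUMBER'},
    {'name': 'workclass', 'type': 'CATEGORY'},
]
run_transform('/tmp/tft_output', schema, 'train.csv', 'eval.csv',
              project=None, mode='local')
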
def write_to_tfrecord(args):
    """This function is supposed to be called as a script."""
    # Decode arguments.
    current_index, num_shards, train_split_fname_out, eval_split_fname_out, \
        exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, \
        working_dir, data_formatter_module_path = args

    # num_shards = "32"
    current_index, num_shards = int(current_index), int(num_shards)

    split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    data_formatter = import_from_uri(
        data_formatter_module_path).DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Read raw data files: CSV format ordered according to the
        # `data_formatter`, then converted into a cleaned-up format.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                split_train_file_pattern, skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        raw_eval_data = (
            pipeline
            | 'ReadEvalDataFile' >> textio.ReadFromText(
                split_eval_file_pattern, skip_header_lines=0)
            | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        # Examples in tf-example format (for model analysis purposes).
        # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
        # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
        #
        # _ = (
        #     raw_eval_data
        #     | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        #     | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
        #         '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
        #         shard_name_template='', num_shards=1)
        # )

        # Read back the SavedModel and metadata written earlier to the two
        # subdirectories of working_dir given by
        # `transform_fn_io.TRANSFORM_FN_DIR` and
        # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        transform_fn = (
            pipeline
            | 'ReadTransformGraph' >>
            transform_fn_io.ReadTransformFn(working_dir))

        # Applies the transformation `transform_fn` to the raw train dataset.
        (transformed_train_data, transformed_metadata) = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Applies the transformation `transform_fn` to the raw eval dataset.
        (transformed_eval_data, transformed_metadata) = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The schema of the transformed data is used to build a coder, a
        # wrapper that encodes transformed examples into the TFRecord
        # (tf binary) data format.
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (transformed_train_data
             | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteTrainDataTFRecord' >> tfrecordio.WriteToTFRecord(
                 '{}-{:05}-of-{:05}'.format(
                     exp_log_data_file_train_tfrecord, current_index,
                     num_shards),
                 shard_name_template='', num_shards=1))

        _ = (transformed_eval_data
             | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteEvalDataTFRecord' >> tfrecordio.WriteToTFRecord(
                 '{}-{:05}-of-{:05}'.format(
                     exp_log_data_file_eval_tfrecord, current_index,
                     num_shards),
                 shard_name_template='', num_shards=1))

    result = pipeline.run()
    result.wait_until_finish()
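
# The docstring says this function is called as a script, so every argument
# arrives as a string. A sketch of the call, with hypothetical paths:
write_to_tfrecord((
    '0',                      # current_index
    '32',                     # num_shards
    'out/train_split',        # train_split_fname_out
    'out/eval_split',         # eval_split_fname_out
    'out/train_tfrecord',     # exp_log_data_file_train_tfrecord
    'out/eval_tfrecord',      # exp_log_data_file_eval_tfrecord
    'working_dir',            # working_dir holding the transform graph
    'my_package.formatter',   # data_formatter_module_path
))
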
def create_transform_fn(train_data_file, working_dir):
  """Create a transform function that can be run on-the-fly while training.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses
    # this to convert the strings to indices.
    for key in CATEGORICAL_FEATURE_KEYS:
      outputs[key] = tft.string_to_int(inputs[key])

    # The label column is passed through unchanged.
    outputs[LABEL_KEY] = inputs[LABEL_KEY]

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the mpg data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv. Note: `ordered_columns` is assumed
      # to be defined at module scope, alongside RAW_DATA_METADATA.
      converter = csv_coder.CsvCoder(ordered_columns,
                                     RAW_DATA_METADATA.schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These transformations
      # are just to get data into a format that the CSV converter can read,
      # in particular removing empty lines and removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
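
# Sketch of the "on-the-fly" use promised by the docstring: load the written
# transform graph and apply it to raw feature tensors inside an input
# pipeline. Assumes `working_dir` is the directory passed to
# create_transform_fn and a tensorflow_transform version providing
# TFTransformOutput (>= 0.8).
tft_output = tft.TFTransformOutput(working_dir)

def apply_transform(raw_features):
  # Maps a dict of raw feature tensors to transformed tensors using the
  # SavedModel written by WriteTransformFn above.
  return tft_output.transform_raw_features(raw_features)
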
def transform_data(train_data_file, test_data_file, working_dir,
                   root_train_data_out, root_test_data_out, pipeline_options):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
    root_train_data_out: Root of file containing transformed training data
    root_test_data_out: Root of file containing transformed test data
    pipeline_options: beam.pipeline.PipelineOptions defining Dataflow options
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # Bucketize numeric columns.
    for key in TO_BE_BUCKETIZED_FEATURE:
      outputs[key + '_bucketized'] = tft.bucketize(
          inputs[key], TO_BE_BUCKETIZED_FEATURE[key])

    # For categorical columns with a small vocabulary.
    for key in STRING_TO_INT_FEATURE_KEYS:
      outputs[key] = tft.string_to_int(inputs[key], vocab_filename=key)

    for key in HASH_STRING_FEATURE_KEYS:
      outputs[key] = tft.hash_strings(inputs[key],
                                      HASH_STRING_FEATURE_KEYS[key])

    # For the label column we transform the string into either 0 or 1.
    def convert_label(label):
      """Parses a string tensor into the label tensor.

      Args:
        label: Tensor of dtype string. Result of parsing the CSV column
          specified by LABEL_KEY.

      Returns:
        A Tensor of the same shape as `label`, containing the int64 label
        index for classification tasks.
      """
      table = lookup.index_table_from_tensor(['<=50K', '>50K'])
      return table.lookup(label)

    outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])

    return outputs

  def fix_comma_and_filter_third_column(line):
    # To avoid a NameError with DataflowRunner, the import of csv is done
    # locally; see
    # https://cloud.google.com/dataflow/faq#how-do-i-handle-nameerrors
    import csv
    cols = list(csv.reader([line], skipinitialspace=True))[0]
    return ','.join(cols[0:2] + cols[3:])

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline(options=pipeline_options) as pipeline:
    tmp_dir = pipeline_options.get_all_options()['temp_location']
    with beam_impl.Context(tmp_dir):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      converter = csv_coder.CsvCoder(ORDERED_COLUMNS,
                                     RAW_DATA_METADATA.schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These transformations
      # are just to get data into a format that the CSV converter can read,
      # in particular removing empty lines and removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasAndRemoveFieldTrainData' >> beam.Map(
              fix_comma_and_filter_third_column)
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(working_dir, root_train_data_out),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Now apply the transform function to the test data.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(lambda line: line)
          | 'FixCommasAndRemoveFieldTestData' >> beam.Map(
              fix_comma_and_filter_third_column)
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(working_dir, root_test_data_out),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
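
# Illustrative driver. The pipeline options must define temp_location, since
# the beam_impl.Context above reads it from get_all_options(); all values
# here are assumptions for illustration.
pipeline_options = beam.pipeline.PipelineOptions(
    flags=[], runner='DirectRunner', temp_location='/tmp/tft_tmp')
transform_data('adult.data', 'adult.test', '/tmp/tft_out',
               'train_transformed', 'test_transformed', pipeline_options)
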