def preprocess(in_test_mode):
  # NOTE: beam, beam_impl, tf, PROJECT, BUCKET, create_query, is_valid and
  # preprocess_tft are defined at module level in the surrounding notebook.
  import datetime
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-taxi-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000

  options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'max_num_workers': 24,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
  })
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.int64, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'hourofday,passengers'.split(',')
  })
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(raw_data_schema))

  # run Beam
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (raw_data_metadata
           | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
               os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
               pipeline=p))

      # analyze and transform training
      raw_data = (p
                  | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(
                      query=create_query(1, EVERY_N), use_standard_sql=True))
                  | 'train_filter' >> beam.Filter(is_valid))

      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # transform eval data
      raw_test_data = (p
                       | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(
                           query=create_query(2, EVERY_N), use_standard_sql=True))
                       | 'eval_filter' >> beam.Filter(is_valid))

      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))
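# A minimal usage sketch (not from the original notebook): preprocess() relies
# on the module-level names listed in the NOTE above, so this only illustrates
# how the flag switches between a local DirectRunner run and a Dataflow job.
preprocess(in_test_mode=True)     # small local run, writes to ./preproc_tft
# preprocess(in_test_mode=False)  # full Dataflow run, writes under gs://BUCKET/taxifare/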
def Transform(self, inputs, outputs, status_file): """Executes on request. This is the implementation part of transform executor. This is intended for using or extending the executor without artifact dependency. Args: inputs: A dictionary of labelled input values, including: - labels.COMPUTE_STATISTICS_LABEL: Whether compute statistics. - labels.SCHEMA_PATH_LABEL: Path to schema file. - labels.EXAMPLES_FILE_FORMAT_LABEL: Example file format, optional. - labels.EXAMPLES_DATA_FORMAT_LABEL: Example data format. - labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL: Paths or path patterns to analyze and transform data. - labels.TRANSFORM_DATA_PATHS_LABEL: Paths or path patterns to transform only data. - labels.TFT_STATISTICS_USE_TFDV_LABEL: Whether use tfdv to compute statistics. - labels.PREPROCESSING_FN: Path to a Python module that contains the preprocessing_fn, optional. outputs: A dictionary of labelled output values, including: - labels.PER_SET_STATS_OUTPUT_PATHS_LABEL: Paths to statistics output, optional. - labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: A path to TFTransformOutput output. - labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: Paths to transform materialization. - labels.TEMP_OUTPUT_LABEL: A path to temporary directory. status_file: Where the status should be written (not yet implemented) """ del status_file # unused compute_statistics = common.GetSoleValue( inputs, labels.COMPUTE_STATISTICS_LABEL) transform_output_path = common.GetSoleValue( outputs, labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL) raw_examples_data_format = common.GetSoleValue( inputs, labels.EXAMPLES_DATA_FORMAT_LABEL) schema = common.GetSoleValue(inputs, labels.SCHEMA_PATH_LABEL) input_dataset_schema = self._ReadSchema(raw_examples_data_format, schema) input_dataset_metadata = dataset_metadata.DatasetMetadata( input_dataset_schema) tf.logging.info( 'Inputs to executor.Transform function: {}'.format(inputs)) tf.logging.info( 'Outputs to executor.Transform function: {}'.format(outputs)) # NOTE: We disallow an empty schema, which we detect by testing the # number of columns. While in principal an empty schema is valid, in # practice this is a sign of a user error, and this is a convenient # place to catch that error. if (not input_dataset_metadata.schema.as_feature_spec() and not self._ShouldDecodeAsRawExample(raw_examples_data_format)): raise ValueError(messages.SCHEMA_EMPTY) preprocessing_fn = self._GetPreprocessingFn(inputs, outputs) materialize_output_paths = common.GetValues( outputs, labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL) feature_spec = input_dataset_metadata.schema.as_feature_spec() # Inspecting the preprocessing_fn even if we know we need a full pass in # order to fail faster if it fails. try: analyze_input_columns = tft.get_analyze_input_columns( preprocessing_fn, feature_spec) except AttributeError: # If using TFT 1.12, fall back to assuming all features are used. analyze_input_columns = feature_spec.keys() if not compute_statistics and not materialize_output_paths: if analyze_input_columns: tf.logging.warning( 'Not using the in-place Transform because the following features ' 'require analyzing: {}'.format( tuple(c for c in analyze_input_columns))) else: tf.logging.warning( 'Using the in-place Transform since compute_statistics=False, ' 'it does not materialize transformed data, and the configured ' 'preprocessing_fn appears to not require analyzing the data.' ) self._RunInPlaceImpl(preprocessing_fn, input_dataset_metadata, transform_output_path) # TODO(b/122478841): Writes status to status file. 
return self._RunBeamImpl(inputs, outputs, preprocessing_fn, input_dataset_metadata, raw_examples_data_format, transform_output_path, compute_statistics, materialize_output_paths)
def test_single_phase_run_twice(self): span_0_key = 'span-0' span_1_key = 'span-1' def preprocessing_fn(inputs): _ = tft.vocabulary(inputs['s']) _ = tft.bucketize(inputs['x'], 2, name='bucketize') return { 'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']), 'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']), 'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']), 'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']), } input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}] input_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec({ 'x': tf.io.FixedLenFeature([], tf.float32), 'y': tf.io.FixedLenFeature([], tf.float32), 's': tf.io.FixedLenFeature([], tf.string), })) input_data_dict = { span_0_key: [{ 'x': -2, 'y': 1, 's': 'a', }, { 'x': 4, 'y': -4, 's': 'a', }], span_1_key: input_data, } with beam_impl.Context(temp_dir=self.get_temp_dir()): with beam.Pipeline() as p: flat_data = p | 'CreateInputData' >> beam.Create( list(itertools.chain(*input_data_dict.values()))) # This is needed due to b/123895600. for a, b in six.iteritems(input_data_dict): input_data_dict[a] = p | a >> beam.Create(b) transform_fn, cache_output = ( (flat_data, input_data_dict, {}, input_metadata) | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))) _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS( self._cache_dir) transformed_dataset = ( ((input_data_dict[span_1_key], input_metadata), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) transformed_data, unused_transformed_metadata = transformed_dataset expected_transformed_data = [ { 'x_mean': 6.0, 'x_min': -2.0, 'y_mean': -0.25, 'y_min': -4.0, }, { 'x_mean': 6.0, 'x_min': -2.0, 'y_mean': -0.25, 'y_min': -4.0, }, ] beam_test_util.assert_that( transformed_data, beam_test_util.equal_to(expected_transformed_data), label='first') transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn') _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir) for key in input_data_dict: self.assertIn(key, cache_output) self.assertEqual(6, len(cache_output[key])) transform_fn, second_output_cache = ( (flat_data, input_data_dict, cache_output, input_metadata) | 'AnalyzeAgain' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))) dot_string = nodes.get_dot_graph( [analysis_graph_builder._ANALYSIS_GRAPH]).to_string() self.WriteRenderedDotFile(dot_string) transformed_dataset = ( ((input_data_dict[span_1_key], input_metadata), transform_fn) | 'TransformAgain' >> beam_impl.TransformDataset()) transformed_data, unused_transformed_metadata = transformed_dataset beam_test_util.assert_that( transformed_data, beam_test_util.equal_to(expected_transformed_data), label='second') self.assertFalse(second_output_cache)
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
  """Runs the whole preprocessing step.

  This runs the feature extraction PTransform, validates that the data conforms
  to the schema provided, normalizes the features, and splits the dataset into
  a training and evaluation dataset.
  """

  # Populate optional arguments
  if not feature_scaling:
    feature_scaling = lambda inputs: inputs

  # Type checking
  if not isinstance(labels, list):
    raise ValueError(
        '`labels` must be list(str). '
        'Given: {} {}'.format(labels, type(labels)))

  if not isinstance(feature_extraction, beam.PTransform):
    raise ValueError(
        '`feature_extraction` must be {}. '
        'Given: {} {}'.format(beam.PTransform,
                              feature_extraction, type(feature_extraction)))

  if not callable(feature_scaling):
    raise ValueError(
        '`feature_scaling` must be callable. '
        'Given: {} {}'.format(feature_scaling, type(feature_scaling)))

  if beam_options and not isinstance(beam_options, PipelineOptions):
    raise ValueError(
        '`beam_options` must be {}. '
        'Given: {} {}'.format(PipelineOptions,
                              beam_options, type(beam_options)))

  if not work_dir:
    work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

  tft_temp_dir = os.path.join(work_dir, 'tft-temp')
  train_dataset_dir = os.path.join(work_dir, 'train-dataset')
  eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

  transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
  if tf.gfile.Exists(transform_fn_dir):
    tf.gfile.DeleteRecursively(transform_fn_dir)

  with beam.Pipeline(options=beam_options) as p, \
       beam_impl.Context(temp_dir=tft_temp_dir):

    # [START feature_extraction]
    dataset = (
        p
        | 'Feature extraction' >> feature_extraction
        | 'Validate inputs' >> beam.ParDo(
            ValidateInputData(input_feature_spec)))
    # [END feature_extraction]

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(input_feature_spec))

    # [START analyze_and_transform_dataset]
    dataset_and_metadata, transform_fn = (
        (dataset, input_metadata)
        | 'Feature scaling' >> beam_impl.AnalyzeAndTransformDataset(
            feature_scaling))
    dataset, metadata = dataset_and_metadata
    # [END analyze_and_transform_dataset]

    # [START split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
    train_dataset, eval_dataset = (
        dataset
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END split_to_train_and_eval_datasets]

    # [START write_tfrecords]
    # Write the datasets as TFRecords
    coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

    train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
    _ = (
        train_dataset
        | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
            train_dataset_prefix, coder))

    eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
    _ = (
        eval_dataset
        | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
            eval_dataset_prefix, coder))

    # Write the transform_fn
    _ = (
        transform_fn
        | 'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
    # [END write_tfrecords]

  return PreprocessData(
      input_feature_spec,
      labels,
      train_dataset_prefix + '*',
      eval_dataset_prefix + '*')
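# Hedged usage sketch for run() above (the feature names and values are
# illustrative, not part of the original module): any beam.PTransform that
# emits dicts matching `input_feature_spec` works as the feature-extraction
# step, so a beam.Create of in-memory dicts is enough to exercise it locally.
if __name__ == '__main__':
  example_feature_spec = {
      'feature_a': tf.FixedLenFeature([], tf.float32),
      'label': tf.FixedLenFeature([], tf.float32),
  }
  preprocess_data = run(
      input_feature_spec=example_feature_spec,
      labels=['label'],
      feature_extraction=beam.Create([
          {'feature_a': 1.0, 'label': 0.0},
          {'feature_a': 4.0, 'label': 1.0},
      ]),
      eval_percent=20.0)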
    self._fn = fn

  def expand(self, pcoll):
    return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))


RAW_DATA_FEATURE_SPEC = dict(
    [(name, tf.io.FixedLenFeature([], tf.string))
     for name in CATEGORICAL_FEATURE_KEYS] +
    [(name, tf.io.FixedLenFeature([], tf.float32))
     for name in NUMERIC_FEATURE_KEYS] +
    [(name, tf.io.VarLenFeature(tf.float32))
     for name in OPTIONAL_NUMERIC_FEATURE_KEYS] +
    [(LABEL_KEY, tf.io.FixedLenFeature([], tf.string))])

RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC))

# Constants used for training. Note that the number of instances will be
# computed by tf.Transform in future versions, in which case it can be read
# from the metadata. Similarly BUCKET_SIZES will not be needed as this
# information will be stored in the metadata for each of the columns. The
# bucket size includes all listed categories in the dataset description as
# well as one extra for "?" which represents unknown.
TRAIN_BATCH_SIZE = 128
TRAIN_NUM_EPOCHS = 200
NUM_TRAIN_INSTANCES = 32561
NUM_TEST_INSTANCES = 16281

# Names of temp files
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
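# A hedged sketch of how RAW_DATA_METADATA is typically consumed (assumes
# `import tensorflow_transform as tft` and a TFT version that still ships
# tft.coders.CsvCoder; the column ordering below is illustrative, the real
# CSV column order is defined elsewhere in the census example).
_example_csv_columns = (CATEGORICAL_FEATURE_KEYS + NUMERIC_FEATURE_KEYS +
                        OPTIONAL_NUMERIC_FEATURE_KEYS + [LABEL_KEY])
_csv_coder = tft.coders.CsvCoder(_example_csv_columns, RAW_DATA_METADATA.schema)
_example_coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)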
def analyze_in_place(preprocessing_fn, force_tf_compat_v1, feature_specs,
                     type_specs, transform_output_path):
  """Analyzes the `preprocessing_fn` in-place without looking at the data.

  This should only be used if the `preprocessing_fn` contains no TFT
  analyzers or TFT mappers that use analyzers.

  Writes out a transform function and transformed metadata to subdirs under
  `transform_output_path`.

  Args:
    preprocessing_fn: The tf.Transform preprocessing_fn.
    force_tf_compat_v1: If True, call Transform's API to use Tensorflow in
      tf.compat.v1 mode.
    feature_specs: a Dict from input feature key to its feature spec.
    type_specs: a Dict from input feature key to its type spec.
    transform_output_path: An absolute path to write the output to.

  Raises:
    RuntimeError if `preprocessing_fn` contains TFT analyzers.
  """
  use_tf_compat_v1 = tf2_utils.use_tf_compat_v1(force_tf_compat_v1)
  transform_fn_path = os.path.join(transform_output_path,
                                   TFTransformOutput.TRANSFORM_FN_DIR)
  if use_tf_compat_v1:
    graph, structured_inputs, structured_outputs = (
        trace_preprocessing_function(preprocessing_fn, feature_specs,
                                     use_tf_compat_v1=use_tf_compat_v1))
    _assert_no_analyzers_in_graph(graph)
    with tf.compat.v1.Session(graph=graph) as sess:
      sess.run(tf.compat.v1.global_variables_initializer())
      sess.run(tf.compat.v1.tables_initializer())
      saved_transform_io.write_saved_transform_from_session(
          sess, structured_inputs, structured_outputs, transform_fn_path)

      transformed_metadata = dataset_metadata.DatasetMetadata(
          schema=schema_inference.infer_feature_schema(
              structured_outputs, graph, sess))
  else:
    concrete_transform_fn, concrete_metadata_fn = (
        trace_and_write_v2_saved_model(saved_model_dir=transform_fn_path,
                                       preprocessing_fn=preprocessing_fn,
                                       input_signature=type_specs,
                                       base_temp_dir=None,
                                       tensor_replacement_map=None,
                                       output_keys_to_name_map=None))
    _assert_no_analyzers_in_graph(concrete_transform_fn.graph)
    # This should be a no-op as if concrete_metadata_fn is None,
    # `_assert_no_analyzers_in_graph` should have raised an error.
    assert concrete_metadata_fn
    structured_outputs = tf.nest.pack_sequence_as(
        structure=concrete_transform_fn.structured_outputs,
        flat_sequence=concrete_transform_fn.outputs,
        expand_composites=True)
    transformed_metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema_v2(
            structured_outputs,
            concrete_metadata_fn,
            evaluate_schema_overrides=True))

  transformed_metadata_dir = os.path.join(
      transform_output_path, TFTransformOutput.TRANSFORMED_METADATA_DIR)
  metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)
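# Hedged usage sketch for analyze_in_place() (the preprocessing_fn, specs and
# output path below are illustrative): because the function rejects any
# preprocessing_fn containing analyzers, it only suits pure row-level mappers.
def _identity_preprocessing_fn(inputs):
  # No tft analyzers here, so in-place analysis is allowed.
  return {key + '_copy': tensor for key, tensor in inputs.items()}

_feature_specs = {'x': tf.io.FixedLenFeature([], tf.float32)}
_type_specs = {'x': tf.TensorSpec([None], tf.float32)}
analyze_in_place(_identity_preprocessing_fn,
                 force_tf_compat_v1=False,
                 feature_specs=_feature_specs,
                 type_specs=_type_specs,
                 transform_output_path='/tmp/inplace_transform_output')  # illustrative path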
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import futures
from tensorflow_transform.tf_metadata import metadata_io
import unittest
from tensorflow.python.framework import test_util
from tensorflow.python.lib.io import file_io


_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (1, 3, 2), dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default': dataset_schema.ColumnSchema(
        tf.float32, (1, 3, 2),
        dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm': dataset_schema.ColumnSchema(
        tf.float32, (None,), dataset_schema.ListColumnRepresentation())
})

_TEST_METADATA_WITH_FUTURES = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (1, 3, 2), dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default': dataset_schema.ColumnSchema(
        tf.float32, (1, futures.Future('a'), 2),
        dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm':
def expand(self, dataset_and_transform_fn): """Transforms the dataset using the transform_fn. Args: dataset_and_transform_fn: A tuple of dataset and preprocessing function. Returns: A dataset transformed according to the transform_fn. """ (input_values, input_metadata), (transform_fn, output_metadata) = ( dataset_and_transform_fn) # If exclude_outputs is set, update the output metadata. if self._exclude_outputs is not None: if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata): # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections dict. output_metadata, pcollections = output_metadata schema = output_metadata.schema # Update DatasetMetadata to remove excluded outputs output_metadata = dataset_metadata.DatasetMetadata( schema=dataset_schema.Schema({ key: column_schema for key, column_schema in six.iteritems(schema.column_schemas) if key not in self._exclude_outputs })) # Update pcollections to keep only pcollections that resolve futures in # the updated metadata. unresolved_future_names = set( future.name for future in output_metadata.substitute_futures({})) pcollections = { name: pcollection for name, pcollection in six.iteritems(pcollections) if name in unresolved_future_names } # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata output_metadata = beam_metadata_io.BeamDatasetMetadata( output_metadata, pcollections) else: schema = output_metadata.schema output_metadata = dataset_metadata.DatasetMetadata( schema=dataset_schema.Schema({ key: column_schema for key, column_schema in six.iteritems(schema.column_schemas) if key not in self._exclude_outputs })) def convert_and_unbatch(batch_dict): return impl_helper.to_instance_dicts(output_metadata.schema, batch_dict) serialized_tf_config = ( analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get( # pylint: disable=protected-access self.pipeline.runner)) output_instances = ( input_values | 'Batch' >> _BatchElements() | 'Transform' >> beam.ParDo( _RunMetaGraphDoFn( input_metadata.schema, serialized_tf_config, shared_graph_state_handle=shared.Shared(), exclude_outputs=self._exclude_outputs), saved_model_dir=beam.pvalue.AsSingleton(transform_fn)) | 'ConvertAndUnbatch' >> beam.FlatMap(convert_and_unbatch)) _clear_shared_state_after_barrier(self.pipeline, output_instances) return (output_instances, output_metadata)
def testPreprocessingFn(self): schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt') schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema()) feature_spec = taxi_utils._get_raw_feature_spec(schema) working_dir = self.get_temp_dir() transform_graph_path = os.path.join(working_dir, 'transform_graph') transformed_examples_path = os.path.join(working_dir, 'transformed_examples') # Run very simplified version of executor logic. # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults. # Generate legacy `DatasetMetadata` object. Future version of Transform # will accept the `Schema` proto directly. legacy_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec(feature_spec)) decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema) with beam.Pipeline() as p: with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')): examples = ( p | 'ReadTrainData' >> beam.io.ReadFromTFRecord( os.path.join(self._testdata_path, 'csv_example_gen/train/*'), coder=beam.coders.BytesCoder(), # TODO(b/114938612): Eventually remove this override. validate=False) | 'DecodeTrainData' >> beam.Map(decoder.decode)) (transformed_examples, transformed_metadata), transform_fn = ( (examples, legacy_metadata) | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset( taxi_utils.preprocessing_fn)) # WriteTransformFn writes transform_fn and metadata to subdirectories # tensorflow_transform.SAVED_MODEL_DIR and # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively. # pylint: disable=expression-not-assigned (transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_graph_path)) encoder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) (transformed_examples | 'EncodeTrainData' >> beam.Map(encoder.encode) | 'WriteTrainData' >> beam.io.WriteToTFRecord( os.path.join(transformed_examples_path, 'train/transformed_examples.gz'), coder=beam.coders.BytesCoder())) # pylint: enable=expression-not-assigned # Verify the output matches golden output. # NOTE: we don't verify that transformed examples match golden output. expected_transformed_schema = io_utils.parse_pbtxt_file( os.path.join( self._testdata_path, 'transform/transform_graph/transformed_metadata/schema.pbtxt'), schema_pb2.Schema()) transformed_schema = io_utils.parse_pbtxt_file( os.path.join(transform_graph_path, 'transformed_metadata/schema.pbtxt'), schema_pb2.Schema()) # Clear annotations so we only have to test main schema. transformed_schema.ClearField('annotation') for feature in transformed_schema.feature: feature.ClearField('annotation') self.assertEqual(transformed_schema, expected_transformed_schema)
train_ques = file.readlines()[1:]
xys = []
for i in range(len(train_ques)):
    train_ques_sp = train_ques[i].strip().replace(" ", "").replace("\n", "").replace("\r", "").upper().split(",")
    train_ques_sp_list = list(train_ques_sp[1])
    len_real = len(train_ques_sp_list)
    train_ques_sp_list_pad = train_ques_sp_list[:LEN_MAX] if len_real > LEN_MAX else train_ques_sp_list + ["*"] * (LEN_MAX - len_real)
    xy_json = {"x": train_ques_sp_list_pad, "y": train_ques_sp[0]}
    xys.append(xy_json)

# Input feature spec for the graph
STRING_FEATURE = {'x': tf.io.FixedLenFeature([LEN_MAX], tf.string),
                  'y': tf.io.FixedLenFeature([], tf.string)}
DATA_STRING_FEATURE_SPEC = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(STRING_FEATURE))


def parser(x, y):
    features = {"x": x, "y": y}
    return features


def train_input_fn(train, batch_size=64):
    x_train, y_train = train
    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(y_train))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
def expand(self, dataset): """Analyze the dataset. Args: dataset: A dataset. Returns: A TransformFn containing the deferred transform function. """ input_values, input_metadata = dataset input_schema = input_metadata.schema base_temp_dir = Context.create_base_temp_dir() graph = tf.Graph() with graph.as_default(): # NOTE: it's important that create_phases is called directly after # run_preprocessing_fn, because we later mutate the graph's # TABLE_INITIALIZERS collection which would break the logic in # create_phases. inputs, outputs = impl_helper.run_preprocessing_fn( self._preprocessing_fn, input_schema) phases = impl_helper.create_phases() # Iterate through levels. tensor_pcoll_mapping is a mapping from tensor # names to singleton PCollections containing a _TensorValue. We compute # tensor_pcoll_mapping in phases, where at each phase we compute the # analyzers that are ready to run and update tensor_pcoll_mapping. tensor_pcoll_mapping = {} table_initializers = graph.get_collection_ref( tf.GraphKeys.TABLE_INITIALIZERS) original_table_initializers = list(table_initializers) del table_initializers[:] serialized_tf_config = ( analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get( # pylint: disable=protected-access input_values.pipeline.runner)) for level, phase in enumerate(phases): # Create a SavedModel that describes the mapping from the input data # to the inputs of the analyzers at this level. The colum names of the # outputs are the tensor names of the analyzer inputs in the graph. # This graph has the anaylzer outputs computed so far replaced with # constants. analyzer_inputs = {} for analyzer in phase.analyzers: for input_tensor in analyzer.inputs: analyzer_inputs[input_tensor.name] = input_tensor table_initializers.extend(phase.table_initializers) unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir) _write_saved_transform(graph, inputs, analyzer_inputs, unbound_saved_model_dir) saved_model_dir = ( tensor_pcoll_mapping | 'CreateSavedModelForAnaylzerInputs[%d]' % level >> _ReplaceTensorsWithConstants(unbound_saved_model_dir, base_temp_dir, input_values.pipeline)) # Run this saved model on the input dataset to obtain the inputs to the # analyzers. analyzer_input_values = ( input_values | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements() | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo( _RunMetaGraphDoFn( input_schema, serialized_tf_config, shared_graph_state_handle=shared.Shared()), saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir))) # Compute the analyzers from their inputs. `analyzer_outputs_dict` is a # map from tensor names to singleton PCollections of `_TensorValue`s. analyzer_outputs_dict = ( analyzer_input_values | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs( phase.analyzers, base_temp_dir)) # Update the mapping for all analyzers. tensor_pcoll_mapping.update(analyzer_outputs_dict) del table_initializers[:] table_initializers.extend(original_table_initializers) saved_model_dir = _make_unique_temp_dir(base_temp_dir) _write_saved_transform(graph, inputs, outputs, saved_model_dir) transform_fn = ( tensor_pcoll_mapping | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants( saved_model_dir, base_temp_dir, input_values.pipeline)) # Infer metadata. The metadata may contain Futures that refer to the # values of tensors in the graph. In that case, the tensors must be # "constant" in that they don't depend on input data. The tensors can # depend on analyzer outputs though. 
This allows us to set metadata that # depends on analyzer outputs. # # We first extract the names of the tensors that are referenced by the # Futures, and then compute them by calling _ComputeScalarConstants with # the tensor-PCollection mapping representing the analyzer outputs. metadata = dataset_metadata.DatasetMetadata( schema=impl_helper.infer_feature_schema(outputs)) deferred_metadata_tensor_names = [ future.name for column_schema in tft_api.get_column_schemas().values() for future in column_schema.substitute_futures({}) ] name_pcoll_dict = ( tensor_pcoll_mapping | 'ComputeTensorValues' >> _ComputeTensorValues(deferred_metadata_tensor_names, saved_model_dir, input_values.pipeline)) full_metadata = beam_metadata_io.BeamDatasetMetadata( metadata, name_pcoll_dict) _clear_shared_state_after_barrier(input_values.pipeline, transform_fn) return transform_fn, full_metadata
def test_non_frequency_vocabulary_merge(self): """This test compares vocabularies produced with and without cache.""" mi_vocab_name = 'mutual_information_vocab' adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab' weighted_frequency_vocab_name = 'weighted_frequency_vocab' def preprocessing_fn(inputs): _ = tft.vocabulary( inputs['s'], labels=inputs['label'], store_frequency=True, vocab_filename=mi_vocab_name, min_diff_from_avg=0.1, use_adjusted_mutual_info=False) _ = tft.vocabulary( inputs['s'], labels=inputs['label'], store_frequency=True, vocab_filename=adjusted_mi_vocab_name, min_diff_from_avg=1.0, use_adjusted_mutual_info=True) _ = tft.vocabulary( inputs['s'], weights=inputs['weight'], store_frequency=True, vocab_filename=weighted_frequency_vocab_name, use_adjusted_mutual_info=False) return inputs span_0_key = 'span-0' span_1_key = 'span-1' input_data = [ dict(s='a', weight=1, label=1), dict(s='a', weight=0.5, label=1), dict(s='b', weight=0.75, label=1), dict(s='b', weight=1, label=0), ] input_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec({ 's': tf.FixedLenFeature([], tf.string), 'label': tf.FixedLenFeature([], tf.int64), 'weight': tf.FixedLenFeature([], tf.float32), })) input_data_dict = { span_0_key: input_data, span_1_key: input_data, } with beam_impl.Context(temp_dir=self.get_temp_dir()): flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten() transform_fn_with_cache, output_cache = ( (flat_data, input_data_dict, {}, input_metadata) | (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))) expected_accumulators = { '__v0__VocabularyAccumulate--vocabulary--': [ b'["a", [2, 1.0, 1.0]]', b'["b", [2, 0.5, 1.0]]' ], '__v0__VocabularyAccumulate--vocabulary_1--': [ b'["a", [2, 1.0, 1.0]]', b'["b", [2, 0.5, 1.0]]' ], '__v0__VocabularyAccumulate--vocabulary_2--': [ b'["a", 1.5]', b'["b", 1.75]' ], } spans = [span_0_key, span_1_key] self.assertCountEqual(output_cache.keys(), spans) for span in spans: self.assertCountEqual(output_cache[span].keys(), expected_accumulators.keys()) for key, value in six.iteritems(expected_accumulators): self.assertCountEqual(output_cache[span][key], value) transform_fn_no_cache = ((input_data * 2, input_metadata) | (beam_impl.AnalyzeDataset(preprocessing_fn))) transform_fn_with_cache_dir = os.path.join(self.base_test_dir, 'transform_fn_with_cache') _ = transform_fn_with_cache | tft_beam.WriteTransformFn( transform_fn_with_cache_dir) transform_fn_no_cache_dir = os.path.join(self.base_test_dir, 'transform_fn_no_cache') _ = transform_fn_no_cache | tft_beam.WriteTransformFn( transform_fn_no_cache_dir) tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir) tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir) for vocab_filename in (mi_vocab_name, adjusted_mi_vocab_name, weighted_frequency_vocab_name): cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename) no_cache_path = tft_output_no_cache.vocabulary_file_by_name( vocab_filename) with tf.gfile.Open(cache_path, 'rb') as f1, tf.gfile.Open( no_cache_path, 'rb') as f2: self.assertEqual( f1.readlines(), f2.readlines(), 'vocab with cache != vocab without cache for: {}'.format( vocab_filename))
                for i in range(28) for j in range(28)]

label_field = "label"


def preprocessing_fn(inputs):
    # TODO: pre-process the fields to 0 to 1 range inputs
    result = {label_field: inputs[label_field]}
    for field_name in pixel_fields:
        result[field_name + "_norm"] = tft.scale_to_0_1(inputs[field_name])
    return result


input_data_schema = dataset_schema.from_feature_spec(dict(
    [(name, tf.io.FixedLenFeature([], tf.int64)) for name in pixel_fields] +
    [(label_field, tf.io.FixedLenFeature([], tf.int64))]))
input_data_metadata = dataset_metadata.DatasetMetadata(input_data_schema)

fetch_data()
train_data_file = mnist_path + "/train.csv"

with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        columns = [label_field] + pixel_fields
        converter = tft.coders.CsvCoder(columns, input_data_schema)

        input_data = (
            pipeline
            | 'ReadInputData' >> beam.io.ReadFromText(train_data_file)
            | 'CleanInputData' >> beam.Map(converter.decode))

        input_dataset = (input_data, input_data_metadata)
        transformed_dataset, transform_fn = (
            input_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
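        # A possible continuation (not part of the original snippet): write the
        # transformed examples and the transform_fn, mirroring the other
        # pipelines in this collection. The output paths are illustrative.
        transformed_data, transformed_metadata = transformed_dataset
        transformed_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        _ = (
            transformed_data
            | 'EncodeTransformedData' >> beam.Map(transformed_coder.encode)
            | 'WriteTransformedData' >> beam.io.WriteToTFRecord(
                mnist_path + "/transformed/part"))
        _ = transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
            mnist_path + "/transform_output")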
def expand(self, dataset): """Analyze the dataset. Args: dataset: A dataset. Returns: A TransformFn containing the deferred transform function. Raises: ValueError: If preprocessing_fn has no outputs. """ input_values, input_metadata = dataset input_schema = input_metadata.schema base_temp_dir = Context.create_base_temp_dir() graph = tf.Graph() with graph.as_default(): with tf.name_scope('inputs'): inputs = input_schema.as_batched_placeholders() # In order to avoid a bug where import_graph_def fails when the input_map # and return_elements of an imported graph are the same (b/34288791), we # avoid using the placeholder of an input column as an output of a graph. # We do this by applying tf.identity to all inputs of the # preprocessing_fn. Note this applies at the level of raw tensors. outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs)) # At this point we check that the preprocessing_fn has at least one # output. This is because if we allowed the output of preprocessing_fn to # be empty, we wouldn't be able to determine how many instances to # "unbatch" the output into. if not outputs: raise ValueError( 'The preprocessing function returned an empty dict') if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): raise ValueError( 'The preprocessing function contained trainable variables ' '{}'.format( graph.get_collection_ref( tf.GraphKeys.TRAINABLE_VARIABLES))) # NOTE: it's important that create_phases is called directly after # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS # collection which would break the logic in create_phases. phases = impl_helper.create_phases() # Iterate through levels. tensor_pcoll_mapping is a mapping from tensor # names to singleton PCollections containing a _TensorValue. We compute # tensor_pcoll_mapping in phases, where at each phase we compute the # analyzers that are ready to run and update tensor_pcoll_mapping. tensor_pcoll_mapping = {} table_initializers = graph.get_collection_ref( tf.GraphKeys.TABLE_INITIALIZERS) original_table_initializers = list(table_initializers) del table_initializers[:] serialized_tf_config = ( analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get( # pylint: disable=protected-access input_values.pipeline.runner)) for level, phase in enumerate(phases): # Create a SavedModel that describes the mapping from the input data # to the inputs of the analyzers at this level. The colum names of the # outputs are the tensor names of the analyzer inputs in the graph. # This graph has the anaylzer outputs computed so far replaced with # constants. analyzer_inputs = {} for analyzer in phase.analyzers: for input_tensor in analyzer.inputs: analyzer_inputs[input_tensor.name] = input_tensor table_initializers.extend(phase.table_initializers) unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir) _write_saved_transform(graph, inputs, analyzer_inputs, unbound_saved_model_dir) saved_model_dir = (tensor_pcoll_mapping | 'CreateSavedModelForAnalyzerInputs[%d]' % level >> _ReplaceTensorsWithConstants( unbound_saved_model_dir, base_temp_dir, input_values.pipeline)) # Run this saved model on the input dataset to obtain the inputs to the # analyzers. analyzer_input_values = ( input_values | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements() | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo( _RunMetaGraphDoFn( input_schema, serialized_tf_config, shared_graph_state_handle=shared.Shared()), saved_model_dir=beam.pvalue.AsSingleton( saved_model_dir))) # Compute the analyzers from their inputs. 
`analyzer_outputs_dict` is a # map from tensor names to singleton PCollections of `_TensorValue`s. analyzer_outputs_dict = ( analyzer_input_values | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir)) # Update the mapping for all analyzers. tensor_pcoll_mapping.update(analyzer_outputs_dict) del table_initializers[:] table_initializers.extend(original_table_initializers) saved_model_dir = _make_unique_temp_dir(base_temp_dir) _write_saved_transform(graph, inputs, outputs, saved_model_dir) transform_fn = ( tensor_pcoll_mapping | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants( saved_model_dir, base_temp_dir, input_values.pipeline)) # Infer metadata. The metadata may contain Futures that refer to the # values of tensors in the graph. In that case, the tensors must be # "constant" in that they don't depend on input data. The tensors can # depend on analyzer outputs though. This allows us to set metadata that # depends on analyzer outputs. # # We first extract the names of the tensors that are referenced by the # Futures, and then compute them by calling _ComputeScalarConstants with # the tensor-PCollection mapping representing the analyzer outputs. metadata = dataset_metadata.DatasetMetadata( schema=impl_helper.infer_feature_schema(outputs)) deferred_metadata_tensor_names = { future.name for column_schema in metadata.schema.column_schemas.values() for future in column_schema.substitute_futures({}) } name_pcoll_dict = (tensor_pcoll_mapping | 'ComputeTensorValues' >> _ComputeTensorValues( deferred_metadata_tensor_names, saved_model_dir, input_values.pipeline)) full_metadata = beam_metadata_io.BeamDatasetMetadata( metadata, name_pcoll_dict) _clear_shared_state_after_barrier(input_values.pipeline, transform_fn) return transform_fn, full_metadata
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test metadata for tft_beam_io tests."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_metadata.proto.v0 import schema_pb2

_FEATURE_SPEC = {
    'fixed_column': tf.io.FixedLenFeature([3], tf.string),
    'list_columm': tf.io.VarLenFeature(tf.int64),
}

COMPLETE_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(
        _FEATURE_SPEC,
        domains={'list_columm': schema_pb2.IntDomain(min=-1, max=5)}))

INCOMPLETE_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(
        _FEATURE_SPEC,
        # Values will be overridden by those in COMPLETE_METADATA
        domains={'list_columm': schema_pb2.IntDomain(min=0, max=0)}))
      num_ranking_candidate_movie_ids=0,
      negative_sample_ratio=args.negative_sample_ratio,
      negative_sample_label=args.negative_sample_label,
      movie_rating_history=args.movie_rating_history)

  movies_sideinput = beam.pvalue.AsDict(movies_data)
  eval_data |= 'BuildEvalFeatures' >> beam.ParDo(
      BuildExampleFn(args.random_seed),
      movies_data=movies_sideinput,
      rating_threshold=args.eval_score_threshold,
      is_ranking_problem=(args.eval_type == RANKING),
      is_train=False,
      num_ranking_candidate_movie_ids=args.num_ranking_candidate_movie_ids)

  # TFTransform based preprocessing.
  raw_metadata = dataset_metadata.DatasetMetadata(
      schema=movielens.make_examples_schema())
  _ = (raw_metadata
       | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'raw_metadata'), pipeline))

  preprocessing_fn = movielens.make_preprocessing_fn()
  transform_fn = ((train_data, raw_metadata)
                  | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

  @beam.ptransform_fn
  def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, raw_metadata), transform_fn)
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transform_fn_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
      shards
    transformed_test_filebase: Base filename for transformed test data shards
    transform_fn_dir: Directory where metadata for transform function should
      be written
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))
      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_COLUMN: review_bow_indices,
            REVIEW_WEIGHT: review_weight,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(transform_fn_dir))
def _RunBeamImpl(self, inputs: Mapping[Text, Any], outputs: Mapping[Text, Any], preprocessing_fn: Any, input_dataset_metadata: dataset_metadata.DatasetMetadata, raw_examples_data_format: Text, transform_output_path: Text, compute_statistics: bool, materialize_output_paths: Sequence[Text]) -> _Status: """Perform data preprocessing with FlumeC++ runner. Args: inputs: A dictionary of labelled input values. outputs: A dictionary of labelled output values. preprocessing_fn: The tf.Transform preprocessing_fn. input_dataset_metadata: A DatasetMetadata object for the input data. raw_examples_data_format: A string describing the raw data format. transform_output_path: An absolute path to write the output to. compute_statistics: A bool indicating whether or not compute statistics. materialize_output_paths: Paths to materialized outputs. Raises: RuntimeError: If reset() is not being invoked between two run(). ValueError: If the schema is empty. Returns: Status of the execution. """ raw_examples_file_format = common.GetSoleValue( inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False) analyze_and_transform_data_paths = common.GetValues( inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL) transform_only_data_paths = common.GetValues( inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL) stats_use_tfdv = common.GetSoleValue(inputs, labels.TFT_STATISTICS_USE_TFDV_LABEL) per_set_stats_output_paths = common.GetValues( outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL) temp_path = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL) input_cache_dir = common.GetSoleValue( inputs, labels.CACHE_INPUT_PATH_LABEL, strict=False) output_cache_dir = common.GetSoleValue( outputs, labels.CACHE_OUTPUT_PATH_LABEL, strict=False) tf.logging.info('Analyze and transform data patterns: %s', list(enumerate(analyze_and_transform_data_paths))) tf.logging.info('Transform data patterns: %s', list(enumerate(transform_only_data_paths))) tf.logging.info('Transform materialization output paths: %s', list(enumerate(materialize_output_paths))) tf.logging.info('Transform output path: %s', transform_output_path) feature_spec = schema_utils.schema_as_feature_spec( _GetSchemaProto(input_dataset_metadata)).feature_spec try: analyze_input_columns = tft.get_analyze_input_columns( preprocessing_fn, feature_spec) transform_input_columns = ( tft.get_transform_input_columns(preprocessing_fn, feature_spec)) except AttributeError: # If using TFT 1.12, fall back to assuming all features are used. analyze_input_columns = feature_spec.keys() transform_input_columns = feature_spec.keys() # Use the same dataset (same columns) for AnalyzeDataset and computing # pre-transform stats so that the data will only be read once for these # two operations. 
if compute_statistics: analyze_input_columns = list( set(list(analyze_input_columns) + list(transform_input_columns))) if input_dataset_metadata.schema is _RAW_EXAMPLE_SCHEMA: analyze_input_dataset_metadata = input_dataset_metadata transform_input_dataset_metadata = input_dataset_metadata else: analyze_input_dataset_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec( {feature: feature_spec[feature] for feature in analyze_input_columns})) transform_input_dataset_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec( {feature: feature_spec[feature] for feature in transform_input_columns})) can_process_jointly = not bool(per_set_stats_output_paths or materialize_output_paths or output_cache_dir) analyze_data_list = self._MakeDatasetList( analyze_and_transform_data_paths, raw_examples_file_format, raw_examples_data_format, analyze_input_dataset_metadata, can_process_jointly) transform_data_list = self._MakeDatasetList( list(analyze_and_transform_data_paths) + list(transform_only_data_paths), raw_examples_file_format, raw_examples_data_format, transform_input_dataset_metadata, can_process_jointly) desired_batch_size = self._GetDesiredBatchSize(raw_examples_data_format) with self._CreatePipeline(outputs) as p: with tft_beam.Context( temp_dir=temp_path, desired_batch_size=desired_batch_size, passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY}, use_deep_copy_optimization=True): # pylint: disable=expression-not-assigned # pylint: disable=no-value-for-parameter _ = ( p | self._IncrementColumnUsageCounter( len(feature_spec.keys()), len(analyze_input_columns), len(transform_input_columns))) (new_analyze_data_dict, input_cache, flat_data_required) = ( p | self._OptimizeRun(input_cache_dir, output_cache_dir, analyze_data_list, feature_spec, preprocessing_fn, self._GetCacheSource())) # Removing unneeded datasets if they won't be needed for statistics or # materialization. if not materialize_output_paths and not compute_statistics: analyze_data_list = [ d for d in new_analyze_data_dict.values() if d is not None ] if len(analyze_data_list) < len(new_analyze_data_dict): tf.logging.info( 'Not reading the following datasets due to cache: %s', [ dataset.file_pattern_suffix for dataset in analyze_data_list if dataset not in new_analyze_data_dict.values() ]) analyze_decode_fn = ( self._GetDecodeFunction(raw_examples_data_format, analyze_input_dataset_metadata.schema)) for (idx, dataset) in enumerate(analyze_data_list): dataset.encoded = ( p | 'ReadAnalysisDataset[{}]'.format(idx) >> self._ReadExamples(dataset)) dataset.decoded = ( dataset.encoded | 'DecodeAnalysisDataset[{}]'.format(idx) >> self._DecodeInputs(analyze_decode_fn)) input_analysis_data = {} for key, dataset in six.iteritems(new_analyze_data_dict): if dataset is None: input_analysis_data[key] = None else: input_analysis_data[key] = dataset.decoded if flat_data_required: flat_input_analysis_data = ( [dataset.decoded for dataset in analyze_data_list] | 'FlattenAnalysisDatasets' >> beam.Flatten(pipeline=p)) else: flat_input_analysis_data = None if input_cache: tf.logging.info('Analyzing data with cache.') transform_fn, cache_output = ( (flat_input_analysis_data, input_analysis_data, input_cache, input_dataset_metadata) | 'AnalyzeDataset' >> tft_beam.AnalyzeDatasetWithCache( preprocessing_fn, pipeline=p)) # Write the raw/input metadata. 
(input_dataset_metadata | 'WriteMetadata' >> tft_beam.WriteMetadata( os.path.join(transform_output_path, tft.TFTransformOutput.RAW_METADATA_DIR), p)) # WriteTransformFn writes transform_fn and metadata to subdirectories # tensorflow_transform.SAVED_MODEL_DIR and # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively. (transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_output_path)) if output_cache_dir is not None and cache_output is not None: # TODO(b/37788560): Possibly make this part of the beam graph. tf.io.gfile.makedirs(output_cache_dir) tf.logging.info('Using existing cache in: %s', input_cache_dir) if input_cache_dir is not None: # Only copy cache that is relevant to this iteration. This is # assuming that this pipeline operates on rolling ranges, so those # cache entries may also be relevant for future iterations. for span_cache_dir in input_analysis_data: full_span_cache_dir = os.path.join(input_cache_dir, span_cache_dir) if tf.io.gfile.isdir(full_span_cache_dir): self._CopyCache(full_span_cache_dir, os.path.join(output_cache_dir, span_cache_dir)) (cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS( p, output_cache_dir, sink=self._GetCacheSink())) if compute_statistics or materialize_output_paths: # Do not compute pre-transform stats if the input format is raw proto, # as StatsGen would treat any input as tf.Example. if (compute_statistics and not self._IsDataFormatProto(raw_examples_data_format)): # Aggregated feature stats before transformation. pre_transform_feature_stats_path = os.path.join( transform_output_path, tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH) schema_proto = _GetSchemaProto(analyze_input_dataset_metadata) ([ dataset.decoded if stats_use_tfdv else dataset.encoded for dataset in analyze_data_list ] | 'FlattenPreTransformAnalysisDatasets' >> beam.Flatten(pipeline=p) | 'GenerateAggregatePreTransformAnalysisStats' >> self._GenerateStats( pre_transform_feature_stats_path, schema_proto, use_deep_copy_optimization=True, use_tfdv=stats_use_tfdv)) transform_decode_fn = ( self._GetDecodeFunction(raw_examples_data_format, transform_input_dataset_metadata.schema)) # transform_data_list is a superset of analyze_data_list, we pay the # cost to read the same dataset (analyze_data_list) again here to # prevent certain beam runner from doing large temp materialization. for (idx, dataset) in enumerate(transform_data_list): dataset.encoded = ( p | 'ReadTransformDataset[{}]'.format(idx) >> self._ReadExamples(dataset)) dataset.decoded = ( dataset.encoded | 'DecodeTransformDataset[{}]'.format(idx) >> self._DecodeInputs(transform_decode_fn)) (dataset.transformed, metadata) = (((dataset.decoded, transform_input_dataset_metadata), transform_fn) | 'TransformDataset[{}]'.format(idx) >> tft_beam.TransformDataset()) if materialize_output_paths or not stats_use_tfdv: dataset.transformed_and_encoded = ( dataset.transformed | 'EncodeTransformedDataset[{}]'.format(idx) >> beam.ParDo( self._EncodeAsExamples(), metadata)) if compute_statistics: # Aggregated feature stats after transformation. _, metadata = transform_fn post_transform_feature_stats_path = os.path.join( transform_output_path, tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH) # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in # schema. Currently input dataset schema only contains dtypes, # and other metadata is dropped due to roundtrip to tensors. 
transformed_schema_proto = _GetSchemaProto(metadata) ([(dataset.transformed if stats_use_tfdv else dataset.transformed_and_encoded) for dataset in transform_data_list] | 'FlattenPostTransformAnalysisDatasets' >> beam.Flatten() | 'GenerateAggregatePostTransformAnalysisStats' >> self._GenerateStats( post_transform_feature_stats_path, transformed_schema_proto, use_tfdv=stats_use_tfdv)) if per_set_stats_output_paths: assert len(transform_data_list) == len(per_set_stats_output_paths) # TODO(b/67632871): Remove duplicate stats gen compute that is # done both on a flattened view of the data, and on each span # below. bundles = zip(transform_data_list, per_set_stats_output_paths) for (idx, (dataset, output_path)) in enumerate(bundles): if stats_use_tfdv: data = dataset.transformed else: data = dataset.transformed_and_encoded (data | 'GeneratePostTransformStats[{}]'.format(idx) >> self._GenerateStats( output_path, transformed_schema_proto, use_tfdv=stats_use_tfdv)) if materialize_output_paths: assert len(transform_data_list) == len(materialize_output_paths) bundles = zip(transform_data_list, materialize_output_paths) for (idx, (dataset, output_path)) in enumerate(bundles): (dataset.transformed_and_encoded | 'Materialize[{}]'.format(idx) >> self._WriteExamples( raw_examples_file_format, output_path)) return _Status.OK()
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema)
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata) |
            ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve Training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz'))
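# Hedged usage sketch (all argument values are illustrative): the same function
# serves both the analyze-and-transform pass over training data and a
# transform-only pass over eval data by pointing transform_dir at the output of
# an earlier run.
transform_data(
    input_handle='mydataset.taxi_train',        # or a path ending in .csv
    outfile_prefix='train_transformed',
    working_dir='/tmp/taxi_working_dir',
    schema_file='/tmp/taxi_schema/schema.pbtxt',
    max_rows=10000,
    pipeline_args=['--runner=DirectRunner'])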
def expand(self, dataset): """Analyze the dataset. Args: dataset: A dataset. Returns: A TransformFn containing the deferred transform function. """ input_values, input_metadata = dataset input_schema = input_metadata.schema input_batches = input_values | 'BatchInstances' >> beam.ParDo( _BatchDoFn()) class _CreateTransformFn(beam.PTransform): """Create a TransformFnDef, binding statistics in a deferred manner. This function constructs a tensorflow graph eagerly and then (in a deferred manner) fills in analyzer outputs with their actual computed values. We construct the tensorflow graph up front because that implies serializing MetaGraphDef protos rather than pickling the user-defined TITO functions. The graph contains placeholders for `_AnalyzerOutput`s which are then replaced with their actual values (as constant tensors) in a deferred manner. Args: input_columns: A map from column names to `Column`s. output_columns: A map from column names to `Column`s. temp_dir: Temp dir to store `SavedModel`s. """ def __init__(self, input_columns, output_columns, temp_dir): # Generally the pipeline is inferred from its inputs, however we need # to know the pipeline for beam.Create. self.pipeline = input_values.pipeline self._input_columns = input_columns self._output_columns = output_columns self._temp_dir = temp_dir def expand(self, analyzer_outputs_to_pcoll): """Converts a dict of statistics to a transform function. Args: analyzer_outputs_to_pcoll: A dictionary mapping `_AnalyzerOutput`s to the values of these statistics as a PCollection. Returns: A single-element PCollection containing the directory name with the SavedModel. """ # Create a transform_fn with unbound values. unbound_transform_fn_dir = os.path.join( self._temp_dir, 'unbound_transform_fn') input_columns_to_statistics = impl_helper.make_transform_fn_def( input_schema, self._input_columns, self._output_columns, unbound_transform_fn_dir) transform_fn = (self.pipeline | 'CreateTransformFn' >> beam.Create([unbound_transform_fn_dir])) if not analyzer_outputs_to_pcoll: return transform_fn # Convert the statistics dict into a DictPCollectionView so it can be # passed as a side input to the beam Map below. tagged_statistics = [] for tag, statistic in input_columns_to_statistics.items(): pcoll = analyzer_outputs_to_pcoll[statistic] tagged_statistics.append( pcoll | 'AddTag[%s]' % tag >> beam.Map(lambda x, tag=tag: (tag, x))) statistics_side_input = beam.pvalue.AsDict( tagged_statistics | 'MergeStatistics' >> beam.Flatten()) # Run a mapper that inserts statistic values into the graph. return (transform_fn | 'ReplaceTensorsWithConstantValues' >> beam.Map( impl_helper.replace_tensors_with_constant_values, bound_saved_model_dir=os.path.join( self._temp_dir, 'transform_fn'), input_value_mapping=statistics_side_input)) inputs, outputs = impl_helper.run_preprocessing_fn( self._preprocessing_fn, input_schema) # Get a list of lists, containing analyzers (i.e. _AnalyzerOutput objects) # by level in the DAG of Columns/Statistics. Analyzers at level n are ready # to run once all analyzers at level n - 1 are complete. analyzers_by_level = self._analyzers_by_level(outputs) # Iterate through levels, keeping track of analyzer outputs (i.e. # statistics) via a mapping of `_AnalyzerOutput` -> single element # PCollection. analyzer_outputs_to_pcoll = {} for level, analyzer_outputs in enumerate(analyzers_by_level): # Create a TransformFnDef representing the graph needed to generate # all the inputs required by the analyzer_outputs at this level. 
We # assign arbitrary names to the outputs of this TransformFnDef. analyzer_input_columns = {} for idx, analyzer_output in enumerate(analyzer_outputs): if len(analyzer_output.inputs) != 1: raise NotImplementedError( 'Analyzers must have exactly one input') analyzer_input_key = 'analyzer_%d_input' % idx analyzer_input_columns[ analyzer_input_key] = analyzer_output.inputs[0] transform_fn = ( analyzer_outputs_to_pcoll | 'CreateTransformFn_%d' % level >> _CreateTransformFn( inputs, analyzer_input_columns, os.path.join(self._output_dir, 'tmp', 'level_%s' % level))) analyzer_input_schema = impl_helper.infer_feature_schema( analyzer_input_columns) # Run the TransformFnDef in a mapper. analysis_inputs = ( input_batches | 'ComputeAnalyzerInputs_%d' % level >> beam.ParDo( _RunMetaGraphDoFn(input_schema, analyzer_input_schema), saved_model_dir=beam.pvalue.AsSingleton(transform_fn))) # For each analyzer output, look up its input values (by tensor name) # and run the analyzer in these values. for idx, analyzer_output in enumerate(analyzer_outputs): analyzer_input_key = 'analyzer_%d_input' % idx analyzer_outputs_to_pcoll[analyzer_output] = ( analysis_inputs | 'Extract_%d_%d' % (level, idx) >> beam.Map( # pylint: disable=cell-var-from-loop # This lint warning is prone to false positives, and it's not # clear why the warning is required here. lambda x, key=analyzer_input_key: [inst[key] for inst in x]) | 'Analyze_%d_%d' % (level, idx) >> self._Analyze(analyzer_output)) output_metadata = dataset_metadata.DatasetMetadata( schema=impl_helper.infer_feature_schema(outputs)) transform_fn = (analyzer_outputs_to_pcoll | 'CreateTransformFn' >> _CreateTransformFn( inputs, outputs, self._output_dir)) return transform_fn, output_metadata
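# The level-by-level scheduling above exists because one analyzer's input can
# depend on another analyzer's output. A small illustration (not from the
# source, and written with the user-facing tft API rather than the internal
# Column/Statistic objects used above): bucketizing an already-scaled column
# means the quantile analyzer at level 1 can only run after the min/max
# analyzers at level 0 have produced their statistics.
import tensorflow_transform as tft


def two_level_preprocessing_fn(inputs):
  x_scaled = tft.scale_to_0_1(inputs['x'])               # level-0 analyzers (min, max)
  x_bucketized = tft.bucketize(x_scaled, num_buckets=4)  # level-1 analyzer (quantiles)
  return {'x_scaled': x_scaled, 'x_bucketized': x_bucketized}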
import unittest

import apache_beam as beam
import tensorflow as tf
from tensorflow.python.framework import test_util
from tensorflow_transform.beam.tft_beam_io import beam_metadata_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io

_TEST_METADATA_COMPLETE = dataset_metadata.DatasetMetadata({
    'fixed_column':
        dataset_schema.ColumnSchema(tf.string, (3,),
                                    dataset_schema.FixedColumnRepresentation()),
    'list_columm':
        dataset_schema.ColumnSchema(
            dataset_schema.IntDomain(tf.int64, min_value=-1, max_value=5),
            (None,), dataset_schema.ListColumnRepresentation()),
})

_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column':
        dataset_schema.ColumnSchema(tf.string, (3,),
                                    dataset_schema.FixedColumnRepresentation()),
    # zeros will be overridden
    'list_columm':
        dataset_schema.ColumnSchema(
            dataset_schema.IntDomain(tf.int64, min_value=0, max_value=0),
            (None,), dataset_schema.ListColumnRepresentation()),
})
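# The metadata_io module imported above is the non-Beam way to persist this
# metadata; a small sketch (directory path assumed) of writing and re-reading
# one of the constants outside a pipeline:
metadata_io.write_metadata(_TEST_METADATA_COMPLETE, '/tmp/test_metadata')
reread_metadata = metadata_io.read_metadata('/tmp/test_metadata')
assert reread_metadata == _TEST_METADATA_COMPLETE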
def toMetadata(self, feature_spec):
  return dataset_metadata.DatasetMetadata(
      schema=sch.from_feature_spec(feature_spec))
def expand(self, dataset): """Analyze the dataset. Args: dataset: A dataset. Returns: A TransformFn containing the deferred transform function. Raises: ValueError: If preprocessing_fn has no outputs. """ (flattened_pcoll, input_values_pcoll_dict, dataset_cache_dict, input_metadata) = dataset input_schema = input_metadata.schema input_values_pcoll_dict = input_values_pcoll_dict or dict() with tf.compat.v1.Graph().as_default() as graph: with tf.compat.v1.name_scope('inputs'): feature_spec = schema_utils.schema_as_feature_spec( input_schema).feature_spec input_signature = impl_helper.feature_spec_as_batched_placeholders( feature_spec) # In order to avoid a bug where import_graph_def fails when the # input_map and return_elements of an imported graph are the same # (b/34288791), we avoid using the placeholder of an input column as an # output of a graph. We do this by applying tf.identity to all inputs of # the preprocessing_fn. Note this applies at the level of raw tensors. # TODO(b/34288791): Remove this workaround and use a shallow copy of # inputs instead. A shallow copy is needed in case # self._preprocessing_fn mutates its input. copied_inputs = impl_helper.copy_tensors(input_signature) output_signature = self._preprocessing_fn(copied_inputs) # At this point we check that the preprocessing_fn has at least one # output. This is because if we allowed the output of preprocessing_fn to # be empty, we wouldn't be able to determine how many instances to # "unbatch" the output into. if not output_signature: raise ValueError( 'The preprocessing function returned an empty dict') if graph.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES): raise ValueError( 'The preprocessing function contained trainable variables ' '{}'.format( graph.get_collection_ref( tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES))) pipeline = self.pipeline or (flattened_pcoll or next( v for v in input_values_pcoll_dict.values() if v is not None)).pipeline # Add a stage that inspects graph collections for API use counts and logs # them as a beam metric. _ = (pipeline | 'InstrumentAPI' >> _InstrumentAPI(graph)) tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get( type(pipeline.runner)) extra_args = beam_common.ConstructBeamPipelineVisitor.ExtraArgs( base_temp_dir=Context.create_base_temp_dir(), tf_config=tf_config, pipeline=pipeline, flat_pcollection=flattened_pcoll, pcollection_dict=input_values_pcoll_dict, graph=graph, input_signature=input_signature, input_schema=input_schema, cache_pcoll_dict=dataset_cache_dict) transform_fn_future, cache_value_nodes = analysis_graph_builder.build( graph, input_signature, output_signature, input_values_pcoll_dict.keys(), cache_dict=dataset_cache_dict) traverser = nodes.Traverser( beam_common.ConstructBeamPipelineVisitor(extra_args)) transform_fn_pcoll = traverser.visit_value_node(transform_fn_future) if cache_value_nodes is not None: output_cache_pcoll_dict = {} for (dataset_key, cache_key), value_node in six.iteritems(cache_value_nodes): if dataset_key not in output_cache_pcoll_dict: output_cache_pcoll_dict[dataset_key] = {} output_cache_pcoll_dict[dataset_key][cache_key] = ( traverser.visit_value_node(value_node)) else: output_cache_pcoll_dict = None # Infer metadata. We take the inferred metadata and apply overrides that # refer to values of tensors in the graph. The override tensors must # be "constant" in that they don't depend on input data. The tensors can # depend on analyzer outputs though. This allows us to set metadata that # depends on analyzer outputs. 
_infer_metadata_from_saved_model will use the # analyzer outputs stored in `transform_fn` to compute the metadata in a # deferred manner, once the analyzer outputs are known. metadata = dataset_metadata.DatasetMetadata( schema=schema_inference.infer_feature_schema( output_signature, graph)) deferred_metadata = (transform_fn_pcoll | 'ComputeDeferredMetadata' >> beam.Map(_infer_metadata_from_saved_model)) full_metadata = beam_metadata_io.BeamDatasetMetadata( metadata, deferred_metadata) _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll) return (transform_fn_pcoll, full_metadata), output_cache_pcoll_dict
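# An illustration (assumed, not from the source) of the trainable-variables
# check enforced in the expand() above: a preprocessing_fn that creates a
# tf.compat.v1 variable would be rejected, because the analysis graph must not
# depend on trained state.
import tensorflow as tf


def bad_preprocessing_fn(inputs):
  # get_variable adds 'scale' to the TRAINABLE_VARIABLES collection of the
  # analysis graph, so expand() raises the ValueError about trainable
  # variables instead of building a transform_fn.
  scale = tf.compat.v1.get_variable('scale', initializer=1.0)
  return {'x_scaled': inputs['x'] * scale}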
def transform_data(input_handle, outfile_prefix, working_dir, schema_file, transform_dir=None, max_rows=None, pipeline_args=None): """The main tf.transform method which analyzes and transforms data. Args: input_handle: BigQuery table name to process specified as DATASET.TABLE or path to csv file with input data. outfile_prefix: Filename prefix for emitted transformed examples working_dir: Directory in which transformed examples and transform function will be emitted. schema_file: An file path that contains a text-serialized TensorFlow metadata schema of the input data. transform_dir: Directory in which the transform output is located. If provided, this will load the transform_fn from disk instead of computing it over the data. Hint: this is useful for transforming eval data. max_rows: Number of rows to query from BigQuery pipeline_args: additional DataflowRunner or DirectRunner args passed to the beam pipeline. """ def transform_ngrams(input, ngram_range): """ helper function to transform ngrams and print output. """ # this print statement causes output to concat itself! # input = tf.Print(input, [input], "raw input:", first_n=-1, summarize=100) transformed = transform.ngrams( tf.string_split(input, delimiter=" "), ngram_range=ngram_range, separator=' ') # SparseTensor basically cannot be printed because it's made up of 3 # tensors. We can use this trick to print the values column, but without the index # it's not too meaningful. # # values = tf.Print(transformed.values, [transformed.values], "ngram output:") # transformed = tf.SparseTensor( # indices=transformed.indices, # values=values, # dense_shape=transformed.dense_shape) return transformed def preprocessing_fn(inputs): """tf.transform's callback function for preprocessing inputs. https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2 Args: inputs: map from feature keys to raw not-yet-transformed features. Returns: Map from string feature key to transformed feature operations. """ outputs = {} for key in taxi.DENSE_FLOAT_FEATURE_KEYS: print('processing key', key) print('input:', inputs[key]) # Preserve this feature as a dense float, setting nan's to the mean. outputs[taxi.transformed_name(key)] = transform.scale_to_z_score( _fill_in_missing(inputs[key])) for key in taxi.VOCAB_FEATURE_KEYS: # Build a vocabulary for this feature. outputs[ taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary( _fill_in_missing(inputs[key]), top_k=taxi.VOCAB_SIZE, num_oov_buckets=taxi.OOV_SIZE) # for key in taxi.FEATURE_NGRAM: # # Extract nggrams and build a vocab. # outputs[ # taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary( # transform.ngrams( # tf.string_split(_fill_in_missing(inputs[key])), # ngram_range=taxi.NGRAM_RANGE, # separator=' '), # top_k=512, # num_oov_buckets=taxi.OOV_SIZE) for key in taxi.FEATURE_NGRAM: # Extract nggrams and build a vocab. outputs[ taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary( transform_ngrams(_fill_in_missing(inputs[key]), taxi.NGRAM_RANGE), top_k=taxi.VOCAB_SIZE, num_oov_buckets=taxi.OOV_SIZE) for key in taxi.BUCKET_FEATURE_KEYS: outputs[taxi.transformed_name(key)] = transform.bucketize( _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT) for key in taxi.CATEGORICAL_FEATURE_KEYS: outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key]) # Was this passenger a big tipper? 
taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY]) tips = _fill_in_missing(inputs[taxi.LABEL_KEY]) outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where( tf.is_nan(taxi_fare), tf.cast(tf.zeros_like(taxi_fare), tf.int64), # Test if the tip was > 20% of the fare. tf.cast( tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64)) return outputs schema = taxi.read_schema(schema_file) raw_feature_spec = taxi.get_raw_feature_spec(schema) raw_schema = dataset_schema.from_feature_spec(raw_feature_spec) raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema) with beam.Pipeline(argv=pipeline_args) as pipeline: with tft_beam.Context(temp_dir=working_dir): if input_handle.lower().endswith('csv'): csv_coder = taxi.make_csv_coder(schema, input_handle.lower()) raw_data = ( pipeline | 'ReadFromText' >> beam.io.ReadFromText( input_handle, skip_header_lines=1)) decode_transform = beam.Map(csv_coder.decode) else: query = taxi.make_sql(input_handle, max_rows, for_eval=False) raw_data = ( pipeline | 'ReadBigQuery' >> beam.io.Read( beam.io.BigQuerySource(query=query, use_standard_sql=True))) decode_transform = beam.Map( taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec) if transform_dir is None: decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform transform_fn = ( (decoded_data, raw_data_metadata) | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn))) _ = ( transform_fn | ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))) else: transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir) # Shuffling the data before materialization will improve Training # effectiveness downstream. Here we shuffle the raw_data (as opposed to # decoded data) since it has a compact representation. shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle() decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform (transformed_data, transformed_metadata) = ( ((decoded_data, raw_data_metadata), transform_fn) | 'Transform' >> tft_beam.TransformDataset()) coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema) _ = ( transformed_data | 'SerializeExamples' >> beam.Map(coder.encode) | 'WriteExamples' >> beam.io.WriteToTFRecord( os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz') )
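# How the output of transform_data is consumed downstream is not shown in this
# excerpt. A sketch (function name, paths and batch size are assumptions) of
# reading the gzipped TFRecords back for training, using the transform output
# that WriteTransformFn placed under working_dir:
import tensorflow as tf
import tensorflow_transform as tft


def make_training_dataset(working_dir, file_pattern, batch_size=256):
  tft_output = tft.TFTransformOutput(working_dir)
  feature_spec = tft_output.transformed_feature_spec()
  files = tf.io.gfile.glob(file_pattern)
  dataset = tf.data.TFRecordDataset(files, compression_type='GZIP')
  return (dataset
          .batch(batch_size)
          .map(lambda records: tf.io.parse_example(records, feature_spec)))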
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir, frequency_threshold): """Run pre-processing step as a pipeline. Args: pipeline: beam pipeline training_data: the name of the table to train on. eval_data: the name of the table to evaluate on. predict_data: the name of the table to predict on. output_dir: file path to where to write all the output files. frequency_threshold: frequency threshold to use for categorical values. """ # 1) The schema can be either defined in-memory or read from a configuration # file, in this case we are creating the schema in-memory. input_schema = reddit.make_input_schema() # 2) Read from BigQuery or from CSV. train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data) evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data) input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema) _ = (input_metadata | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata( os.path.join(output_dir, path_constants.RAW_METADATA_DIR), pipeline=pipeline)) preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold) transform_fn = ((train_data, input_metadata) | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn)) # WriteTransformFn writes transform_fn and metadata to fixed subdirectories # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and # path_constants.TRANSFORMED_METADATA_DIR. _ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir)) @beam.ptransform_fn def TransformAndWrite(pcoll, path): # pylint: disable=invalid-name pcoll |= 'Shuffle' >> _Shuffle() # pylint: disable=no-value-for-parameter (dataset, metadata) = (((pcoll, input_metadata), transform_fn) | 'Transform' >> tft.TransformDataset()) coder = coders.ExampleProtoCoder(metadata.schema) _ = (dataset | 'SerializeExamples' >> beam.Map(coder.encode) | 'WriteExamples' >> beam.io.WriteToTFRecord( os.path.join(output_dir, path), file_name_suffix='.tfrecord.gz')) _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite( # pylint: disable=no-value-for-parameter path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX) _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite( # pylint: disable=no-value-for-parameter path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX) # TODO(b/35300113) Remember to eventually also save the statistics. if predict_data: predict_mode = tf.contrib.learn.ModeKeys.INFER predict_schema = reddit.make_input_schema(mode=predict_mode) predict_coder = coders.ExampleProtoCoder(predict_schema) serialized_examples = ( pipeline | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode) # TODO(b/35194257) Obviate the need for this explicit # serialization. | 'EncodePredictData' >> beam.Map(predict_coder.encode)) _ = (serialized_examples | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord( os.path.join( output_dir, path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX), file_name_suffix='.tfrecord.gz')) _ = (serialized_examples | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json) | 'WritePredictDataAsText' >> beam.io.WriteToText( os.path.join( output_dir, path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX), file_name_suffix='.txt'))
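# The _Shuffle transform used inside TransformAndWrite is not defined in this
# excerpt. A common minimal implementation (a sketch, not necessarily the
# original) pairs each element with a random key, groups, and re-emits the
# values to break any ordering from the source:
import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by grouping elements on random keys."""
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda key_values: key_values[1]))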
    self._fn = fn

  def expand(self, pcoll):
    return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))


RAW_DATA_FEATURE_SPEC = dict(
    [(name, tf.io.FixedLenFeature([], tf.string))
     for name in CATEGORICAL_FEATURE_KEYS] +
    [(name, tf.io.FixedLenFeature([], tf.float32))
     for name in NUMERIC_FEATURE_KEYS] +
    [(name, tf.io.VarLenFeature(tf.float32))
     for name in OPTIONAL_NUMERIC_FEATURE_KEYS] +
    [(LABEL_KEY, tf.io.FixedLenFeature([], tf.string))])

RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(RAW_DATA_FEATURE_SPEC))

# Constants used for training. Note that the number of instances will be
# computed by tf.Transform in future versions, in which case it can be read
# from the metadata. Similarly BUCKET_SIZES will not be needed as this
# information will be stored in the metadata for each of the columns. The
# bucket size includes all listed categories in the dataset description as
# well as one extra for "?" which represents unknown.
TRAIN_BATCH_SIZE = 128
TRAIN_NUM_EPOCHS = 200
NUM_TRAIN_INSTANCES = 32561
NUM_TEST_INSTANCES = 16281

# Names of temp files
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
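# A sketch of the kind of preprocessing_fn these constants typically feed
# (simplified and assumed, not the original census code): dense numeric
# columns are scaled to [0, 1], categorical columns and the string label get
# vocabularies, and the VarLen numeric columns are left out for brevity since
# they would first need densifying.
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  outputs = inputs.copy()
  for key in NUMERIC_FEATURE_KEYS:
    outputs[key] = tft.scale_to_0_1(outputs[key])
  for key in CATEGORICAL_FEATURE_KEYS:
    outputs[key] = tft.compute_and_apply_vocabulary(outputs[key])
  outputs[LABEL_KEY] = tft.compute_and_apply_vocabulary(outputs[LABEL_KEY])
  return outputs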
def test_single_phase_mixed_analyzer_run_once(self): span_0_key = 'span-0' span_1_key = 'span-1' def preprocessing_fn(inputs): integerized_s = tft.compute_and_apply_vocabulary(inputs['s']) _ = tft.bucketize(inputs['x'], 2, name='bucketize') return { 'integerized_s': integerized_s, 'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']), 'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']), 'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']), 'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']), } # Run AnalyzeAndTransform on some input data and compare with expected # output. input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}] input_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec({ 'x': tf.io.FixedLenFeature([], tf.float32), 'y': tf.io.FixedLenFeature([], tf.float32), 's': tf.io.FixedLenFeature([], tf.string), })) input_data_dict = { span_0_key: [{ 'x': -2, 'y': 1, 's': 'b', }, { 'x': 4, 'y': -4, 's': 'b', }], span_1_key: input_data, } with beam_impl.Context(temp_dir=self.get_temp_dir()): with beam.Pipeline() as p: flat_data = p | 'CreateInputData' >> beam.Create( list(itertools.chain(*input_data_dict.values()))) # TODO(b/37788560): Get these names programmatically. cache_dict = { span_0_key: { '__v0__CacheableCombineAccumulate--x_1-mean_and_var--': p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0]']), '__v0__CacheableCombineAccumulate--x-x--': p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']), '__v0__CacheableCombineAccumulate--y_1-mean_and_var--': p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25]']), '__v0__CacheableCombineAccumulate--y-y--': p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']), }, span_1_key: {}, } transform_fn, cache_output = ( (flat_data, input_data_dict, cache_dict, input_metadata) | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))) _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS( self._cache_dir) transformed_dataset = ( ((input_data_dict[span_1_key], input_metadata), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) dot_string = nodes.get_dot_graph( [analysis_graph_builder._ANALYSIS_GRAPH]).to_string() self.WriteRenderedDotFile(dot_string) transformed_data, unused_transformed_metadata = transformed_dataset expected_transformed = [ { 'x_mean': 6.0, 'x_min': -2.0, 'y_mean': -0.25, 'y_min': -4.0, 'integerized_s': 0, }, { 'x_mean': 6.0, 'x_min': -2.0, 'y_mean': -0.25, 'y_min': -4.0, 'integerized_s': 0, }, ] beam_test_util.assert_that( transformed_data, beam_test_util.equal_to(expected_transformed)) transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn') _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
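# In this test the cache_dict is hand-built with internal accumulator key
# names. In a real incremental run, the cache written above by
# WriteAnalysisCacheToFS would normally be read back instead; a sketch, with
# the constructor arguments assumed to match this tensorflow_transform
# version, inside the same pipeline `p`:
cache_dict = p | 'ReadCache' >> analyzer_cache.ReadAnalysisCacheFromFS(
    self._cache_dir, [span_0_key, span_1_key])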
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir, frequency_threshold): """Run pre-processing step as a pipeline. Args: pipeline: beam pipeline training_data: file paths to input csv files. eval_data: file paths to input csv files. predict_data: file paths to input csv files. output_dir: file path to where to write all the output files. frequency_threshold: frequency threshold to use for categorical values. """ # 1) The schema can be either defined in-memory or read from a configuration # file, in this case we are creating the schema in-memory. input_schema = criteo.make_input_schema() # 2) Configure the coder to map the source file column names to a dictionary # of key -> tensor_proto with the appropiate type derived from the # input_schema. coder = criteo.make_tsv_coder(input_schema) # 3) Read from text using the coder. train_data = (pipeline | 'ReadTrainingData' >> beam.io.ReadFromText(training_data) | 'ParseTrainingCsv' >> beam.Map(coder.decode)) evaluate_data = (pipeline | 'ReadEvalData' >> beam.io.ReadFromText(eval_data) | 'ParseEvalCsv' >> beam.Map(coder.decode)) input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema) _ = (input_metadata | 'WriteInputMetadata' >> io.WriteMetadata(os.path.join( output_dir, path_constants.RAW_METADATA_DIR), pipeline=pipeline)) # TODO(b/33688220) should the transform functions take shuffle as an optional # argument? # TODO(b/33688275) Should the transform functions have more user friendly # names? work_dir = os.path.join(output_dir, path_constants.TEMP_DIR) preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold) (train_dataset, train_metadata), transform_fn = ( (train_data, input_metadata) | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset( preprocessing_fn, work_dir)) # WriteTransformFn writes transform_fn and metadata to fixed subdirectories # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and # path_constants.TRANSFORMED_METADATA_DIR. _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir)) # TODO(b/34231369) Remember to eventually also save the statistics. (evaluate_dataset, evaluate_metadata) = (((evaluate_data, input_metadata), transform_fn) | 'TransformEval' >> tft.TransformDataset()) train_coder = coders.ExampleProtoCoder(train_metadata.schema) _ = (train_dataset | 'SerializeTrainExamples' >> beam.Map(train_coder.encode) | 'WriteTraining' >> beam.io.WriteToTFRecord(os.path.join( output_dir, path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX), file_name_suffix='.tfrecord.gz')) evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema) _ = (evaluate_dataset | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode) | 'WriteEval' >> beam.io.WriteToTFRecord(os.path.join( output_dir, path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX), file_name_suffix='.tfrecord.gz')) if predict_data: predict_mode = tf.contrib.learn.ModeKeys.INFER predict_schema = criteo.make_input_schema(mode=predict_mode) tsv_coder = criteo.make_tsv_coder(predict_schema, mode=predict_mode) predict_coder = coders.ExampleProtoCoder(predict_schema) serialized_examples = ( pipeline | 'ReadPredictData' >> beam.io.ReadFromText(predict_data) | 'ParsePredictCsv' >> beam.Map(tsv_coder.decode) # TODO(b/35194257) Obviate the need for this explicit serialization. 
| 'EncodePredictData' >> beam.Map(predict_coder.encode)) _ = (serialized_examples | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord( os.path.join( output_dir, path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX), file_name_suffix='.tfrecord.gz')) _ = (serialized_examples | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json) | 'WritePredictDataAsText' >> beam.io.WriteToText( os.path.join( output_dir, path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX), file_name_suffix='.txt'))
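# _encode_as_b64_json is referenced above but not defined in this excerpt. A
# sketch of what such a helper typically looks like (an assumption): it wraps
# each serialized tf.Example in the {'b64': ...} JSON envelope expected by the
# prediction service.
import base64
import json


def _encode_as_b64_json(serialized_example):
  return json.dumps(
      {'b64': base64.b64encode(serialized_example).decode('utf-8')})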
def test_caching_vocab_for_integer_categorical(self): span_0_key = 'span-0' span_1_key = 'span-1' def preprocessing_fn(inputs): return { 'x_vocab': tft.compute_and_apply_vocabulary(inputs['x'], frequency_threshold=2) } input_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec({ 'x': tf.FixedLenFeature([], tf.int64), })) input_data_dict = { span_0_key: [{ 'x': -2, }, { 'x': -4, }, { 'x': -1, }, { 'x': 4, }], span_1_key: [{ 'x': -2, }, { 'x': -1, }, { 'x': 6, }, { 'x': 7, }], } expected_transformed_data = [{ 'x_vocab': 0, }, { 'x_vocab': 1, }, { 'x_vocab': -1, }, { 'x_vocab': -1, }] with beam_impl.Context(temp_dir=self.get_temp_dir()): with beam.Pipeline() as p: flat_data = p | 'CreateInputData' >> beam.Create( list(itertools.chain(*input_data_dict.values()))) # TODO(b/37788560): Get these names programmatically. cache_dict = { span_0_key: { '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--': p | 'CreateB' >> beam.Create( [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']), }, span_1_key: {}, } transform_fn, cache_output = ( (flat_data, input_data_dict, cache_dict, input_metadata) | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))) _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS( self._cache_dir) transformed_dataset = ( ((input_data_dict[span_1_key], input_metadata), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) transformed_data, _ = transformed_dataset beam_test_util.assert_that( transformed_data, beam_test_util.equal_to(expected_transformed_data), label='first')
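# Why the expected output above contains -1 (a worked tally derived from the
# test data, not stated in the source): merging the span-0 cache counts with
# the span-1 values gives
#   -2 -> 3, -1 -> 2, -4 -> 1, 4 -> 1, 6 -> 1, 7 -> 1
# With frequency_threshold=2 only -2 and -1 enter the vocabulary (indices 0
# and 1); 6 and 7 fall outside it, and with no OOV buckets configured,
# compute_and_apply_vocabulary maps them to the default value -1.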
def preprocess(p, output_dir, check_path, data_size, bq_table, split_data_path, project_id): """Main processing pipeline reading, processing and storing processed data. Performs the following operations: - reads data from BigQuery - adds hash key value to each row - scales data - shuffles and splits data in train / validation / test sets - oversamples train data - stores data as TFRecord - splits and stores test data into labels and features files Args: p: PCollection, initial pipeline. output_dir: string, path to directory to store output. check_path: string, path to directory to store data checks. data_size: tuple of float, ratio of data going respectively to train, validation and test sets. bq_table: string, name of table to read data from. split_data_path: string, path to directory to store train, validation and test raw datasets. project_id: string, GCP project id. Raises: ValueError: No test dataset found in pipeline output. """ train_size, validation_size, test_size = data_size data = (p | 'ReadData' >> read_data(bq_table=bq_table, project_id=project_id)) _ = data | 'StoreData' >> beam.io.WriteToText( posixpath.join(output_dir, check_path, 'processed_data.txt')) split_data = ( data | 'RandomlySplitData' >> randomly_split(train_size=train_size, validation_size=validation_size, test_size=test_size)) for k in split_data: split_data[k] |= 'AddHash_{}'.format(k.name) >> beam.ParDo( AddHash(), label_column=constants.LABEL_COLUMN, key_column=constants.KEY_COLUMN, dtype=k) # Splits test data into features pipeline and labels pipeline. if DatasetType.TEST not in split_data: raise ValueError('No test dataset found in pipeline output.') test_data = (split_data.pop(DatasetType.TEST) | 'SplitFeaturesLabels' >> split_features_labels( constants.LABEL_COLUMN, constants.KEY_COLUMN)) # Stores test data features and labels pipeline separately. for k in test_data: _ = (test_data[k] | 'ParseJsonToString_{}'.format(k) >> beam.Map(json.dumps) | 'StoreSplitData_{}'.format(k) >> beam.io.WriteToText( posixpath.join( output_dir, split_data_path, 'split_data_{}_{}.txt'.format(DatasetType.TEST.name, k)))) meta_data = dataset_metadata.DatasetMetadata(make_input_schema()) transform_fn = ( (split_data[DatasetType.TRAIN], meta_data) | 'AnalyzeTrainDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn)) _ = (transform_fn | 'WriteTransformFn' >> tft.beam.tft_beam_io.WriteTransformFn( posixpath.join(output_dir, constants.PATH_INPUT_TRANSFORMATION))) _ = (meta_data | 'WriteInputMetadata' >> tft.beam.tft_beam_io.WriteMetadata( posixpath.join(output_dir, constants.PATH_INPUT_SCHEMA), pipeline=p)) transformed_metadata, transformed_data = {}, {} for k in [DatasetType.TRAIN, DatasetType.VAL]: transformed_data[k], transformed_metadata[k] = ( ((split_data[k], meta_data), transform_fn) | 'Transform{}'.format(k) >> beam_impl.TransformDataset()) transformed_data[DatasetType.TRAIN] = ( transformed_data[DatasetType.TRAIN] | 'OverSampleTraining' >> oversampling()) for k in transformed_data: _ = (transformed_data[k] | 'ShuffleData{}'.format(k) >> shuffle_data() | 'StoreData{}'.format(k) >> store_transformed_data( schema=transformed_metadata[k], path=posixpath.join(output_dir, constants.PATH_TRANSFORMED_DATA_SPLIT[k]), name=DatasetType(k).name)) for k in transformed_data: _ = (transformed_data[k] | 'CheckSize{}'.format(k.name) >> check_size( name=DatasetType(k).name, path=posixpath.join(output_dir, check_path, k.name)))
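# Several helpers used above (read_data, randomly_split, oversampling,
# shuffle_data, store_transformed_data, check_size) are defined elsewhere. As
# one concrete illustration, a minimal sketch of a randomly_split built on
# beam.Partition (names, the DatasetType enum and the exact split semantics
# are assumptions):
import random

import apache_beam as beam


@beam.ptransform_fn
def randomly_split(pcoll, train_size, validation_size, test_size):
  """Randomly routes each element to the train, validation or test split."""
  if abs(train_size + validation_size + test_size - 1.0) > 1e-6:
    raise ValueError('Split sizes must sum to 1.')

  def _partition_fn(unused_element, unused_num_partitions):
    draw = random.random()
    if draw < train_size:
      return 0
    if draw < train_size + validation_size:
      return 1
    return 2

  train, validation, test = (
      pcoll | 'PartitionBySize' >> beam.Partition(_partition_fn, 3))
  return {
      DatasetType.TRAIN: train,
      DatasetType.VAL: validation,
      DatasetType.TEST: test,
  }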