def write_metadata(futures_dict, non_deferred_metadata, destination):
  unresolved_futures = non_deferred_metadata.substitute_futures(futures_dict)
  if unresolved_futures:
    raise ValueError('Some futures were unresolved: %r' % unresolved_futures)
  metadata_io.write_metadata(non_deferred_metadata, destination)

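# Hedged usage sketch for write_metadata above, assuming the legacy
# tf.Transform metadata API in which a DatasetMetadata can carry unresolved
# "futures". `deferred_metadata`, the future name, and the path below are all
# hypothetical placeholders, not values from the original source.
futures_dict = {'vocab_size': 1000}  # resolved values keyed by future name
write_metadata(futures_dict, deferred_metadata, '/tmp/transformed_metadata')
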
def analyze_in_place(preprocessing_fn, force_tf_compat_v1, feature_specs,
                     type_specs, transform_output_path):
  """Analyzes the `preprocessing_fn` in-place without looking at the data.

  This should only be used if the `preprocessing_fn` contains no TFT
  analyzers or TFT mappers that use analyzers.

  Writes out a transform function and transformed metadata to subdirs under
  `transform_output_path`.

  Args:
    preprocessing_fn: The tf.Transform preprocessing_fn.
    force_tf_compat_v1: If True, call Transform's API to use Tensorflow in
      tf.compat.v1 mode.
    feature_specs: a Dict from input feature key to its feature spec.
    type_specs: a Dict from input feature key to its type spec.
    transform_output_path: An absolute path to write the output to.

  Raises:
    RuntimeError: If `preprocessing_fn` contains TFT analyzers.
  """
  use_tf_compat_v1 = tf2_utils.use_tf_compat_v1(force_tf_compat_v1)
  transform_fn_path = os.path.join(transform_output_path,
                                   TFTransformOutput.TRANSFORM_FN_DIR)
  if use_tf_compat_v1:
    graph, structured_inputs, structured_outputs = (
        trace_preprocessing_function(
            preprocessing_fn, feature_specs, use_tf_compat_v1=use_tf_compat_v1))
    _assert_no_analyzers_in_graph(graph)
    with tf.compat.v1.Session(graph=graph) as sess:
      sess.run(tf.compat.v1.global_variables_initializer())
      sess.run(tf.compat.v1.tables_initializer())
      saved_transform_io.write_saved_transform_from_session(
          sess, structured_inputs, structured_outputs, transform_fn_path)
      transformed_metadata = dataset_metadata.DatasetMetadata(
          schema=schema_inference.infer_feature_schema(structured_outputs,
                                                       graph, sess))
  else:
    concrete_transform_fn = _trace_and_write_transform_fn(
        saved_model_dir=transform_fn_path,
        preprocessing_fn=preprocessing_fn,
        input_signature=type_specs,
        base_temp_dir=None,
        tensor_replacement_map=None,
        output_keys_to_name_map=None)
    _assert_no_analyzers_in_graph(concrete_transform_fn.graph)
    structured_inputs = tf2_utils.get_structured_inputs_from_func_graph(
        concrete_transform_fn.graph)
    transformed_metadata = _trace_and_get_metadata(
        concrete_transform_fn=concrete_transform_fn,
        structured_inputs=structured_inputs,
        preprocessing_fn=preprocessing_fn,
        base_temp_dir=None,
        tensor_replacement_map=None)
  transformed_metadata_dir = os.path.join(
      transform_output_path, TFTransformOutput.TRANSFORMED_METADATA_DIR)
  metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)

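# Minimal usage sketch for analyze_in_place above. The feature name and the
# output path are hypothetical; the preprocessing_fn must be analyzer-free,
# i.e. contain only instance-level TF ops, or a RuntimeError is raised.
def _add_one_preprocessing_fn(inputs):
  # Plain TF op only; no tft.* analyzers.
  return {'x_plus_one': inputs['x'] + 1}

analyze_in_place(
    _add_one_preprocessing_fn,
    force_tf_compat_v1=False,
    feature_specs={'x': tf.io.FixedLenFeature([], tf.float32)},
    type_specs={'x': tf.TensorSpec([None], tf.float32)},
    transform_output_path='/tmp/transform_output')
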
def trace_and_write_v2_saved_model(saved_model_dir, preprocessing_fn,
                                   input_signature, base_temp_dir,
                                   tensor_replacement_map,
                                   output_keys_to_name_map):
  """Writes out a SavedModelV2 with preprocessing_fn traced using tf.function.

  The SavedModel written contains a method called `transform_fn` that
  represents the traced `preprocessing_fn`. Additionally, if this is the final
  SavedModel being written out, it will contain a method called `metadata_fn`
  that provides deferred schema annotations.

  Args:
    saved_model_dir: Path to write SavedModel to.
    preprocessing_fn: A user defined python function to be traced.
    input_signature: TypeSpecs describing the inputs to the `preprocessing_fn`.
    base_temp_dir: Base path to write temporary artifacts to.
    tensor_replacement_map: A map from placeholder tensor names to their
      evaluated replacement tensors.
    output_keys_to_name_map: A map from output dictionary keys to the names of
      the tensors that they represent.

  Returns:
    A tuple containing a pair of `tf.ConcreteFunction`s:
      1. The traced preprocessing_fn.
      2. A metadata_fn that returns a dictionary containing the deferred
         annotations added to the graph when invoked with any valid input.
  """
  transform_fn = get_traced_transform_fn(
      preprocessing_fn,
      input_signature,
      base_temp_dir,
      tensor_replacement_map=tensor_replacement_map,
      output_keys_to_name_map=output_keys_to_name_map)
  concrete_transform_fn = _write_v2_saved_model(transform_fn, 'transform_fn',
                                                saved_model_dir)

  concrete_metadata_fn = None
  # If the `TENSOR_REPLACEMENTS` graph collection is empty, all TFT analyzers
  # in the `preprocessing_fn` have already been evaluated.
  if not concrete_transform_fn.graph.get_collection(
      analyzer_nodes.TENSOR_REPLACEMENTS):
    metadata_fn = schema_inference.get_traced_metadata_fn(
        tensor_replacement_map,
        preprocessing_fn,
        input_signature,
        base_temp_dir,
        evaluate_schema_overrides=True)
    concrete_metadata_fn = metadata_fn.get_concrete_function()
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema_v2(
            concrete_transform_fn.structured_outputs,
            concrete_metadata_fn,
            evaluate_schema_overrides=True))
    metadata_io.write_metadata(
        metadata, os.path.join(saved_model_dir, METADATA_DIR_NAME))

  return concrete_transform_fn, concrete_metadata_fn

def make_spec(output_dir, batch_size=None):
  fixed_shape = [batch_size, 1] if batch_size is not None else []
  spec = {}
  spec[LABEL_COLUMN] = tf.FixedLenFeature(
      shape=fixed_shape, dtype=tf.int64, default_value=None)
  spec[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(
      shape=fixed_shape, dtype=tf.int64, default_value=None)
  spec[IS_LEAK_COLUMN] = tf.FixedLenFeature(
      shape=fixed_shape, dtype=tf.int64, default_value=None)
  spec[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.FixedLenFeature(
      shape=fixed_shape, dtype=tf.int64, default_value=None)

  for name in BOOL_COLUMNS:
    spec[name] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.int64, default_value=None)

  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
    spec[name] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.float32, default_value=None)

  for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
    spec[name + '_binned'] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.int64, default_value=None)

  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
    spec[name + '_binned'] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.int64, default_value=None)
    spec[name + '_log_01scaled'] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.float32, default_value=None)

  for name in INT_COLUMNS:
    spec[name + '_log_int'] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.int64, default_value=None)
    spec[name + '_log_01scaled'] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.float32, default_value=None)

  for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
    spec[name] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.int64, default_value=None)

  for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
    # spec[multi_category] = tf.VarLenFeature(dtype=tf.int64)
    shape = fixed_shape[:-1] + [
        len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])
    ]
    spec[multi_category] = tf.FixedLenFeature(shape=shape, dtype=tf.int64)

  metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(spec))
  metadata_io.write_metadata(metadata, output_dir)

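# Hedged usage sketch for make_spec above: write the pre-batched metadata,
# then read it back to recover a feature spec for parsing. The output path
# and batch size are hypothetical; the column-name constants come from the
# surrounding module.
make_spec('/tmp/outbrain_metadata', batch_size=4096)
reloaded = metadata_io.read_metadata('/tmp/outbrain_metadata')
feature_spec = reloaded.schema.as_feature_spec()
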
def trace_and_write_v2_saved_model(
    saved_model_dir: str,
    preprocessing_fn: Callable[[Mapping[str, common_types.TensorType]],
                               Mapping[str, common_types.TensorType]],
    input_signature: Mapping[str, tf.TypeSpec], base_temp_dir: Optional[str],
    baseline_analyzers_fingerprint: Mapping[str,
                                            graph_tools.AnalyzersFingerprint],
    tensor_replacement_map: Optional[Dict[str, tf.Tensor]],
    output_keys_to_name_map: Optional[Dict[str, str]]):
  """Writes out a SavedModelV2 with preprocessing_fn traced using tf.function.

  The SavedModel written contains a method called `transform_fn` that
  represents the traced `preprocessing_fn`. Additionally, if this is the final
  SavedModel being written out, it will contain a method called `metadata_fn`
  that provides deferred schema annotations.

  Args:
    saved_model_dir: Path to write SavedModel to.
    preprocessing_fn: A user defined python function to be traced.
    input_signature: TypeSpecs describing the inputs to the `preprocessing_fn`.
    base_temp_dir: Base path to write temporary artifacts to.
    baseline_analyzers_fingerprint: A mapping from analyzer name to a set of
      paths that define its fingerprint.
    tensor_replacement_map: A map from placeholder tensor names to their
      evaluated replacement tensors.
    output_keys_to_name_map: A map from output dictionary keys to the names of
      the tensors that they represent.

  Returns:
    A tuple containing a pair of `tf.ConcreteFunction`s:
      1. The traced preprocessing_fn.
      2. A metadata_fn that returns a dictionary containing the deferred
         annotations added to the graph when invoked with any valid input.

  Raises:
    RuntimeError: If analyzers in `preprocessing_fn` are encountered in a
      non-deterministic order.
  """
  concrete_transform_fn = _trace_and_write_transform_fn(
      saved_model_dir, preprocessing_fn, input_signature, base_temp_dir,
      tensor_replacement_map, output_keys_to_name_map)
  structured_inputs = tf2_utils.get_structured_inputs_from_func_graph(
      concrete_transform_fn.graph)
  _validate_analyzers_fingerprint(baseline_analyzers_fingerprint,
                                  concrete_transform_fn.graph,
                                  structured_inputs)

  # If the `TENSOR_REPLACEMENTS` graph collection is empty, all TFT analyzers
  # in the `preprocessing_fn` have already been evaluated.
  if not concrete_transform_fn.graph.get_collection(
      analyzer_nodes.TENSOR_REPLACEMENTS):
    metadata = _trace_and_get_metadata(concrete_transform_fn,
                                       structured_inputs, preprocessing_fn,
                                       base_temp_dir, tensor_replacement_map)
    metadata_io.write_metadata(
        metadata, os.path.join(saved_model_dir, METADATA_DIR_NAME))

def _RunInPlaceImpl(self, preprocessing_fn: Any,
                    metadata: dataset_metadata.DatasetMetadata,
                    transform_output_path: Text) -> _Status:
  """Runs a transformation iteration in-place without looking at the data.

  Args:
    preprocessing_fn: The tf.Transform preprocessing_fn.
    metadata: A DatasetMetadata object for the input data.
    transform_output_path: An absolute path to write the output to.

  Returns:
    Status of the execution.
  """
  tf.logging.info('Processing an in-place transform')
  raw_metadata_dir = os.path.join(transform_output_path,
                                  tft.TFTransformOutput.RAW_METADATA_DIR)
  metadata_io.write_metadata(metadata, raw_metadata_dir)
  with tf.Graph().as_default() as graph:
    with tf.Session(graph=graph) as sess:
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          schema_utils.schema_as_feature_spec(
              _GetSchemaProto(metadata)).feature_spec)

      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead. A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = preprocessing_fn(copied_inputs)
      sess.run(tf.global_variables_initializer())
      sess.run(tf.tables_initializer())
      transform_fn_path = os.path.join(transform_output_path,
                                       tft.TFTransformOutput.TRANSFORM_FN_DIR)
      saved_transform_io.write_saved_transform_from_session(
          sess, input_signature, output_signature, transform_fn_path)

      transformed_metadata = dataset_metadata.DatasetMetadata(
          schema=tft.schema_inference.infer_feature_schema(
              output_signature, graph, sess))
  transformed_metadata_dir = os.path.join(
      transform_output_path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
  metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)
  return _Status.OK()

def test_write_and_read(self):
  basedir = tempfile.mkdtemp()
  original = dataset_metadata.DatasetMetadata(
      schema=test_common.get_test_schema())

  metadata_io.write_metadata(original, basedir, versions=_test_versions)
  reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

  generated_feature_spec = reloaded.schema.as_feature_spec()
  self.assertEqual(test_common.test_feature_spec, generated_feature_spec)

def test_write_and_read(self):
  # TODO(b/123241798): use TEST_TMPDIR
  basedir = tempfile.mkdtemp()
  original = dataset_metadata.DatasetMetadata(
      schema=test_common.get_test_schema())

  metadata_io.write_metadata(original, basedir)
  reloaded = metadata_io.read_metadata(basedir)

  self.assertEqual(original, reloaded)

def test_write_and_read(self):
  # TODO(b/123241798): use TEST_TMPDIR
  basedir = tempfile.mkdtemp()
  original = dataset_metadata.DatasetMetadata(
      schema=test_common.get_test_schema())

  metadata_io.write_metadata(original, basedir)
  reloaded = metadata_io.read_metadata(basedir)

  generated_feature_spec = reloaded.schema.as_feature_spec()
  self.assertEqual(test_common.test_feature_spec, generated_feature_spec)

def create_metadata(df, prebatch_size, output_path):
  fixed_shape = [prebatch_size, 1]
  spec = {}
  for column in df:
    if column in CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]:
      spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(
          shape=fixed_shape, dtype=tf.int64, default_value=None)
    else:
      spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(
          shape=fixed_shape, dtype=tf.float32, default_value=None)
  metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(spec))
  metadata_io.write_metadata(metadata, output_path)

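# Hedged usage sketch for create_metadata above, assuming a pandas DataFrame
# whose columns match the module's CATEGORICAL_COLUMNS / DISPLAY_ID_COLUMN
# constants. The DataFrame contents, the float column name, and the output
# path below are hypothetical.
import pandas as pd

df = pd.DataFrame({DISPLAY_ID_COLUMN: [1, 2],
                   'some_float_feature': [0.1, 0.2]})
create_metadata(df, prebatch_size=4096, output_path='/tmp/prebatched_metadata')
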
def _RunInPlaceImpl(self, preprocessing_fn, metadata, transform_output_path):
  """Runs a transformation iteration in-place without looking at the data.

  Args:
    preprocessing_fn: The tf.Transform preprocessing_fn.
    metadata: A DatasetMetadata object for the input data.
    transform_output_path: An absolute path to write the output to.

  Returns:
    Status of the execution.
  """
  tf.logging.info('Processing an in-place transform')
  raw_metadata_dir = os.path.join(transform_output_path,
                                  tft.TFTransformOutput.RAW_METADATA_DIR)
  metadata_io.write_metadata(metadata, raw_metadata_dir)
  with tf.Graph().as_default() as graph:
    with tf.Session(graph=graph) as sess:
      input_signature = impl_helper.feature_spec_as_batched_placeholders(
          metadata.schema.as_feature_spec())

      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead. A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = preprocessing_fn(copied_inputs)
      sess.run(tf.global_variables_initializer())
      sess.run(tf.tables_initializer())
      transform_fn_path = os.path.join(transform_output_path,
                                       tft.TFTransformOutput.TRANSFORM_FN_DIR)
      saved_transform_io.write_saved_transform_from_session(
          sess, input_signature, output_signature, transform_fn_path)

      transformed_metadata = dataset_metadata.DatasetMetadata(
          schema=tft.schema_inference.infer_feature_schema(
              output_signature, graph, sess))
  transformed_metadata_dir = os.path.join(
      transform_output_path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
  metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)
  return _Status.OK()

def test_write_and_read(self):
  basedir = tempfile.mkdtemp()
  original_schema = schema_io_vtest.TestSchema(
      {'test_feature_1': 'bogus 1', 'test_feature_2': 'bogus 2'})
  original = dataset_metadata.DatasetMetadata(schema=original_schema)

  metadata_io.write_metadata(original, basedir, versions=_test_versions)
  reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

  self.assertTrue('test_feature_1' in reloaded.schema.column_schemas)
  self.assertTrue('test_feature_2' in reloaded.schema.column_schemas)
  self.assertEqual(2, len(reloaded.schema.column_schemas))

def write_metadata_output(metadata):
  output_path = self._path
  if self._write_to_unique_subdirectory:
    output_path = common.get_unique_temp_path(self._path)
  metadata_io.write_metadata(metadata, output_path)
  if asset_map:
    with tf.io.gfile.GFile(
        os.path.join(output_path,
                     output_wrapper.TFTransformOutput.ASSET_MAP), 'w') as f:
      f.write(json.dumps(asset_map))
  return output_path

def testReadTransformFn(self):
  path = self.get_temp_dir()
  # NOTE: we don't need to create or write to the transform_fn directory
  # since ReadTransformFn never inspects this directory.
  transform_fn_dir = os.path.join(path, 'transform_fn')
  transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
  metadata_io.write_metadata(_TEST_METADATA, transformed_metadata_dir)

  with beam.Pipeline() as pipeline:
    saved_model_dir_pcoll, metadata = (
        pipeline | transform_fn_io.ReadTransformFn(path))
    beam_test_util.assert_that(
        saved_model_dir_pcoll,
        beam_test_util.equal_to([transform_fn_dir]),
        label='AssertSavedModelDir')
    # NOTE: metadata is currently read in a non-deferred manner.
    self.assertEqual(metadata, _TEST_METADATA)

def make_transform_graph(output_dir, schema, features):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """
  tft_input_schema = make_tft_input_schema(
      schema, os.path.join(output_dir, STATS_FILE))
  tft_input_metadata = dataset_metadata.DatasetMetadata(
      schema=tft_input_schema)
  preprocessing_fn = make_preprocessing_fn(output_dir, features)

  # preprocessing_fn does not use any analyzer, so we can run a local beam job
  # to properly make and write the transform function.
  temp_dir = os.path.join(output_dir, 'tmp')
  with beam.Pipeline('DirectRunner', options=None) as p:
    with tft_impl.Context(temp_dir=temp_dir):
      # Not going to transform, so no data is needed.
      train_data = p | beam.Create([])

      transform_fn = (
          (train_data, tft_input_metadata)
          | 'BuildTransformFn'  # noqa
          >> tft_impl.AnalyzeDataset(preprocessing_fn))  # noqa

      # Writes transformed_metadata and transform_fn folders
      _ = (transform_fn
           | 'WriteTransformFn'
           >> tft_beam_io.WriteTransformFn(output_dir))  # noqa

  # Write the raw_metadata
  metadata_io.write_metadata(
      metadata=tft_input_metadata,
      path=os.path.join(output_dir, RAW_METADATA_DIR))

def make_transform_graph(output_dir, schema, features):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """
  tft_input_schema = make_tft_input_schema(
      schema, os.path.join(output_dir, STATS_FILE))
  tft_input_metadata = dataset_metadata.DatasetMetadata(
      schema=tft_input_schema)
  preprocessing_fn = make_preprocessing_fn(output_dir, features)

  # copy from /tft/beam/impl
  inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn=preprocessing_fn, schema=tft_input_schema)
  output_metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(outputs))

  transform_fn_dir = os.path.join(output_dir, TRANSFORM_FN_DIR)

  # This writes the SavedModel
  impl_helper.make_transform_fn_def(
      schema=tft_input_schema,
      inputs=inputs,
      outputs=outputs,
      saved_model_dir=transform_fn_dir)

  metadata_io.write_metadata(
      metadata=output_metadata,
      path=os.path.join(output_dir, TRANSFORMED_METADATA_DIR))
  metadata_io.write_metadata(
      metadata=tft_input_metadata,
      path=os.path.join(output_dir, RAW_METADATA_DIR))

def _create_test_data(self):
  """Makes local test data.

  The following files and folders will be created in self.output_folder:

  self.output_folder/
      features.json
      img.png
      input.csv
      schema.json
      raw_metadata/ (tft metadata files)
      transformed_metadata/ (tft metadata files)
      transform_fn/ (tft saved model file)
  """
  self.output_folder = tempfile.mkdtemp()

  # Make image file
  self.img_filepath = os.path.join(self.output_folder, 'img.png')
  image = Image.new('RGBA', size=(50, 50), color=(155, 0, 0))
  image.save(self.img_filepath, 'png')

  # Make csv input file
  self.csv_input_filepath = os.path.join(self.output_folder, 'input.csv')
  file_io.write_string_to_file(self.csv_input_filepath,
                               '23.0,%s' % self.img_filepath)

  # Make schema file
  self.schema_filepath = os.path.join(self.output_folder, 'schema.json')
  file_io.write_string_to_file(
      self.schema_filepath,
      json.dumps([{'name': 'num_col', 'type': 'FLOAT'},
                  {'name': 'img_col', 'type': 'STRING'}]))

  # Make features file
  self.features_filepath = os.path.join(self.output_folder, 'features.json')
  file_io.write_string_to_file(
      self.features_filepath,
      json.dumps({'num_col': {'transform': 'target'},
                  'img_col': {'transform': 'img_url_to_vec'}}))

  # Run a local beam job to make the transform_fn
  with beam.Pipeline('DirectRunner'):
    with tft_impl.Context(temp_dir=os.path.join(self.output_folder, 'tmp')):

      def preprocessing_fn(inputs):
        return {'img_col': tft.map(tf.decode_base64, inputs['img_col']),
                'num_col': tft.map(lambda x: tf.add(x, 1), inputs['num_col'])}

      input_data = [{'img_col': base64.urlsafe_b64encode('abcd'),
                     'num_col': 3}]
      input_metadata = dataset_metadata.DatasetMetadata(
          schema=dataset_schema.from_feature_spec(
              {'img_col': tf.FixedLenFeature(shape=[], dtype=tf.string),
               'num_col': tf.FixedLenFeature(shape=[], dtype=tf.float32)}))

      (dataset, train_metadata), transform_fn = (
          (input_data, input_metadata)
          | 'AnalyzeAndTransform'  # noqa: W503
          >> tft_impl.AnalyzeAndTransformDataset(preprocessing_fn))  # noqa: W503

      # WriteTransformFn writes transform_fn and metadata
      _ = (transform_fn  # noqa: F841
           | 'WriteTransformFn'  # noqa: W503
           >> tft_beam_io.WriteTransformFn(self.output_folder))  # noqa: W503

      metadata_io.write_metadata(
          metadata=input_metadata,
          path=os.path.join(self.output_folder, 'raw_metadata'))

def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                        HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exist in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "column_names" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exist in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "schema" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv
    from apache_beam.io import textio
    from apache_beam.io import tfrecordio
    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the
    # dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour', 'trip_start_day',
                                'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = ['pickup_latitude', 'pickup_longitude',
                           'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by
    # tf.transform.
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are
    # hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract',
                          'payment_type', 'company', 'pickup_community_area',
                          'dropoff_community_area']

    # Allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude',
                         'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and returns a Tensor with the
        correct default value.

        Args:
          tensor: tf.SparseTensor

        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values,
                                             default_value=default_value),
                          axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor,
        #                                      default_value=default_value),
        #                   axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the
            # mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor,
                vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE,
                num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(to_dense(inputs[key]),
                                         FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(
                    tf.as_string(outputs[key]),
                    vocab_filename='vocab_' + key)

        return outputs

    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(
                    TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(
                    EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn
            # function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check out what is the transform function (transform_fn)
            # that came from the previous step
            _ = (transform_fn
                 | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
                     trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(
            trns_output, os.path.join(_kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")

def run_transform(output_dir, schema, train_data_file, eval_data_file,
                  project, mode, preprocessing_fn=None):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: whether the job should be 'local' or 'cloud'.
    preprocessing_fn: a function used to preprocess the raw data. If not
      specified, a function will be automatically inferred from the schema.
  """
  tft_input_metadata = make_tft_input_metadata(schema)
  temp_dir = os.path.join(output_dir, 'tmp')
  preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

  if mode == 'local':
    pipeline_options = None
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'pipeline-tft-' +
                    datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
        'temp_location': temp_dir,
        'project': project,
        'extra_packages': [
            'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
        ]
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DataFlowRunner'
  else:
    raise ValueError("Invalid mode %s." % mode)

  with beam.Pipeline(runner, options=pipeline_options) as p:
    with beam_impl.Context(temp_dir=temp_dir):
      names = [x['name'] for x in schema]
      converter = CsvCoder(names, tft_input_metadata.schema)
      train_data = (
          p
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      train_dataset = (train_data, tft_input_metadata)
      transformed_dataset, transform_fn = (
          train_dataset
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      # Writes transformed_metadata and transform_fn folders
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(output_dir))

      # Write the raw_metadata
      metadata_io.write_metadata(
          metadata=tft_input_metadata,
          path=os.path.join(output_dir, 'metadata'))

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'train'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

      eval_data = (
          p
          | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
          | 'DecodeEvalData' >> beam.Map(converter.decode))

      eval_dataset = (eval_data, tft_input_metadata)
      transformed_eval_dataset = (
          (eval_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_eval_data, transformed_metadata = transformed_eval_dataset

      _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'eval'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

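# Hedged call sketch for run_transform above, using the local DirectRunner
# path. The file patterns and output directory are hypothetical; the
# schema-list shape mirrors the schema.json format seen elsewhere in this
# corpus ([{'name': ..., 'type': ...}, ...]).
run_transform(
    output_dir='/tmp/tft_out',
    schema=[{'name': 'fare', 'type': 'FLOAT'}],  # assumed single-column schema
    train_data_file='/tmp/data/train*.csv',
    eval_data_file='/tmp/data/eval*.csv',
    project=None,  # unused in 'local' mode
    mode='local')
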
def expand(self, metadata):
  """A PTransform to write Metadata to disk."""
  metadata_io.write_metadata(metadata, self._path)

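# Hedged usage sketch: applying the enclosing PTransform to a DatasetMetadata
# value inside a pipeline. The class name (WriteMetadata) and its single path
# argument are assumed from this expand() snippet, and raw_metadata stands in
# for a DatasetMetadata produced earlier in the pipeline.
raw_metadata_dir = os.path.join(output_dir, 'raw_metadata')
_ = (raw_metadata
     | 'WriteRawMetadata' >> WriteMetadata(raw_metadata_dir))
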
def write_metadata_output(metadata):
  output_path = self._path
  if self._write_to_unique_subdirectory:
    output_path = common.get_unique_temp_path(self._path)
  metadata_io.write_metadata(metadata, output_path)
  return output_path