def input_fn(filenames, tf_transform_dir, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    batch_size: int First dimension size of the Tensors returned by input_fn

  Returns:
    A (features, indices) tuple where features is a dictionary of Tensors,
    and indices is a single Tensor of label indices.
  """
  # Recover the feature spec of the *transformed* data written by tf-transform.
  metadata_path = os.path.join(tf_transform_dir,
                               transform_fn_io.TRANSFORMED_METADATA_DIR)
  feature_spec = (
      metadata_io.read_metadata(metadata_path).schema.as_feature_spec())

  batched_features = tf.contrib.learn.io.read_batch_features(
      filenames, batch_size, feature_spec, reader=_gzip_reader_fn)

  # The label must not be fed to the model as a feature, so remove it from
  # the feature dict and hand it back separately.
  labels = batched_features.pop(taxi.transformed_name(taxi.LABEL_KEY))
  return batched_features, labels
def input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int First dimension size of the Tensors returned by input_fn

  Returns:
    A (features, indices) tuple where features is a dictionary of Tensors,
    and indices is a single Tensor of label indices.
  """
  # Copy the spec so downstream mutation cannot leak into the spec cached
  # inside the TFTransformOutput object.
  feature_spec = tf_transform_output.transformed_feature_spec().copy()

  batch = tf.contrib.learn.io.read_batch_features(
      filenames, batch_size, feature_spec, reader=_gzip_reader_fn)

  # The label is not a model input; split it out of the feature dict.
  labels = batch.pop(taxi.transformed_name(taxi.LABEL_KEY))
  return batch, labels
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}

  # Dense floats: z-score normalize, imputing missing values first.
  for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
    dense = _fill_in_missing(inputs[name])
    outputs[taxi.transformed_name(name)] = transform.scale_to_z_score(dense)

  # String features: map to integer ids via a generated vocabulary, with
  # out-of-vocabulary strings hashed into OOV buckets.
  for name in taxi.VOCAB_FEATURE_KEYS:
    outputs[taxi.transformed_name(name)] = (
        transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[name]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE))

  # Continuous features discretized into quantile buckets.
  for name in taxi.BUCKET_FEATURE_KEYS:
    outputs[taxi.transformed_name(name)] = transform.bucketize(
        _fill_in_missing(inputs[name]), taxi.FEATURE_BUCKET_COUNT)

  # Already-integral categorical features pass through unchanged (aside from
  # missing-value imputation).
  for name in taxi.CATEGORICAL_FEATURE_KEYS:
    outputs[taxi.transformed_name(name)] = _fill_in_missing(inputs[name])

  # Label: was this passenger a big tipper (tip > 20% of the fare)?
  # Rows with a NaN fare get label 0.
  fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
  zero_when_fare_missing = tf.cast(tf.zeros_like(fare), tf.int64)
  is_big_tip = tf.cast(
      tf.greater(tips, tf.multiply(fare, tf.constant(0.2))), tf.int64)
  outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(fare), zero_when_fare_missing, is_big_tip)

  return outputs
def build_estimator(tf_transform_dir, config, hidden_units=None):
  """Build an estimator for predicting the tipping behavior of taxi riders.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    config: tf.contrib.learn.RunConfig defining the runtime environment for
      the estimator (including model_dir).
    hidden_units: [int], the layer sizes of the DNN (input layer first)

  Returns:
    Resulting DNNLinearCombinedClassifier.
  """
  metadata_dir = os.path.join(tf_transform_dir,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  # The label is not an input feature of the estimator.
  transformed_feature_spec.pop(taxi.transformed_name(taxi.LABEL_KEY))

  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in taxi.transformed_names(taxi.DENSE_FLOAT_FEATURE_KEYS)
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.VOCAB_SIZE + taxi.OOV_SIZE, default_value=0)
      for key in taxi.transformed_names(taxi.VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.FEATURE_BUCKET_COUNT, default_value=0)
      for key in taxi.transformed_names(taxi.BUCKET_FEATURE_KEYS)
  ]
  # BUG FIX: zip()'s second argument had been commented out, which both
  # unbalanced the parentheses and left zip() with a single iterable, so the
  # (key, num_buckets) unpacking could never work. Pair each categorical key
  # with its maximum number of distinct values.
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=num_buckets, default_value=0)
      for key, num_buckets in zip(
          taxi.transformed_names(taxi.CATEGORICAL_FEATURE_KEYS),
          taxi.MAX_CATEGORICAL_FEATURE_VALUES)
  ]
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=categorical_columns,
      dnn_feature_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25])
def eval_input_receiver_fn(tf_transform_dir, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # The receiver consumes serialized tf.Examples holding *raw* (untransformed)
  # features; the saved transform graph is replayed below to reproduce the
  # features the model was trained on.
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  serialized_examples = tf.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')
  raw_features = tf.parse_example(serialized_examples, raw_feature_spec)

  transform_fn_path = os.path.join(tf_transform_dir,
                                   transform_fn_io.TRANSFORM_FN_DIR)
  _, transformed_features = (
      saved_transform_io.partially_apply_saved_transform(
          transform_fn_path, raw_features))

  # tf-model-analysis requires the receiver tensor key to be 'examples'.
  receiver_tensors = {'examples': serialized_examples}

  # The model is driven by the transformed features, but slicing happens on
  # raw features, so expose the union of both.
  raw_features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=raw_features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[taxi.transformed_name(taxi.LABEL_KEY)])
def build_estimator(tf_transform_output, config, hidden_units=None):
  """Build an estimator for predicting the tipping behavior of taxi riders.

  Args:
    tf_transform_output: A TFTransformOutput.
    config: tf.contrib.learn.RunConfig defining the runtime environment for
      the estimator (including model_dir).
    hidden_units: [int], the layer sizes of the DNN (input layer first)

  Returns:
    Resulting DNNLinearCombinedClassifier.
  """
  # Copy the spec so popping the label does not mutate the spec cached inside
  # the TFTransformOutput.
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  # The label is not an input feature of the estimator.
  transformed_feature_spec.pop(taxi.transformed_name(taxi.LABEL_KEY))

  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in taxi.transformed_names(taxi.DENSE_FLOAT_FEATURE_KEYS)
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.VOCAB_SIZE + taxi.OOV_SIZE, default_value=0)
      for key in taxi.transformed_names(taxi.VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.FEATURE_BUCKET_COUNT, default_value=0)
      for key in taxi.transformed_names(taxi.BUCKET_FEATURE_KEYS)
  ]
  # BUG FIX: zip()'s second argument had been commented out, which both
  # unbalanced the parentheses and left zip() with a single iterable, so the
  # (key, num_buckets) unpacking could never work. Pair each categorical key
  # with its maximum number of distinct values.
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=num_buckets, default_value=0)
      for key, num_buckets in zip(
          taxi.transformed_names(taxi.CATEGORICAL_FEATURE_KEYS),
          taxi.MAX_CATEGORICAL_FEATURE_VALUES)
  ]
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=categorical_columns,
      dnn_feature_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25])
def eval_input_receiver_fn(tf_transform_dir, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Inputs here are serialized raw tf.Examples, not transformed features.
  placeholder = tf.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')
  features = tf.parse_example(placeholder, taxi.get_raw_feature_spec(schema))

  # Apply the tf-transform function computed during the preprocessing step so
  # evaluation sees exactly the training-time feature transformations.
  _, transformed_features = (
      saved_transform_io.partially_apply_saved_transform(
          os.path.join(tf_transform_dir, transform_fn_io.TRANSFORM_FN_DIR),
          features))

  # The key name MUST be 'examples' for tf-model-analysis.
  # Merge transformed features into the raw dict: the model reads transformed
  # features while slicing happens on the raw ones.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors={'examples': placeholder},
      labels=transformed_features[taxi.transformed_name(taxi.LABEL_KEY)])
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  # FIX: removed leftover debug print() statements (they spam the log on
  # every trace of the preprocessing graph) and a stale commented-out
  # implementation of the ngram branch.
  outputs = {}
  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[
        taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

  for key in taxi.FEATURE_NGRAM:
    # Extract ngrams and build a vocab.
    # NOTE(review): transform_ngrams is a helper defined elsewhere in this
    # file; presumed to tokenize and n-gram the strings over
    # taxi.NGRAM_RANGE -- confirm against its definition.
    outputs[
        taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            transform_ngrams(_fill_in_missing(inputs[key]), taxi.NGRAM_RANGE),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

  for key in taxi.BUCKET_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = transform.bucketize(
        _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper? Rows with a NaN fare get label 0.
  taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
  outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))
  return outputs