def preprocess_fn(input_features):
    """Pass the raw text features through and attach an embedding for each.

    Args:
        input_features: dict of raw features; the keys 'topics', 'title'
            and 'content' are read.

    Returns:
        A dict holding 'topics', 'title' and 'content' unchanged, plus
        'title_embed' (result of `get_embed_title` applied to the title)
        and 'content_embed' (result of `get_embed_content` applied to the
        content).
    """
    # Kept as a function-local import, as in the original block.
    import tensorflow_transform as tft

    # Pair each embedder with its own feature. Previously the two were
    # crossed, so 'title_embed' carried the content embedding and
    # 'content_embed' carried the title embedding.
    title_embed = tft.apply_function(get_embed_title, input_features['title'])
    content_embed = tft.apply_function(
        get_embed_content, input_features['content'])

    output_features = {
        'topics': input_features['topics'],
        'title': input_features['title'],
        'content': input_features['content'],
        'title_embed': title_embed,
        'content_embed': content_embed,
    }
    return output_features
def preprocessing_fn(inputs):
  """Build the transformed feature dict from the raw input columns.

  Starts from a copy of `inputs`, so columns that are not touched below are
  passed through unchanged.

  Args:
    inputs: dict mapping feature keys to the raw input columns.

  Returns:
    A dict with the same keys as `inputs` in which the numeric columns are
    scaled to [0, 1] and the label column is replaced by its integer index.
  """

  def convert_label(label):
    # Index of the label string within the given list of values.
    label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
    return label_table.lookup(label)

  transformed = inputs.copy()

  # Numeric columns: rescale into the [0, 1] range.
  for numeric_key in NUMERIC_FEATURE_KEYS:
    transformed[numeric_key] = tft.scale_to_0_1(transformed[numeric_key])

  # Categorical columns: only a vocabulary (named after the column key) is
  # generated; the feature itself is left as-is. Per the original note, the
  # trainer is expected to use that vocabulary through a feature column to
  # turn the strings into integer ids.
  for categorical_key in CATEGORICAL_FEATURE_KEYS:
    tft.uniques(inputs[categorical_key], vocab_filename=categorical_key)

  # Label column: explicit string -> index mapping.
  transformed[LABEL_KEY] = tft.apply_function(
      convert_label, transformed[LABEL_KEY])
  return transformed
def preprocessing_fn(input_features):
    """Derive embeddings, bag-of-words indices and tf-idf weights from titles.

    Args:
        input_features: dict of raw features; the keys 'clean_title',
            'raw_title' and 'topic' are read.

    Returns:
        A dict with 'topic' and 'title' (taken from 'raw_title') passed
        through, plus 'bow', 'tf_idf' and 'embeddings' computed from
        'clean_title'.
    """
    clean_title = input_features['clean_title']

    # Embeddings come from the file-level `get_embeddings` helper.
    title_embeddings = tft.apply_function(get_embeddings, clean_title)

    # Split on the configured delimiters, then map tokens to vocabulary ids
    # (vocabulary limited to the top VOCAB_SIZE tokens).
    tokens = tf.string_split(clean_title, parameters.DELIMITERS)
    token_ids = tft.string_to_int(tokens, top_k=parameters.VOCAB_SIZE)

    # tf-idf over the token ids; the vocabulary size passed is VOCAB_SIZE + 1.
    bow_indices, tfidf_weights = tft.tfidf(
        token_ids, parameters.VOCAB_SIZE + 1)

    return {
        'topic': input_features['topic'],
        'title': input_features['raw_title'],
        'bow': bow_indices,
        'tf_idf': tfidf_weights,
        'embeddings': title_embeddings,
    }
def pre_processing_fun(inputs):
    """Transform raw input columns into model-ready columns.

    Args:
        inputs: dict mapping feature keys to the raw input columns.

    Returns:
        A dict containing the numeric columns scaled to [0, 1], the
        categorical columns converted with `tft.string_to_int`, and the
        label column converted to its integer index.
    """

    def convert_label(label):
        # Index of the label string within the given list of values.
        label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
        return label_table.lookup(label)

    # Numeric columns first, then categorical ones, in the same order as
    # the key lists.
    outputs = {
        name: tft.scale_to_0_1(inputs[name]) for name in NUMERIC_FEATURE_KEYS
    }
    outputs.update(
        (name, tft.string_to_int(inputs[name]))
        for name in CATEGORICAL_FEATURE_KEYS)

    outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])
    return outputs
def preprocessing_fn(inputs):
  """tf.transform callback that preprocesses the raw taxi features.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """

  def convert_label(label):
    # Was this passenger a big tipper? The result is 1 where `label` exceeds
    # 20% of the fare, and 0 otherwise or wherever the fare is NaN.
    fare = inputs[taxi.FARE_KEY]
    fare_is_missing = tf.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    exceeds_threshold = tf.cast(
        tf.greater(label, tf.multiply(fare, tf.constant(0.2))), tf.int64)
    return tf.where(fare_is_missing, zero_label, exceeds_threshold)

  transformed = {}

  # Dense float features are standardised to a z-score.
  for float_key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    transformed[float_key] = transform.scale_to_z_score(inputs[float_key])

  # Vocabulary features are mapped to integer ids.
  for vocab_key in taxi.VOCAB_FEATURE_KEYS:
    transformed[vocab_key] = transform.string_to_int(
        inputs[vocab_key],
        top_k=taxi.VOCAB_SIZE,
        num_oov_buckets=taxi.OOV_SIZE)

  # Bucket features are bucketized using FEATURE_BUCKET_COUNT.
  for bucket_key in taxi.BUCKET_FEATURE_KEYS:
    transformed[bucket_key] = transform.bucketize(
        inputs[bucket_key], taxi.FEATURE_BUCKET_COUNT)

  # Categorical features are passed through untouched.
  for categorical_key in taxi.CATEGORICAL_FEATURE_KEYS:
    transformed[categorical_key] = inputs[categorical_key]

  transformed[taxi.LABEL_KEY] = transform.apply_function(
      convert_label, inputs[taxi.LABEL_KEY])
  return transformed
def preprocessing_fn(inputs):
  """Preprocess input columns into transformed columns.

  Args:
    inputs: dict mapping feature keys to the raw input columns.

  Returns:
    A dict with scaled numeric columns, integer-encoded categorical columns
    and the integer-encoded label column.
  """

  def convert_label(label):
    # Explicit mapping from label string to index.
    label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
    return label_table.lookup(label)

  outputs = {}

  # Numeric columns are scaled to [0, 1].
  for numeric_key in NUMERIC_FEATURE_KEYS:
    outputs[numeric_key] = tft.scale_to_0_1(inputs[numeric_key])

  # Categorical columns (the label excluded) go through tft.string_to_int,
  # which computes the set of unique values and uses it to convert the
  # strings to indices.
  for categorical_key in CATEGORICAL_FEATURE_KEYS:
    outputs[categorical_key] = tft.string_to_int(inputs[categorical_key])

  outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])
  return outputs
def preprocessing_fn(inputs):
  """Preprocess input columns into transformed columns.

  Args:
    inputs: dict mapping feature keys to the raw input columns.

  Returns:
    A dict containing the scaled numeric columns, a '<key>_bucketized'
    column for every bucketized feature, the integer-encoded and hashed
    string columns, and the integer-encoded label column.
  """

  def convert_label(label):
    """Map the string label tensor to its index.

    Args:
      label: string Tensor holding the label values.

    Returns:
      The result of looking `label` up in a table built from
      ['<=50K', '>50K'].
    """
    label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
    return label_table.lookup(label)

  outputs = {}

  # Numeric columns are scaled to [0, 1].
  for numeric_key in NUMERIC_FEATURE_KEYS:
    outputs[numeric_key] = tft.scale_to_0_1(inputs[numeric_key])

  # Bucketized copies of numeric columns; the mapping value is forwarded as
  # the second argument of tft.bucketize.
  for bucket_key, bucket_arg in TO_BE_BUCKETIZED_FEATURE.items():
    outputs[bucket_key + '_bucketized'] = tft.bucketize(
        inputs[bucket_key], bucket_arg)

  # Categorical columns with a small vocabulary; the vocabulary file is
  # named after the column key.
  for vocab_key in STRING_TO_INT_FEATURE_KEYS:
    outputs[vocab_key] = tft.string_to_int(
        inputs[vocab_key], vocab_filename=vocab_key)

  # Hashed string columns; the mapping value is forwarded as the second
  # argument of tft.hash_strings.
  for hash_key, hash_arg in HASH_STRING_FEATURE_KEYS.items():
    outputs[hash_key] = tft.hash_strings(inputs[hash_key], hash_arg)

  outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])
  return outputs
def preprocessing_fn(inputs):
  """Turn the raw input columns into transformed columns.

  Args:
    inputs: dict mapping feature keys to the raw input columns.

  Returns:
    A dict holding: numeric columns scaled to [0, 1]; '<key>_bucketized'
    columns; string columns converted to integer ids or hashes; and the
    label column converted to its index.
  """

  def convert_label(label):
    """Look up the index of each label string.

    Args:
      label: string Tensor holding the label values.

    Returns:
      The lookup result for `label` in a table built from
      ['<=50K', '>50K'].
    """
    index_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
    return index_table.lookup(label)

  features = {}

  # Scale numeric columns to have range [0, 1].
  for name in NUMERIC_FEATURE_KEYS:
    features[name] = tft.scale_to_0_1(inputs[name])

  # Bucketize numeric columns, stored under a '_bucketized' suffix.
  for name in TO_BE_BUCKETIZED_FEATURE:
    bucketized_name = name + '_bucketized'
    features[bucketized_name] = tft.bucketize(
        inputs[name], TO_BE_BUCKETIZED_FEATURE[name])

  # Small-vocabulary categorical columns become integer ids.
  for name in STRING_TO_INT_FEATURE_KEYS:
    features[name] = tft.string_to_int(inputs[name], vocab_filename=name)

  # Remaining string columns are hashed.
  for name in HASH_STRING_FEATURE_KEYS:
    features[name] = tft.hash_strings(
        inputs[name], HASH_STRING_FEATURE_KEYS[name])

  features[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])
  return features
def preprocessing_fn(inputs):
  """User defined preprocessing function for movielens columns.

  Args:
    inputs: a `dict` that maps EXAMPLE_COLUMNS to the corresponding
      Tensor/SparseTensor.

  Returns:
    A `dict` that maps EXAMPLE_COLUMNS to the transformed Tensor/SparseTensor.
  """

  def scale_sparse_values(x, min_value, max_value):
    """Apply 0-1 normalization to the values of a SparseTensor.

    Args:
      x: the input sparse tensor.
      min_value: minimum value for x.values.
      max_value: maximum value for x.values.

    Returns:
      A sparse tensor with the indices and dense shape of `x` whose values
      are (x.values - min_value) / (max_value - min_value).
    """
    value_range = max_value - min_value
    normalized = (x.values - min_value) / value_range
    return tf.SparseTensor(
        indices=x.indices, values=normalized, dense_shape=x.dense_shape)

  def map_to_int(x, vocabulary_or_file):
    """Map a string tensor to indexes using a vocabulary.

    Args:
      x: a Tensor/SparseTensor of string.
      vocabulary_or_file: a Tensor/SparseTensor containing unique string
        values within x, or a single value for the file where the
        vocabulary is stored.

    Returns:
      A Tensor/SparseTensor of indexes (int) of the same shape as x.
    """
    # TODO(b/62489180): Remove this workaround once TFT 0.2.0 is released.
    if not (hasattr(impl, '_asset_files_supported') and
            impl._asset_files_supported()):  # pylint: disable=protected-access
      # Vocabulary supplied as a tensor.
      return tf.contrib.lookup.string_to_index_table_from_tensor(
          mapping=vocabulary_or_file, num_oov_buckets=1).lookup(x)
    # Vocabulary supplied as an asset file.
    return tf.contrib.lookup.string_to_index_table_from_file(
        vocabulary_file=vocabulary_or_file, num_oov_buckets=1).lookup(x)

  # Every example column starts out as a pass-through.
  result = {name: inputs[name] for name in EXAMPLE_COLUMNS}

  # Normalize the rated-movie scores with their observed max/min.
  rated_scores = inputs[QUERY_RATED_MOVIE_SCORES]
  rating_max = tft.max(rated_scores.values)
  rating_min = tft.min(rated_scores.values)
  result[QUERY_RATED_MOVIE_SCORES] = scale_sparse_values(
      rated_scores, rating_min, rating_max)

  # One vocabulary shared by all genre columns, one by all movie columns.
  genre_values = [
      inputs[QUERY_RATED_GENRE_IDS].values,
      inputs[CANDIDATE_GENRE_IDS].values,
  ]
  genre_vocab = tft.uniques(tf.concat(genre_values, 0))
  movie_values = [
      inputs[QUERY_RATED_MOVIE_IDS].values,
      inputs[CANDIDATE_MOVIE_ID].values,
      inputs[RANKING_CANDIDATE_MOVIE_IDS].values,
  ]
  movie_vocab = tft.uniques(tf.concat(movie_values, 0))

  # Replace each id column by its vocabulary indexes.
  column_vocabs = (
      (QUERY_RATED_GENRE_IDS, genre_vocab),
      (CANDIDATE_GENRE_IDS, genre_vocab),
      (QUERY_RATED_MOVIE_IDS, movie_vocab),
      (CANDIDATE_MOVIE_ID, movie_vocab),
      (RANKING_CANDIDATE_MOVIE_IDS, movie_vocab),
  )
  for column, vocab in column_vocabs:
    result[column] = tft.apply_function(map_to_int, inputs[column], vocab)
  return result
def preprocess_fn(input_features):
    """Pair each example id with the embedding of its text.

    Args:
        input_features: dict of raw features; the keys 'id' and 'text'
            are read.

    Returns:
        A dict with 'id' passed through and 'embedding' holding the result
        of applying `embed_text` to the 'text' feature.
    """
    # Kept as a function-local import, as in the original block.
    import tensorflow_transform as tft

    text_embedding = tft.apply_function(embed_text, input_features['text'])
    return {
        'id': input_features['id'],
        'embedding': text_embedding,
    }
def preprocessing_fn(inputs):
  """Preprocess the movielens example columns.

  Args:
    inputs: a `dict` that maps EXAMPLE_COLUMNS to the corresponding
      Tensor/SparseTensor.

  Returns:
    A `dict` that maps EXAMPLE_COLUMNS to the transformed Tensor/SparseTensor.
  """

  def scale_sparse_values(x, min_value, max_value):
    """0-1 normalize the values of a SparseTensor.

    Args:
      x: the input sparse tensor.
      min_value: minimum value for x.values.
      max_value: maximum value for x.values.

    Returns:
      A sparse tensor y such that y.values is the 0-1 normalization of
      x.values; indices and dense shape are taken from `x`.
    """
    return tf.SparseTensor(
        indices=x.indices,
        values=(x.values - min_value) / (max_value - min_value),
        dense_shape=x.dense_shape)

  def map_to_int(x, vocabulary_or_file):
    """Convert a string tensor into vocabulary indexes.

    Args:
      x: a Tensor/SparseTensor of string.
      vocabulary_or_file: a Tensor/SparseTensor containing unique string
        values within x, or a single value for the file where the
        vocabulary is stored.

    Returns:
      A Tensor/SparseTensor of indexes (int) of the same shape as x.
    """
    # TODO(b/62489180): Remove this workaround once TFT 0.2.0 is released.
    asset_files_supported = (
        hasattr(impl, '_asset_files_supported') and
        impl._asset_files_supported())  # pylint: disable=protected-access
    if asset_files_supported:
      lookup_table = tf.contrib.lookup.string_to_index_table_from_file(
          vocabulary_file=vocabulary_or_file, num_oov_buckets=1)
    else:
      lookup_table = tf.contrib.lookup.string_to_index_table_from_tensor(
          mapping=vocabulary_or_file, num_oov_buckets=1)
    return lookup_table.lookup(x)

  # Start with every example column passed through unchanged.
  transformed = {}
  for column_name in EXAMPLE_COLUMNS:
    transformed[column_name] = inputs[column_name]

  # Scores are normalized with their observed max and min.
  score_max = tft.max(inputs[QUERY_RATED_MOVIE_SCORES].values)
  score_min = tft.min(inputs[QUERY_RATED_MOVIE_SCORES].values)
  transformed[QUERY_RATED_MOVIE_SCORES] = scale_sparse_values(
      inputs[QUERY_RATED_MOVIE_SCORES], score_min, score_max)

  # The genre columns share one vocabulary, the movie columns another.
  all_genre_ids = tf.concat([
      inputs[QUERY_RATED_GENRE_IDS].values,
      inputs[CANDIDATE_GENRE_IDS].values,
  ], 0)
  genre_vocab = tft.uniques(all_genre_ids)
  all_movie_ids = tf.concat([
      inputs[QUERY_RATED_MOVIE_IDS].values,
      inputs[CANDIDATE_MOVIE_ID].values,
      inputs[RANKING_CANDIDATE_MOVIE_IDS].values,
  ], 0)
  movie_vocab = tft.uniques(all_movie_ids)

  # Genre id columns, then movie id columns, are replaced by their indexes.
  for genre_column in (QUERY_RATED_GENRE_IDS, CANDIDATE_GENRE_IDS):
    transformed[genre_column] = tft.apply_function(
        map_to_int, inputs[genre_column], genre_vocab)
  for movie_column in (QUERY_RATED_MOVIE_IDS, CANDIDATE_MOVIE_ID,
                       RANKING_CANDIDATE_MOVIE_IDS):
    transformed[movie_column] = tft.apply_function(
        map_to_int, inputs[movie_column], movie_vocab)
  return transformed