def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" # Since we are modifying some features and leaving others unchanged, we # start by setting `outputs` to a copy of `inputs. outputs = inputs.copy() # Scale numeric columns to have range [0, 1]. for key in NUMERIC_FEATURE_KEYS: outputs[key] = tft.scale_to_0_1(outputs[key]) # For all categorical columns except the label column, we generate a # vocabulary but do not modify the feature. This vocabulary is instead # used in the trainer, by means of a feature column, to convert the feature # from a string to an integer id. for key in CATEGORICAL_FEATURE_KEYS: tft.vocabulary(inputs[key], vocab_filename=key) # For the label column we provide the mapping from string to index. def convert_label(label): table = tf.contrib.lookup.index_table_from_tensor( ['>50K', '<=50K']) return table.lookup(label) outputs[LABEL_KEY] = tft.apply_function(convert_label, outputs[LABEL_KEY]) return outputs
def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" # Since we are modifying some features and leaving others unchanged, we # start by setting `outputs` to a copy of `inputs. outputs = inputs.copy() # Scale numeric columns to have range [0, 1]. for key in NUMERIC_FEATURE_KEYS: outputs[key] = tft.scale_to_0_1(outputs[key]) for key in OPTIONAL_NUMERIC_FEATURE_KEYS: # This is a SparseTensor because it is optional. Here we fill in a default # value when it is missing. dense = tf.sparse_to_dense(outputs[key].indices, [outputs[key].dense_shape[0], 1], outputs[key].values, default_value=0.) # Reshaping from a batch of vectors of size 1 to a batch to scalars. dense = tf.squeeze(dense, axis=1) outputs[key] = tft.scale_to_0_1(dense) # For all categorical columns except the label column, we generate a # vocabulary but do not modify the feature. This vocabulary is instead # used in the trainer, by means of a feature column, to convert the feature # from a string to an integer id. for key in CATEGORICAL_FEATURE_KEYS: tft.vocabulary(inputs[key], vocab_filename=key) # For the label column we provide the mapping from string to index. table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K']) outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY]) return outputs
def preprocessing_fn(inputs): """tf.transform's callback function for preprocessing inputs. Args: inputs: map from feature keys to raw not-yet-transformed features. Returns: Map from string feature key to transformed feature operations. """ outputs = {} DENSE_FLOAT_FEATURE_KEYS = [] VOCAB_FEATURE_KEYS = [] _CSV_COLUMNS_NAMES, _CSV_COLUMN_DEFAULTS, _CSV_COLUMN_types, _UNUSED = setcolumn_list_original( ) for i in range(len(_CSV_COLUMNS_NAMES)): if _CSV_COLUMN_types[i] is tf.string: VOCAB_FEATURE_KEYS.append(_CSV_COLUMNS_NAMES[i]) outputs['gci'] = tf.expand_dims(_fill_in_missing(inputs['gci']), 1) for key in VOCAB_FEATURE_KEYS: if key in _UNUSED: continue if 'gci' in key: appendlist = tf.expand_dims(_fill_in_missing(inputs[key]), 1) outputs['gci'] = tf.concat([appendlist, outputs['gci']], 0) transform.vocabulary(outputs['gci'], vocab_filename='gci') transform.vocabulary(inputs['LAT_LON_10'], vocab_filename='label') return outputs
def preprocessing_fn(inputs): """Tftransform processing function""" tft.vocabulary(inputs["example_categ"], vocab_filename="example_categ") return { "context_num": tft.scale_to_0_1(inputs["context_num"]), "example_categ": inputs["example_categ"], "example_num": tft.scale_to_0_1(inputs["example_num"]), "label": inputs["label"] }
def transform_to_tfrecord(self, inputs):
  """Preprocess raw input columns into transformed columns."""
  outputs = inputs.copy()
  for key in self.data_formatter.number_features:
    outputs[key] = tft.scale_to_z_score(outputs[key])
  for key in self.data_formatter.vocabulary_features:
    tft.vocabulary(inputs[key], vocab_filename=key)
  return outputs
def preprocess_fn(inputs):
  """TensorFlow Transform preprocessing function.

  Args:
    inputs: Dict of key to Tensor.

  Returns:
    Dict of key to transformed Tensor.
  """
  outputs = inputs.copy()
  for key in CATEGORICAL_COLUMNS:
    tft.vocabulary(inputs[key], vocab_filename=key)
  return outputs
def preprocessing_fn_train(inputs):
  """Preprocess input columns into transformed columns."""
  context = inputs['Context']
  utterance = inputs['Utterance']
  vocab = tf.concat([context, utterance], 0)
  context_tokens = tf.compat.v1.string_split(context, DELIMITERS)
  utterance_tokens = tf.compat.v1.string_split(utterance, DELIMITERS)
  vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)
  vocab_mapping_file_path = tft.vocabulary(
      vocab_tokens, vocab_filename='anantvir_train_vocab')
  mapped_context = tft.apply_vocabulary(
      context_tokens,
      deferred_vocab_filename_tensor=vocab_mapping_file_path)
  mapped_utterance = tft.apply_vocabulary(
      utterance_tokens,
      deferred_vocab_filename_tensor=vocab_mapping_file_path)
  return {
      'Context': mapped_context,
      'Utterance': mapped_utterance,
  }
def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" outputs = {} for key in numerical_feats: outputs[key] = tf.cast(tft.bucketize(inputs[key], 20), tf.float32) / 20.0 - 0.5 outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0 inputs["game_zone"] = tf.string_join( [inputs["sourceGameId"], inputs["zone"]], separator="_") inputs["game_campaignId"] = tf.string_join( [inputs["sourceGameId"], inputs["campaignId"]], separator="_") for key in categorical_feats + ["game_zone", "game_campaignId"]: vocab = tft.vocabulary(inputs[key], vocab_filename=key, frequency_threshold=100) outputs[key] = tft.apply_vocabulary(inputs[key], vocab, default_value=0) outputs["label"] = inputs["label"] outputs["key"] = inputs["key"] return outputs
def preprocessing_fn(inputs): """Callback function for preprocessing inputs. Args: inputs: map from feature keys to raw not-yet-transformed features. Returns: Map from string feature key to transformed feature operations. """ outputs = inputs.copy() # Compute a vocabulary based on the TOP-K current pages and labels seen in # the dataset. vocab = tft.vocabulary(tf.concat( [inputs[_CUR_PAGE_FEATURE_KEY], inputs[_LABEL_KEY]], axis=0), top_k=_TOP_K, vocab_filename=_VOCAB_FILENAME) # Apply the vocabulary to both the current page feature and the label, # converting the strings into integers. for k in [_CUR_PAGE_FEATURE_KEY, _LABEL_KEY]: # Out-of-vocab strings will be assigned the _TOP_K value. outputs[k] = tft.apply_vocabulary(inputs[k], vocab, default_value=_TOP_K) return outputs
def preprocess_fn(inputs):
  """TensorFlow Transform preprocessing function.

  Args:
    inputs: Dict of key to Tensor.

  Returns:
    Dict of key to transformed Tensor.
  """
  outputs = inputs.copy()

  # For all categorical columns except the label column, we generate a
  # vocabulary but do not modify the feature. This vocabulary is instead
  # used in the trainer, by means of a feature column, to convert the feature
  # from a string to an integer id.
  for key in CATEGORICAL_COLUMNS:
    tft.vocabulary(inputs[key], vocab_filename=key)
  return outputs
def preprocessing_fn(
    inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
  """tf.transform's callback function for preprocessing inputs.

  Parameters
  ----------
  inputs: map from feature keys to raw not-yet-transformed features.

  Returns
  -------
  Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in categorical_feature_keys + [label_key]:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])
    vocab_file_tensor = tft.vocabulary(outputs[_transformed_name(key)],
                                       vocab_filename=key)
    outputs[_transformed_name(key)] = tft.apply_vocabulary(
        outputs[_transformed_name(key)], vocab_file_tensor)

  # NOTE: This won't be correct in the incremental case since it's only using
  # the new examples to get the mean and variance.
  for key in numerical_feature_keys:
    outputs[_transformed_name(key)] = tf.expand_dims(
        tft.scale_to_z_score(_fill_in_missing(inputs[key])), axis=1)

  return outputs
def preprocessing_fn(
    inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
  """tf.transform's callback function for preprocessing inputs.

  Parameters
  ----------
  inputs: map from feature keys to raw not-yet-transformed features.

  Returns
  -------
  Map from string feature key to transformed feature operations.
  """
  outputs = {}
  outputs[_transformed_name(label_key)] = _fill_in_missing(inputs[label_key])
  vocab_file_tensor = tft.vocabulary(
      outputs[_transformed_name(label_key)], vocab_filename=label_key)
  outputs[_transformed_name(label_key)] = tft.apply_vocabulary(
      outputs[_transformed_name(label_key)], vocab_file_tensor)

  outputs[_transformed_name(pixel_key)] = tf.concat(
      [_fill_in_missing(inputs[str(i + 1)]) for i in range(num_pixels)],
      axis=1)
  # NOTE: This won't be correct in the incremental case since it's only using
  # the new examples to get the mean and variance.
  outputs[_transformed_name('pixels')] = tft.scale_to_0_1(
      outputs[_transformed_name('pixels')])

  return outputs
def _preprocessing_fn_for_common_optimize_traversal(inputs):
  _ = tft.vocabulary(inputs['s'])
  x = inputs['x']
  x_mean = tft.mean(x, name='x')
  x_square_deviations = tf.square(x - x_mean)
  x_var = tft.mean(x_square_deviations, name='x_square_deviations')
  x_normalized = (x - x_mean) / tf.sqrt(x_var)
  return {'x_normalized': x_normalized}
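The manual mean/variance arithmetic above is equivalent to the built-in z-score analyzer; a minimal sketch of the same function written with it (the vocabulary analysis is kept for parity, and the function name is hypothetical):

import tensorflow_transform as tft

def _preprocessing_fn_with_builtin_zscore(inputs):
  # tft.scale_to_z_score runs the mean and variance analyzers internally
  # and performs the same (x - mean) / sqrt(var) normalization.
  _ = tft.vocabulary(inputs['s'])
  return {'x_normalized': tft.scale_to_z_score(inputs['x'])}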
def preprocessing_fn(inputs): """Transform preprocessing_fn.""" # generate a shared vocabulary. _ = tft.vocabulary(tf.concat([ inputs[features.QUERY_TOKENS].flat_values, inputs[features.DOCUMENT_TOKENS].flat_values ], axis=0), vocab_filename='shared_vocab') return inputs
def transform_to_tfrecord(self, inputs):
  """Preprocess raw input columns into transformed columns."""
  outputs = inputs.copy()
  for key in enabled_number_features:
    outputs[key] = tft.scale_to_z_score(outputs[key])
  # for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
  #   # This is a SparseTensor because it is optional. Here we fill in a
  #   # default value when it is missing.
  #   dense = tf.sparse_to_dense(outputs[key].indices,
  #                              [outputs[key].dense_shape[0], 1],
  #                              outputs[key].values, default_value=0.)
  #   # Reshaping from a batch of vectors of size 1 to a batch of scalars.
  #   dense = tf.squeeze(dense, axis=1)
  #   outputs[key] = tft.scale_to_0_1(dense)
  for key in enabled_vocabulary_features:
    tft.vocabulary(inputs[key], vocab_filename=key)
  return outputs
def _preprocessing_fn_with_table(inputs):
  x = inputs['x']
  x_vocab = tft.vocabulary(x, name='x')
  initializer = tf.lookup.TextFileInitializer(
      x_vocab,
      key_dtype=tf.string,
      key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
      value_dtype=tf.int64,
      value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
  table = tf.lookup.StaticHashTable(initializer, default_value=-1)
  x_integerized = table.lookup(x)
  return {'x_integerized': x_integerized}
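The manual TextFileInitializer/StaticHashTable pair above performs essentially the same deferred-file lookup that tft.apply_vocabulary provides in one call; a minimal sketch of that shorthand (the function name is hypothetical):

import tensorflow_transform as tft

def _preprocessing_fn_with_apply_vocabulary(inputs):
  x = inputs['x']
  x_vocab = tft.vocabulary(x, name='x')
  # default_value=-1 matches the StaticHashTable default above.
  return {
      'x_integerized': tft.apply_vocabulary(
          x, deferred_vocab_filename_tensor=x_vocab, default_value=-1)
  }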
def preprocessing_fn(inputs):
  _ = tft.vocabulary(
      inputs['s'],
      labels=inputs['label'],
      store_frequency=True,
      vocab_filename=mi_vocab_name,
      min_diff_from_avg=0.1,
      use_adjusted_mutual_info=False)

  _ = tft.vocabulary(
      inputs['s'],
      labels=inputs['label'],
      store_frequency=True,
      vocab_filename=adjusted_mi_vocab_name,
      min_diff_from_avg=1.0,
      use_adjusted_mutual_info=True)

  _ = tft.vocabulary(
      inputs['s'],
      weights=inputs['weight'],
      store_frequency=True,
      vocab_filename=weighted_frequency_vocab_name,
      use_adjusted_mutual_info=False)
  return inputs
def preprocessing_fn(inputs):
  _ = tft.vocabulary(inputs['s'])
  _ = tft.bucketize(inputs['x'], 2, name='bucketize')
  return {
      'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
      'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
      'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
  }
def _preprocess_tft(raw_data, user_freq, item_freq):
  """Creates vocabularies for users and items and maps their ids to ints.

  Args:
    raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
    user_freq: minimum frequency of a user to include it in the user vocab.
    item_freq: minimum frequency of an item to include it in the item vocab.

  Returns:
    A dict containing int ids corresponding to a user_id and item_id and
      other features: {$user_key: $user_id, $item_key: $item_id, ...}.
  """
  features = {
      feature: raw_data[feature] for feature in constants.BQ_FEATURES
  }
  item_vocab = tft.vocabulary(
      raw_data[constants.ITEM_KEY],
      vocab_filename=constants.ITEM_VOCAB_NAME,
      frequency_threshold=item_freq)
  tft_features = {
      constants.TFT_USER_KEY:
          tft.compute_and_apply_vocabulary(
              raw_data[constants.USER_KEY],
              vocab_filename=constants.USER_VOCAB_NAME,
              frequency_threshold=user_freq,
              default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ITEM_KEY:
          tft.apply_vocabulary(
              raw_data[constants.ITEM_KEY],
              item_vocab,
              default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ARTIST_KEY:
          tft.compute_and_apply_vocabulary(
              raw_data[constants.ARTIST_KEY],
              vocab_filename=constants.ARTIST_VOCAB_NAME,
              default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_TAGS_KEY:
          tft.compute_and_apply_vocabulary(
              raw_data[constants.TAGS_KEY],
              vocab_filename=constants.TAG_VOCAB_NAME,
              default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_TOP_10_KEY:
          tft.apply_vocabulary(
              raw_data[constants.TOP_10_KEY],
              item_vocab,
              default_value=constants.TFT_DEFAULT_ID),
  }
  features.update(tft_features)
  return features
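The two-step tft.vocabulary + tft.apply_vocabulary pattern above exists so that ITEM_KEY and TOP_10_KEY can share a single item vocabulary; for a feature that needs its own vocabulary the two steps collapse into the compute_and_apply_vocabulary shorthand used for the other keys. A small sketch of that equivalence, with placeholder filenames and a hypothetical helper name:

import tensorflow_transform as tft

def _integerize_two_ways(x):
  # Two-step form: analyze once, then apply; the vocab tensor can be
  # reused for other features.
  vocab = tft.vocabulary(x, vocab_filename='two_step_vocab')
  two_step = tft.apply_vocabulary(x, vocab, default_value=-1)
  # One-step shorthand, equivalent when the vocab is used only here.
  one_step = tft.compute_and_apply_vocabulary(
      x, vocab_filename='one_step_vocab', default_value=-1)
  return two_step, one_step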
def preprocessing_fn(inputs):
  logging.info("Running preprocessing function")
  config = ClassificationConfig.from_env()
  outputs = dict()

  read_image_blob = lambda x: read_tensor_from_image_file(
      x, input_height=config.image_height, input_width=config.image_width)

  # Image tensor.
  outputs[config.image_key] = tf.compat.v2.map_fn(
      read_image_blob, inputs[config.raw_image_key].values, dtype=tf.float32)

  # Label tensor: we copy the input to the output and create a vocabulary
  # to be used later on.
  _ = tft.vocabulary(inputs['label'], vocab_filename="label_encoder")
  outputs[config.label_key] = inputs[config.raw_label_key]
  return outputs
def _preprocessing_fn_with_packable_analyzer_single_phase(inputs):
  x, y = inputs['x'], inputs['y']
  x_mean = tft.mean(x, name='x')
  x_centered = x - x_mean
  y_mean = tft.mean(y, name='y')
  y_centered = y - y_mean

  z = inputs['z']
  z_vocab = tft.vocabulary(z, name='z')
  initializer = tf.lookup.TextFileInitializer(
      z_vocab,
      key_dtype=tf.string,
      key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
      value_dtype=tf.int64,
      value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
  table = tf.lookup.StaticHashTable(initializer, default_value=-1)
  z_integerized = table.lookup(z)
  return {'x_centered': x_centered,
          'y_centered': y_centered,
          'z_integerized': z_integerized}
def _default_preprocessing_fn(inputs, input_features):
  outputs = {}
  for key in input_features["numerical_default_encoding"]:
    outputs[key] = tf.cast(tft.bucketize(inputs[key], 20),
                           tf.float32) / 20.0 - 0.5
  for key in input_features["categorical_default_encoding"]:
    vocab = tft.vocabulary(inputs[key], vocab_filename=key,
                           frequency_threshold=100)
    outputs[key] = tft.apply_vocabulary(inputs[key], vocab, default_value=0)
  if "label" in input_features:
    outputs["label"] = inputs[input_features["label"]]
  return outputs
def preprocessing_fn(inputs):
  _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')
  _ = tft.bucketize(inputs['x'], 2, name='bucketize')
  return {
      'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
      'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
      'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      's_integerized': tft.compute_and_apply_vocabulary(
          inputs['s'],
          labels=inputs['label'],
          use_adjusted_mutual_info=True),
  }
def preprocessing_fn(inputs): """ Preprocess data inputs. This a callback function for tfx.components.Transform Parameters ---------- inputs : dict, tensorflow_transform data Data beging fed into tfx.components.Transform Map from feature keys to raw not-yet-transformed features. Returns ------- output: dict Map from string feature key to transformed feature operations. """ # String to integer indexing content = inputs["InSeasonSeries_Id"] token = inputs["token"] token_count = inputs["token_count"] vocab_uri = tft.vocabulary( tf.concat([content, token], axis=0), vocab_filename="node_vocab.txt", name="node_vocab", ) # Logging logging.info(f"graph vocabulary uri: {vocab_uri}") # output as a dict output = {} output["InSeasonSeries_Id"] = tft.apply_vocabulary( content, deferred_vocab_filename_tensor=vocab_uri, default_value=-1) output["token"] = tft.apply_vocabulary( token, deferred_vocab_filename_tensor=vocab_uri, default_value=-1) output["weight"] = tf.constant([1.0], dtype="float32") / tf.cast( token_count, "float32") return output
def preprocessing_fn(inputs, input_features):
  """Preprocess input columns into transformed columns."""
  outputs = _default_preprocessing_fn(inputs, input_features)
  outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0

  inputs["game_zone"] = tf.string_join(
      [inputs["sourceGameId"], inputs["zone"]], separator="_")
  inputs["game_campaignId"] = tf.string_join(
      [inputs["sourceGameId"], inputs["campaignId"]], separator="_")

  for key in ["game_zone", "game_campaignId"]:
    vocab = tft.vocabulary(inputs[key], vocab_filename=key,
                           frequency_threshold=100)
    outputs[key] = tft.apply_vocabulary(inputs[key], vocab, default_value=0)

  outputs["key"] = inputs["key"]
  return outputs
def preprocessing_fn_test(inputs):
  """Preprocess input columns into transformed columns."""
  # Note: the original version spelled out every column by hand, and the
  # copy-paste left Distractor_1 through Distractor_8 all mapped from
  # distractor_0_tokens; the loop below fixes that.
  distractor_keys = ['Distractor_%d' % i for i in range(9)]
  feature_keys = ['Context', 'Ground Truth Utterance'] + distractor_keys

  # Build a shared vocabulary over all of the text columns.
  vocab = tf.concat([inputs[key] for key in feature_keys], 0)
  vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)
  vocab_mapping_file_path = tft.vocabulary(
      vocab_tokens, vocab_filename='anantvir_test_vocab')

  # Tokenize each column and map its tokens through the shared vocabulary.
  outputs = {}
  for key in feature_keys:
    tokens = tf.compat.v1.string_split(inputs[key], DELIMITERS)
    outputs[key] = tft.apply_vocabulary(
        tokens, deferred_vocab_filename_tensor=vocab_mapping_file_path)
  return outputs
def _preprocessing_fn_with_table(inputs):
  x = inputs['x']
  x_vocab = tft.vocabulary(x, name='x')
  table = tf.contrib.lookup.index_table_from_file(x_vocab)
  x_integerized = table.lookup(x)
  return {'x_integerized': x_integerized}