import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn_train(inputs):
    """Preprocess input columns into transformed columns."""
    context = inputs['Context']
    utterance = inputs['Utterance']
    vocab = tf.concat([context, utterance], 0)
    context_tokens = tf.compat.v1.string_split(context, DELIMITERS)
    utterance_tokens = tf.compat.v1.string_split(utterance, DELIMITERS)
    vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)
    # Compute one vocabulary over both columns so context and utterance
    # tokens share a single id space.
    vocab_mapping_file_path = tft.vocabulary(
        vocab_tokens, vocab_filename='anantvir_train_vocab')
    mapped_context = tft.apply_vocabulary(
        context_tokens,
        deferred_vocab_filename_tensor=vocab_mapping_file_path)
    mapped_utterance = tft.apply_vocabulary(
        utterance_tokens,
        deferred_vocab_filename_tensor=vocab_mapping_file_path)
    return {
        'Context': mapped_context,
        'Utterance': mapped_utterance,
    }
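# A minimal sketch of how a preprocessing_fn like the one above is run with
# the tf.Transform Beam implementation, following the standard tf.Transform
# getting-started pattern. The feature spec and raw_data values here are
# illustrative assumptions, not part of the original snippet.
import tempfile

import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

raw_data = [{'Context': 'how are you', 'Utterance': 'fine thanks'}]
raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'Context': tf.io.FixedLenFeature([], tf.string),
        'Utterance': tf.io.FixedLenFeature([], tf.string),
    }))

with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (
        (raw_data, raw_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn_train))
transformed_data, transformed_metadata = transformed_dataset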
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}
    # Bucketize each numeric feature into 20 quantile buckets and rescale
    # the bucket index to [-0.5, 0.45].
    for key in numerical_feats:
        outputs[key] = tf.cast(tft.bucketize(inputs[key], 20),
                               tf.float32) / 20.0 - 0.5
    outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0
    # Crossed string features, built before the vocabulary loop below so
    # they are encoded like the other categorical features.
    inputs["game_zone"] = tf.strings.join(
        [inputs["sourceGameId"], inputs["zone"]], separator="_")
    inputs["game_campaignId"] = tf.strings.join(
        [inputs["sourceGameId"], inputs["campaignId"]], separator="_")
    for key in categorical_feats + ["game_zone", "game_campaignId"]:
        vocab = tft.vocabulary(
            inputs[key], vocab_filename=key, frequency_threshold=100)
        outputs[key] = tft.apply_vocabulary(
            inputs[key], vocab, default_value=0)
    outputs["label"] = inputs["label"]
    outputs["key"] = inputs["key"]
    return outputs
def preprocessing_fn(inputs):
    """Callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = inputs.copy()

    # Compute a vocabulary based on the TOP-K current pages and labels seen
    # in the dataset.
    vocab = tft.vocabulary(
        tf.concat([inputs[_CUR_PAGE_FEATURE_KEY], inputs[_LABEL_KEY]],
                  axis=0),
        top_k=_TOP_K,
        vocab_filename=_VOCAB_FILENAME)

    # Apply the vocabulary to both the current page feature and the label,
    # converting the strings into integers.
    for k in [_CUR_PAGE_FEATURE_KEY, _LABEL_KEY]:
        # Out-of-vocab strings will be assigned the _TOP_K value.
        outputs[k] = tft.apply_vocabulary(
            inputs[k], vocab, default_value=_TOP_K)
    return outputs
def preprocessing_fn(
        inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
    """tf.transform's callback function for preprocessing inputs.

    Parameters
    ----------
    inputs: map from feature keys to raw not-yet-transformed features.

    Returns
    -------
    Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in categorical_feature_keys + [label_key]:
        outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])
        vocab_file_tensor = tft.vocabulary(
            outputs[_transformed_name(key)], vocab_filename=key)
        outputs[_transformed_name(key)] = tft.apply_vocabulary(
            outputs[_transformed_name(key)], vocab_file_tensor)

    # NOTE: This won't be correct in the incremental case since it's only
    # using the new examples to get the mean and variance.
    for key in numerical_feature_keys:
        outputs[_transformed_name(key)] = tf.expand_dims(
            tft.scale_to_z_score(_fill_in_missing(inputs[key])), axis=1)
    return outputs
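# The function above and the one below rely on helpers that are not shown
# in these snippets. The sketches below follow the conventional pattern in
# the TFX examples; the exact bodies in the source repo may differ.
def _transformed_name(key):
    # Conventional suffix for post-transform feature names.
    return key + '_xf'


def _fill_in_missing(x):
    """Replaces missing values in a SparseTensor and densifies it.

    Assumes x is a SparseTensor of shape [batch_size, 1] coming from a
    VarLenFeature; returns a dense tensor with missing entries filled with
    '' or 0 depending on dtype.
    """
    if not isinstance(x, tf.sparse.SparseTensor):
        return x
    default_value = '' if x.dtype == tf.string else 0
    return tf.squeeze(
        tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value),
        axis=1)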
def preprocessing_fn(
        inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
    """tf.transform's callback function for preprocessing inputs.

    Parameters
    ----------
    inputs: map from feature keys to raw not-yet-transformed features.

    Returns
    -------
    Map from string feature key to transformed feature operations.
    """
    outputs = {}
    outputs[_transformed_name(label_key)] = _fill_in_missing(
        inputs[label_key])
    vocab_file_tensor = tft.vocabulary(
        outputs[_transformed_name(label_key)], vocab_filename=label_key)
    outputs[_transformed_name(label_key)] = tft.apply_vocabulary(
        outputs[_transformed_name(label_key)], vocab_file_tensor)

    # Stack the per-pixel columns ('1' .. str(num_pixels)) into one tensor.
    outputs[_transformed_name(pixel_key)] = tf.concat(
        [_fill_in_missing(inputs[str(i + 1)]) for i in range(num_pixels)],
        axis=1)

    # NOTE: This won't be correct in the incremental case since it's only
    # using the new examples to get the min and max.
    outputs[_transformed_name(pixel_key)] = tft.scale_to_0_1(
        outputs[_transformed_name(pixel_key)])
    return outputs
def _preprocess_tft(raw_data, user_freq, item_freq):
    """Creates vocabularies for users and items and maps their ids to ints.

    Args:
      raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
      user_freq: minimum frequency of a user to include it in the user vocab.
      item_freq: minimum frequency of an item to include it in the item vocab.

    Returns:
      A dict containing int ids corresponding to a user_id and item_id and
      other features: {$user_key: $user_id, $item_key: $item_id, ...}.
    """
    features = {
        feature: raw_data[feature] for feature in constants.BQ_FEATURES
    }
    # The item vocabulary is computed once and applied to two features
    # (items and top-10 lists), so the two-step vocabulary/apply_vocabulary
    # form is used for it.
    item_vocab = tft.vocabulary(
        raw_data[constants.ITEM_KEY],
        vocab_filename=constants.ITEM_VOCAB_NAME,
        frequency_threshold=item_freq)
    tft_features = {
        constants.TFT_USER_KEY:
            tft.compute_and_apply_vocabulary(
                raw_data[constants.USER_KEY],
                vocab_filename=constants.USER_VOCAB_NAME,
                frequency_threshold=user_freq,
                default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_ITEM_KEY:
            tft.apply_vocabulary(
                raw_data[constants.ITEM_KEY],
                item_vocab,
                default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_ARTIST_KEY:
            tft.compute_and_apply_vocabulary(
                raw_data[constants.ARTIST_KEY],
                vocab_filename=constants.ARTIST_VOCAB_NAME,
                default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_TAGS_KEY:
            tft.compute_and_apply_vocabulary(
                raw_data[constants.TAGS_KEY],
                vocab_filename=constants.TAG_VOCAB_NAME,
                default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_TOP_10_KEY:
            tft.apply_vocabulary(
                raw_data[constants.TOP_10_KEY],
                item_vocab,
                default_value=constants.TFT_DEFAULT_ID),
    }
    features.update(tft_features)
    return features
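# For reference, tft.compute_and_apply_vocabulary is shorthand for the
# two-step vocabulary/apply_vocabulary pattern used for the item features
# above. A minimal sketch with a hypothetical 'color' feature:
def _one_step_vs_two_step(inputs):
    # One-step helper: analyze and map in a single call.
    one_step = tft.compute_and_apply_vocabulary(
        inputs['color'], vocab_filename='color_vocab')
    # Equivalent two-step form; preferable when, as above, one vocabulary
    # is shared by several features.
    vocab = tft.vocabulary(inputs['color'], vocab_filename='color_vocab_2')
    two_step = tft.apply_vocabulary(inputs['color'], vocab)
    return {'one_step': one_step, 'two_step': two_step}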
def preprocessing_fn(inputs):
    """Preprocess data inputs.

    This is a callback function for tfx.components.Transform.

    Parameters
    ----------
    inputs : dict, tensorflow_transform data
        Data being fed into tfx.components.Transform.
        Map from feature keys to raw not-yet-transformed features.

    Returns
    -------
    output: dict
        Map from string feature key to transformed feature operations.
    """
    # String to integer indexing
    content = inputs["InSeasonSeries_Id"]
    token = inputs["token"]
    token_count = inputs["token_count"]
    # One shared vocabulary over both node columns of the graph.
    vocab_uri = tft.vocabulary(
        tf.concat([content, token], axis=0),
        vocab_filename="node_vocab.txt",
        name="node_vocab",
    )
    logging.info(f"graph vocabulary uri: {vocab_uri}")

    output = {}
    output["InSeasonSeries_Id"] = tft.apply_vocabulary(
        content, deferred_vocab_filename_tensor=vocab_uri, default_value=-1)
    output["token"] = tft.apply_vocabulary(
        token, deferred_vocab_filename_tensor=vocab_uri, default_value=-1)
    # Edge weight: inverse of the token count.
    output["weight"] = tf.constant([1.0], dtype="float32") / tf.cast(
        token_count, "float32")
    return output
def _default_preprocessing_fn(inputs, input_features):
    outputs = {}
    # Numeric features: 20 quantile buckets rescaled to [-0.5, 0.45].
    for key in input_features["numerical_default_encoding"]:
        outputs[key] = tf.cast(tft.bucketize(inputs[key], 20),
                               tf.float32) / 20.0 - 0.5
    # Categorical features: vocabulary lookup with a frequency threshold;
    # rare and unseen values map to the default id 0.
    for key in input_features["categorical_default_encoding"]:
        vocab = tft.vocabulary(
            inputs[key], vocab_filename=key, frequency_threshold=100)
        outputs[key] = tft.apply_vocabulary(
            inputs[key], vocab, default_value=0)
    if "label" in input_features:
        outputs["label"] = inputs[input_features["label"]]
    return outputs
def preprocessing_fn(inputs, input_features):
    """Preprocess input columns into transformed columns."""
    outputs = _default_preprocessing_fn(inputs, input_features)
    outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0
    # Crossed string features, encoded with their own vocabularies; an
    # example input_features config is sketched after this function.
    inputs["game_zone"] = tf.strings.join(
        [inputs["sourceGameId"], inputs["zone"]], separator="_")
    inputs["game_campaignId"] = tf.strings.join(
        [inputs["sourceGameId"], inputs["campaignId"]], separator="_")
    for key in ["game_zone", "game_campaignId"]:
        vocab = tft.vocabulary(
            inputs[key], vocab_filename=key, frequency_threshold=100)
        outputs[key] = tft.apply_vocabulary(
            inputs[key], vocab, default_value=0)
    outputs["key"] = inputs["key"]
    return outputs
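# Hypothetical sketch of the input_features config the two functions above
# expect. The dict keys are the ones the code reads; the feature names are
# illustrative assumptions.
input_features = {
    "numerical_default_encoding": ["campaignCost"],
    "categorical_default_encoding": ["sourceGameId", "zone", "campaignId"],
    "label": "label",
}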
def apply_vocab_fn(inputs):
    """Preprocessing fn for sparse features.

    Applies vocab to bucketize sparse features. This function operates using
    previously-created vocab files.

    Pre-condition: Full vocab has been materialized.

    Args:
      inputs: Input features to transform.

    Returns:
      Output dict with transformed features.
    """
    outputs = {}

    outputs[LABEL_KEY] = inputs[LABEL_KEY]
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = inputs[key]
    for idx, key in enumerate(CATEGORICAL_FEATURE_KEYS):
        # Look up each categorical feature in the vocab file written by an
        # earlier analysis pass.
        vocab_fn = os.path.join(args.temp_dir, "tftransform_tmp",
                                "feature_{}_vocab".format(idx))
        outputs[key] = tft.apply_vocabulary(inputs[key], vocab_fn)
    return outputs
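# A sketch of the first-pass function that satisfies apply_vocab_fn's
# pre-condition by materializing one vocab file per categorical feature.
# It mirrors the usual two-pass pattern; treat the exact filenames and the
# args.temp_dir layout as assumptions.
def compute_vocab_fn(inputs):
    outputs = {}
    outputs[LABEL_KEY] = inputs[LABEL_KEY]
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = inputs[key]
    for idx, key in enumerate(CATEGORICAL_FEATURE_KEYS):
        # Writes feature_{idx}_vocab under the transform temp dir, which
        # apply_vocab_fn later reads back.
        outputs[key] = tft.compute_and_apply_vocabulary(
            inputs[key], vocab_filename='feature_{}_vocab'.format(idx))
    return outputs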
def preprocessing_fn_test(inputs):
    """Preprocess input columns into transformed columns."""
    distractor_keys = ['Distractor_{}'.format(i) for i in range(9)]
    text_keys = ['Context', 'Ground Truth Utterance'] + distractor_keys

    # One shared vocabulary over every text column.
    vocab = tf.concat([inputs[key] for key in text_keys], 0)
    vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)
    vocab_mapping_file_path = tft.vocabulary(
        vocab_tokens, vocab_filename='anantvir_test_vocab')

    # Tokenize each column and map its own tokens through the shared
    # vocabulary.
    outputs = {}
    for key in text_keys:
        tokens = tf.compat.v1.string_split(inputs[key], DELIMITERS)
        outputs[key] = tft.apply_vocabulary(
            tokens, deferred_vocab_filename_tensor=vocab_mapping_file_path)
    return outputs