def preprocessing_fn(input_features):
  # Get the text of clean_title.
  text = input_features['clean_title']
  # Extract embeddings using tf.hub.
  embeddings = tft.apply_function(get_embeddings, text)
  # Tokenize text.
  text_tokens = tf.string_split(text, parameters.DELIMITERS)
  # Bag of words (BOW) indices.
  text_tokens_indices = tft.string_to_int(text_tokens, top_k=parameters.VOCAB_SIZE)
  # TF-IDF. Add one to the vocab size for the OOV bucket created by string_to_int.
  bag_of_words_indices, tf_idf = tft.tfidf(text_tokens_indices, parameters.VOCAB_SIZE + 1)

  output_features = dict()
  output_features['topic'] = input_features['topic']
  output_features['title'] = input_features['raw_title']
  output_features['bow'] = bag_of_words_indices
  output_features['tf_idf'] = tf_idf
  output_features['embeddings'] = embeddings
  return output_features
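# The get_embeddings helper referenced above is not shown in the snippet. A
# minimal sketch of what it might look like, assuming a TF1-era TF-Hub
# text-embedding module; the module URL and output dimensionality are
# assumptions for illustration, not taken from the original.
import tensorflow_hub as hub

def get_embeddings(text):
  # Hypothetical helper: embed each title with a pre-trained TF-Hub module.
  # Any TF1 text-embedding module would work here.
  embed = hub.Module('https://tfhub.dev/google/nnlm-en-dim128/1')
  # Returns a [batch_size, 128] float tensor of sentence embeddings.
  return embed(text)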
def preprocessing_fn(inputs):
  # Names of the text columns to vectorize; fill in for your data.
  text_fields = []
  # Keep the original data and add more to it.
  result = inputs.copy()
  # Figure out the vocabulary for our text fields.
  for field_name in text_fields:
    field = inputs[field_name]
    tokens = tf.strings.split(field, " ")
    # Unigrams and bigrams, joined back into strings.
    bag_of_words = tft.bag_of_words(tokens, (1, 2), separator=" ")
    indices = tft.compute_and_apply_vocabulary(bag_of_words, top_k=VOCAB_SIZE)
    # Add one for the OOV bucket created by compute_and_apply_vocabulary.
    bow_indices, weights = tft.tfidf(indices, VOCAB_SIZE + 1)
    result[f"{field_name}_bow_indices"] = bow_indices
    result[f"{field_name}_weight"] = weights
  return result
def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" review = inputs[REVIEW_COLUMN] review_tokens = tf.string_split(review, DELIMITERS) review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE) # Add one for the oov bucket created by string_to_int. review_bow_indices, review_weight = tft.tfidf(review_indices, VOCAB_SIZE + 1) return { REVIEW_COLUMN: review_bow_indices, REVIEW_WEIGHT: review_weight, LABEL_COLUMN: inputs[LABEL_COLUMN] }
def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" review = inputs[REVIEW_KEY] review_tokens = tf.string_split(review, DELIMITERS) review_indices = tft.compute_and_apply_vocabulary( review_tokens, top_k=VOCAB_SIZE) # Add one for the oov bucket created by compute_and_apply_vocabulary. review_bow_indices, review_weight = tft.tfidf(review_indices, VOCAB_SIZE + 1) return { REVIEW_KEY: review_bow_indices, REVIEW_WEIGHT_KEY: review_weight, LABEL_KEY: inputs[LABEL_KEY] }
def preprocessing_fn(inputs): """TFT preprocessing function. Args: inputs: dictionary of input `tensorflow_transform.Column`. Returns: A dictionary of `tensorflow_transform.Column` representing the transformed columns. """ features_dict = {} for col_schema in schema: col_name = col_schema['name'] if col_schema['type'] == 'NUMBER': features_dict[col_name] = inputs[col_name] elif col_schema['type'] == 'CATEGORY': features_dict[col_name] = tft.string_to_int( inputs[col_name], vocab_filename='vocab_' + col_name) elif col_schema['type'] == 'TEXT': tokens = tf.string_split(inputs[col_name], DELIMITERS) # TODO: default_value = 0 is wrong. It means OOV gets 0 for their index. # But this is to workaround the issue that trainer can use the true vocab # size. Otherwise trainer has to use VOCAB_SIZE defined in this file which # is too large. I am talking to TFT folks on this. If there is no workaround, # user has to provide a vocab_size. indices = tft.string_to_int(tokens, vocab_filename='vocab_' + col_name, default_value=0) # Add one for the oov bucket created by string_to_int. bow_indices, bow_weights = tft.tfidf(indices, VOCAB_SIZE + 1) features_dict[col_name + '_indices'] = bow_indices features_dict[col_name + '_weights'] = bow_weights elif col_schema['type'] == 'IMAGE_URL': features_dict[col_name] = tft.apply_function_with_checkpoint( _image_to_vec, [inputs[col_name]], INCEPTION_V3_CHECKPOINT, exclude=INCEPTION_EXCLUDED_VARIABLES) elif col_schema['type'] == 'KEY': features_dict[col_name] = inputs[col_name] else: raise ValueError('Invalid schema. Unknown type ' + col_schema['type']) return features_dict