import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  # `input_dtype`, `expected_boundaries`, `should_apply`, `is_manual_boundaries`
  # and `epsilon` are bound in the enclosing scope (see the sketch below).
  x = tf.cast(inputs['x'], input_dtype)
  num_buckets = len(expected_boundaries) + 1
  if should_apply:
    if is_manual_boundaries:
      # One boundary list per column of `x`.
      bucket_boundaries = [
          expected_boundaries, [2 * b for b in expected_boundaries]
      ]
    else:
      # Compute per-column quantile boundaries, one row per column.
      bucket_boundaries = tft.quantiles(
          x, num_buckets, epsilon, reduce_instance_dims=False)
      bucket_boundaries = tf.unstack(bucket_boundaries, axis=0)
    # Bucketize each column separately against its own boundaries.
    result = []
    for i, boundaries in enumerate(bucket_boundaries):
      boundaries = tf.cast(boundaries, tf.float32)
      result.append(
          tft.apply_buckets(x[:, i], tf.expand_dims(boundaries, axis=0)))
    result = tf.stack(result, axis=1)
  else:
    result = tft.bucketize(
        x, num_buckets=num_buckets, epsilon=epsilon, elementwise=True)
  return {'q_b': result}
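# The free names used above come from an enclosing (test-style) scope. The
# bindings below are hypothetical, shown only so the snippet can be exercised
# on its own; the original code may use different values.
input_dtype = tf.float32
epsilon = 0.01
should_apply = True
is_manual_boundaries = True
expected_boundaries = [1.0, 2.0, 3.0]  # assumed example boundaries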
def preprocess(input_features):
  output_features = {}

  # Pass the target through unchanged.
  output_features[metadata.TARGET_FEATURE_NAME] = input_features[
      metadata.TARGET_FEATURE_NAME]

  for feature_name in metadata.NUMERIC_FEATURE_NAMES:
    # output_features[feature_name + "_scaled"] = tft.scale_to_z_score(
    #     input_features[feature_name])
    output_features[feature_name] = tft.scale_to_z_score(
        input_features[feature_name])
    # Bucketize into NUM_BUCKETS quantile buckets.
    quantiles = tft.quantiles(
        input_features[feature_name], num_buckets=NUM_BUCKETS, epsilon=0.01)
    output_features[feature_name + "_bucketized"] = tft.apply_buckets(
        input_features[feature_name], bucket_boundaries=quantiles)

  for feature_name in metadata.CATEGORICAL_FEATURE_NAMES:
    # Emit a vocabulary file, but keep the raw strings as the feature value.
    tft.uniques(input_features[feature_name], vocab_filename=feature_name)
    output_features[feature_name] = input_features[feature_name]  # sba added this
    # output_features[feature_name + "_integerized"] = tft.string_to_int(
    #     input_features[feature_name], vocab_filename=feature_name)

  for feature_name in metadata.VOCAB_FEATURE_NAMES:
    output_features[feature_name + "_integerized"] = tft.string_to_int(
        input_features[feature_name],
        top_k=metadata.VOCAB_SIZE,
        num_oov_buckets=metadata.OOV_SIZE,
        vocab_filename=feature_name)

  return output_features
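# `tft.uniques` and `tft.string_to_int` are legacy tf.Transform names; current
# releases expose the same analyzers as `tft.vocabulary` and
# `tft.compute_and_apply_vocabulary`. A minimal sketch of the last loop above
# rewritten with the current names (the `metadata.*` constants are the same
# assumptions as above; this is an illustrative helper, not the original code).
def preprocess_vocab_features_current_api(input_features, output_features):
  for feature_name in metadata.VOCAB_FEATURE_NAMES:
    output_features[feature_name + "_integerized"] = (
        tft.compute_and_apply_vocabulary(
            input_features[feature_name],
            top_k=metadata.VOCAB_SIZE,
            num_oov_buckets=metadata.OOV_SIZE,
            vocab_filename=feature_name))
  return output_features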
def preprocessing_fn(input_features):
  processed_features = {}

  for feature in raw_schema.feature:
    # Pass the target and weight features through as-is.
    if feature.name in [TARGET_FEATURE_NAME, WEIGHT_FEATURE_NAME]:
      processed_features[feature.name] = _prep(input_features[feature.name])
      continue

    if feature.type == 1:  # schema_pb2.FeatureType.BYTES
      # Extract a vocabulary and integerize categorical features.
      processed_features[feature.name + "_integerized"] = _prep(
          tft.compute_and_apply_vocabulary(
              input_features[feature.name], vocab_filename=feature.name))
    else:
      # Normalize numeric features.
      processed_features[feature.name + "_scaled"] = _prep(
          tft.scale_to_z_score(input_features[feature.name]))

  # Bucketize age using quantiles.
  quantiles = tft.quantiles(input_features["age"], num_buckets=5, epsilon=0.01)
  processed_features["age_bucketized"] = _prep(
      tft.apply_buckets(input_features["age"], bucket_boundaries=quantiles))

  return processed_features
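# A preprocessing_fn like the ones above is executed by tf.Transform's Beam
# implementation. Below is a minimal sketch modeled on tf.Transform's
# simple_example; `_example_preprocessing_fn`, `run_example`, the feature spec
# and the raw data are illustrative assumptions, not part of the snippets above.
import tempfile

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils


def _example_preprocessing_fn(inputs):
  # Illustrative only: quantile-bucketize 'age' into 5 buckets.
  quantiles = tft.quantiles(inputs['age'], num_buckets=5, epsilon=0.01)
  return {'age_bucketized': tft.apply_buckets(inputs['age'], quantiles)}


def run_example():
  raw_data = [{'age': 22.0}, {'age': 35.0}, {'age': 58.0}, {'age': 71.0}]
  raw_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(
          {'age': tf.io.FixedLenFeature([], tf.float32)}))
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Analyze computes the quantile boundaries; Transform applies them.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | tft_beam.AnalyzeAndTransformDataset(_example_preprocessing_fn))
  return transformed_data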
def preprocessing_fn(inputs):
  # Same free names as in the first snippet: `input_dtype`,
  # `expected_boundaries`, `should_apply`, `is_manual_boundaries`, `epsilon`.
  x = tf.cast(inputs['x'], input_dtype)
  num_buckets = len(expected_boundaries) + 1
  if should_apply:
    if is_manual_boundaries:
      bucket_boundaries = [expected_boundaries]
    else:
      bucket_boundaries = tft.quantiles(inputs['x'], num_buckets, epsilon)
    result = tft.apply_buckets(x, bucket_boundaries)
  else:
    result = tft.bucketize(x, num_buckets=num_buckets, epsilon=epsilon)
  return {'q_b': result}
def preprocessing_fn(inputs):
  """Preprocesses the Titanic dataset."""
  outputs = {}

  # Scale numerical features, imputing missing values with the mean.
  for key in features.NUMERIC_FEATURE_KEYS:
    mean_value = compute_mean_ignore_nan(inputs[key].values)
    absl.logging.info(
        f'TFT preprocessing. Mean value for {key} = {mean_value}')
    outputs[features.transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing_with_impute(inputs[key], mean_value))

  # Build a vocabulary for each vocab feature and map it to integer ids.
  for key in features.VOCAB_FEATURE_KEYS:
    outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=features.VOCAB_SIZE_MAP.get(key, features.VOCAB_SIZE),
        num_oov_buckets=features.OOV_SIZE)

  # Bucketize features, using manual boundaries where provided and quantile
  # boundaries otherwise.
  for key in features.BUCKET_FEATURE_KEYS:
    if key in features.FEATURE_BUCKET_BOUNDARIES:
      bucket_boundaries = tf.constant(
          features.FEATURE_BUCKET_BOUNDARIES.get(key))
      # tf.print("bucket_boundaries:", bucket_boundaries,
      #          output_stream=absl.logging.info)
      outputs[features.transformed_name(key)] = tft.apply_buckets(
          _fill_in_missing(inputs[key]), bucket_boundaries)
    else:
      outputs[features.transformed_name(key)] = tft.bucketize(
          _fill_in_missing(inputs[key]),
          features.FEATURE_BUCKET_COUNT_MAP.get(
              key, features.FEATURE_BUCKET_COUNT))

  # Generate vocabularies for and integerize the categorical features.
  for key in features.CATEGORICAL_FEATURE_KEYS:
    outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        x=_fill_in_missing(inputs[key]),
        num_oov_buckets=1,
        vocab_filename=key)

  # Convert the label to a dense tensor.
  outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing(
      inputs[features.LABEL_KEY])

  return outputs
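# The snippet above relies on helpers that are not shown. Below is a minimal
# sketch of what `_fill_in_missing` typically looks like in the TFX examples
# this code resembles, plus an analogous `_fill_in_missing_with_impute` that
# takes a caller-supplied fill value. Both are assumptions about the missing
# helpers, not the original definitions.
def _fill_in_missing(x):
  """Densifies a 1-column SparseTensor, filling missing values with '' or 0."""
  if not isinstance(x, tf.sparse.SparseTensor):
    return x
  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)


def _fill_in_missing_with_impute(x, impute_value):
  """Like _fill_in_missing, but fills missing entries with `impute_value`."""
  if not isinstance(x, tf.sparse.SparseTensor):
    return x
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value=impute_value),
      axis=1)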