def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

  return outputs
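
# NOTE: the snippets below repeatedly call _transformed_name and
# _fill_in_missing without defining them. The following is a minimal sketch
# consistent with their usage in the canonical TFX examples (an assumption,
# not code from this page); later snippets likewise assume
# `import tensorflow as tf` and `import tensorflow_transform as tft`.
import tensorflow as tf
import tensorflow_transform as tft


def _transformed_name(key):
  # Suffix transformed features so they don't collide with raw feature names.
  return key + '_xf'


def _fill_in_missing(x, default_value=None):
  # Densify a rank-2 SparseTensor of shape [batch, 1], filling missing entries
  # with '' or 0 (or an explicit default), and squeeze it to rank 1. The
  # optional default covers the two-argument calls seen further down.
  if default_value is None:
    default_value = '' if x.dtype == tf.string else 0
  dense = tf.sparse.to_dense(
      tf.sparse.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
      default_value)
  return tf.squeeze(dense, axis=1)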
def _tokenize_review(review):
  """Tokenize the reviews by spliting the reviews.

  Constructing a vocabulary. Map the words to their frequency index in the
  vocabulary.

  Args:
    review: tensors containing the reviews. (batch_size/None, 1)

  Returns:
    Tokenized and padded review tensors. (batch_size/None, _MAX_LEN)
  """
  review_sparse = tf.strings.split(tf.reshape(review, [-1])).to_sparse()
  # tft.apply_vocabulary doesn't reserve 0 for oov words. In order to comply
  # with convention and use mask_zero in keras.embedding layer, set oov value
  # to _VOCAB_SIZE and padding value to -1. Then add 1 to all the tokens.
  review_indices = tft.compute_and_apply_vocabulary(
      review_sparse, default_value=_VOCAB_SIZE, top_k=_VOCAB_SIZE)
  dense = tf.sparse.to_dense(review_indices, default_value=-1)
  # TFX transform expects the transform result to be FixedLenFeature.
  padding_config = [[0, 0], [0, _MAX_LEN]]
  dense = tf.pad(dense, padding_config, 'CONSTANT', -1)
  padded = tf.slice(dense, [0, 0], [-1, _MAX_LEN])
  padded += 1
  return padded
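
# Downstream of _tokenize_review, the +1 shift reserves 0 for padding, maps
# in-vocabulary words to 1.._VOCAB_SIZE, and OOV words to _VOCAB_SIZE + 1. A
# Keras embedding layer consistent with that convention might look like this
# (a sketch; the output width of 64 is an assumption):
embedding_layer = tf.keras.layers.Embedding(
    input_dim=_VOCAB_SIZE + 2,  # padding id + vocab ids + one OOV id
    output_dim=64,              # assumed embedding width
    mask_zero=True)             # masks the padding id 0, per the comment above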
def preprocessing_fn(inputs):
    """Preprocesses Covertype Dataset.
    
    Scales numerical features and generates vocabularies
    and mappings for categorical features.
    
    Args:
        inputs: A map from feature keys to raw not-yet-transformed features
        
    Returns:
        A map from transformed feature keys to transformation operations
    """

    outputs = {}

    # Scale numerical features
    for key in NUMERIC_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(inputs[key]))

    # Generate vocabularies and map categorical features to integer ids
    for key in CATEGORICAL_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
            x=_fill_in_missing(inputs[key]),
            num_oov_buckets=1,
            vocab_filename=key)

    # Convert Cover_Type from 1-7 to 0-6
    outputs[_transformed_name(LABEL_KEY)] = _fill_in_missing(
        inputs[LABEL_KEY]) - 1

    return outputs
def compute_vocab_fn(inputs):
    """Preprocessing fn for sparse features.

    This function computes unique IDs for the sparse features. We rely on
    implicit behavior which writes the vocab files to the vocab_filename
    specified in tft.compute_and_apply_vocabulary.

    Pre-condition: sparse features have been converted to integers and taken
    modulo MAX_IND_RANGE.

    Args:
      inputs: Input features to transform.

    Returns:
      Output dict with transformed features.
    """
    outputs = {}

    outputs[LABEL_KEY] = inputs[LABEL_KEY]
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = inputs[key]
    for idx, key in enumerate(CATEGORICAL_FEATURE_KEYS):
        outputs[key] = tft.compute_and_apply_vocabulary(
            x=inputs[key], vocab_filename="feature_{}_vocab".format(idx))

    return outputs
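
# The pre-condition in the docstring above (sparse features already converted
# to integers and taken modulo MAX_IND_RANGE) could be produced by a step like
# this sketch; MAX_IND_RANGE is assumed to be defined alongside the feature
# key lists:
def _mod_sparse_feature(x):
    # Fold large integer ids into [0, MAX_IND_RANGE) without densifying.
    return tf.sparse.SparseTensor(
        indices=x.indices,
        values=x.values % MAX_IND_RANGE,
        dense_shape=x.dense_shape)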
def _preprocess_tft(raw_data, user_freq, item_freq):
    """Creates vocabularies for users and items and maps their ids to ints.

    Args:
      raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
      user_freq: minimum frequency of a user to include it in the user vocab.
      item_freq: minimum frequency of an item to include it in the item vocab.

    Returns:
      A dict containing int ids corresponding to a user_id and item_id and
        other features: {$user_key: $user_id, $item_key: $item_id, ...}.
    """
    features = {
        feature: raw_data[feature]
        for feature in constants.BQ_FEATURES
    }
    item_vocab = tft.vocabulary(raw_data[constants.ITEM_KEY],
                                vocab_filename=constants.ITEM_VOCAB_NAME,
                                frequency_threshold=item_freq)
    tft_features = {
        constants.TFT_USER_KEY:
        tft.compute_and_apply_vocabulary(
            raw_data[constants.USER_KEY],
            vocab_filename=constants.USER_VOCAB_NAME,
            frequency_threshold=user_freq,
            default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_ITEM_KEY:
        tft.apply_vocabulary(raw_data[constants.ITEM_KEY],
                             item_vocab,
                             default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_ARTIST_KEY:
        tft.compute_and_apply_vocabulary(
            raw_data[constants.ARTIST_KEY],
            vocab_filename=constants.ARTIST_VOCAB_NAME,
            default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_TAGS_KEY:
        tft.compute_and_apply_vocabulary(
            raw_data[constants.TAGS_KEY],
            vocab_filename=constants.TAG_VOCAB_NAME,
            default_value=constants.TFT_DEFAULT_ID),
        constants.TFT_TOP_10_KEY:
        tft.apply_vocabulary(raw_data[constants.TOP_10_KEY],
                             item_vocab,
                             default_value=constants.TFT_DEFAULT_ID),
    }
    features.update(tft_features)
    return features
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # This is a SparseTensor because it is optional. Here we fill in a default
            # value when it is missing.
            sparse = tf.sparse.SparseTensor(inputs[key].indices,
                                            inputs[key].values,
                                            [inputs[key].dense_shape[0], 1])
            dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
            # Reshaping from a batch of vectors of size 1 to a batch of scalars.
            dense = tf.squeeze(dense, axis=1)
            outputs[key] = tft.scale_to_0_1(dense)

        # For all categorical columns except the label column, we generate a
        # vocabulary, and convert the string feature to a one-hot encoding.
        for key in CATEGORICAL_FEATURE_KEYS:
            integerized = tft.compute_and_apply_vocabulary(
                tf.strings.strip(inputs[key]),
                num_oov_buckets=NUM_OOV_BUCKETS,
                vocab_filename=key)
            depth = (tft.experimental.get_vocabulary_size_by_name(key) +
                     NUM_OOV_BUCKETS)
            one_hot_encoded = tf.one_hot(integerized,
                                         depth=tf.cast(depth, tf.int32),
                                         on_value=1.0,
                                         off_value=0.0)
            # This output is now one-hot encoded. If the transformed data is
            # saved to disk, this can incur a significant storage cost.
            outputs[key] = tf.reshape(one_hot_encoded, [-1, depth])

        # For the label column we provide the mapping from string to index.
        table_keys = ['>50K', '<=50K']
        with tf.init_scope():
            initializer = tf.lookup.KeyValueTensorInitializer(
                keys=table_keys,
                values=tf.cast(tf.range(len(table_keys)), tf.int64),
                key_dtype=tf.string,
                value_dtype=tf.int64)
            table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        # Remove trailing periods for test data when the data is read with tf.data.
        label_str = tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', '')
        label_str = tf.strings.strip(label_str)
        data_labels = table.lookup(label_str)
        transformed_label = tf.one_hot(indices=data_labels,
                                       depth=len(table_keys),
                                       on_value=1.0,
                                       off_value=0.0)
        outputs[LABEL_KEY] = tf.reshape(transformed_label,
                                        [-1, len(table_keys)])

        return outputs
def preprocessing_fn(inputs):
    #end::entry_point[]
    #tag::logic[]
    outputs = {}
    # TFT business logic goes here
    outputs["body_stuff"] = tft.compute_and_apply_vocabulary(inputs["body"],
                                                             top_k=1000)
    return outputs
    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor,
                vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE,
                num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(to_dense(inputs[key]),
                                         FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(
                    tf.as_string(outputs[key]), vocab_filename='vocab_' + key)

        return outputs
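
# to_dense is not defined in this snippet. A minimal sketch, assuming it
# behaves like the _fill_in_missing helper above (densify a rank-2
# SparseTensor, filling missing entries with '' or 0):
def to_dense(x):
    default_value = '' if x.dtype == tf.string else 0
    dense = tf.sparse.to_dense(
        tf.sparse.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
        default_value)
    return tf.squeeze(dense, axis=1)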
def preprocess(inputs):  # inputs is a batch of input features
    median_age = inputs["housing_median_age"]
    ocean_proximity = inputs["ocean_proximity"]
    standardized_age = tft.scale_to_z_score(median_age - tft.mean(median_age))
    ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)
    return {
        "standardized_median_age": standardized_age,
        "ocean_proximity_id": ocean_proximity_id
    }
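
# A sketch of running preprocess through the Beam implementation of
# tf.Transform; raw_data (a list of feature dicts) and raw_data_metadata (a
# DatasetMetadata describing it) are assumed to exist:
import tempfile

import tensorflow_transform.beam as tft_beam

with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocess))
transformed_data, transformed_metadata = transformed_dataset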
def preprocessing_fn(inputs: Dict[str, Tensor],
                     custom_config: Dict[str, Any]) -> Dict[str, Tensor]:
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
      custom_config: Custom configuration dictionary for passing the task's
        ProblemStatement as a text proto, since custom_config must be
        JSON-serializable.

    Returns:
      Map from string feature key to transformed feature operations.
    """

    problem_statement = ps_pb2.ProblemStatement()
    text_format.Parse(
        text=custom_config[BasicPreprocessor.PROBLEM_STATEMENT_KEY],
        message=problem_statement)

    outputs = {}
    for key in [k for k, v in inputs.items() if v.dtype == tf.float32]:
        # TODO(weill): Handle the case when an int field actually represents
        # numeric rather than categorical values.
        task_type = problem_statement.tasks[0].type
        if task_type.HasField('one_dimensional_regression') and (
                key == task_type.one_dimensional_regression.label):
            outputs[key] = inputs[key]
            # Skip normalizing regression tasks.
            continue

        # Preserve this feature as a dense float, setting nan's to the mean.
        outputs[_sanitize_feature_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(inputs[key]))

    for key in [k for k, v in inputs.items() if v.dtype != tf.float32]:
        # Build a vocabulary for this feature.
        # TODO(weill): Risk here to blow up computation needlessly.
        output = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]), top_k=None, num_oov_buckets=1)

        # Don't sanitize the label key name.
        task_type = problem_statement.tasks[0].type
        if task_type.HasField('multi_class_classification') and (
                key == task_type.multi_class_classification.label):
            outputs[key] = output
            continue
        if task_type.HasField('binary_classification') and (
                key == task_type.binary_classification.label):
            outputs[key] = output
            continue

        # Do sanitize feature key names.
        outputs[_sanitize_feature_name(key)] = output

    return outputs
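
# _sanitize_feature_name is not shown; a plausible sketch that makes feature
# names safe for use in TF graph and SavedModel signature names (an
# assumption, not the project's code):
import re


def _sanitize_feature_name(name):
    # Replace characters outside [A-Za-z0-9_] with underscores.
    return re.sub(r'[^A-Za-z0-9_]', '_', name)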
def _preprocessing_fn(inputs, integer_label: bool = False):
    """TensorFlow Transform preprocessing function."""

    outputs = inputs.copy()

    if not integer_label:
        # Integerize string labels, if present.
        outputs[constants.LABEL_KEY] = tft.compute_and_apply_vocabulary(
            outputs[constants.LABEL_KEY])

    return outputs
def _preprocessing_fn(inputs: Dict[str, Any],
                      schema_map: Dict[str, collections.namedtuple]):
  """TensorFlow Transform preprocessing function."""

  outputs = {}
  for name, supported_type in schema_map.items():
    if supported_type.type_name == 'string_label':
      outputs[name] = tft.compute_and_apply_vocabulary(inputs[name])
    else:
      outputs[name] = inputs[name]
  return outputs
def preprocessing_fn(inputs):
    """ This is the preprocessing functions use by the tensorflow transform 
    Paramters:
        inputs -- the tensorflow parset input tensors in a dict, defined by the metadata input
    Returns:
        inputs -- dict wit the now appended output values, the added word representation is a sparse tensor
    """
    words = tf.string_split(inputs['text'],DELIMITERS_WORDS)
    word_representation = tft.compute_and_apply_vocabulary(words,default_value=0,top_k=10000)
    inputs["word_representation"] = word_representation
    return inputs
def preprocessing_fn(inputs):
    """Preprocesses Titanic Dataset."""

    outputs = {}

    # Scale numerical features
    for key in features.NUMERIC_FEATURE_KEYS:
        mean_value = compute_mean_ignore_nan(inputs[key].values)
        absl.logging.info(f'TFT preprocessing. Mean value for {key} = {mean_value}')
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing_with_impute(inputs[key], mean_value))

    for key in features.VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=features.VOCAB_SIZE_MAP.get(key, features.VOCAB_SIZE),
            num_oov_buckets=features.OOV_SIZE)

    for key in features.BUCKET_FEATURE_KEYS:
        if key in features.FEATURE_BUCKET_BOUNDARIES:
            bucket_boundaries = tf.constant(features.FEATURE_BUCKET_BOUNDARIES.get(key))
            # tf.print("bucket_boundaries:", bucket_boundaries, output_stream=absl.logging.info)
            outputs[features.transformed_name(key)] = tft.apply_buckets(_fill_in_missing(inputs[key]),
                                                                        bucket_boundaries)
        else:
            outputs[features.transformed_name(key)] = tft.bucketize(
                _fill_in_missing(inputs[key]),
                features.FEATURE_BUCKET_COUNT_MAP.get(key, features.FEATURE_BUCKET_COUNT))

    # Generate vocabularies and map categorical features to integer ids
    for key in features.CATEGORICAL_FEATURE_KEYS:
        outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
            x=_fill_in_missing(inputs[key]), num_oov_buckets=1, vocab_filename=key)

    # Convert Cover_Type to dense tensor
    outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing(
        inputs[features.LABEL_KEY])

    return outputs
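
# compute_mean_ignore_nan and _fill_in_missing_with_impute are project-specific
# helpers that are not shown. Plausible sketches follow (assumptions, not the
# project's code; note that tf.reduce_mean here is per batch, whereas a
# full-dataset mean would need a tft analyzer such as tft.mean):
def compute_mean_ignore_nan(values):
    # Mean over the float values of a SparseTensor, ignoring NaN entries.
    finite = tf.boolean_mask(values, ~tf.math.is_nan(values))
    return tf.reduce_mean(finite)


def _fill_in_missing_with_impute(x, impute_value):
    # Densify a rank-2 SparseTensor, filling missing entries with impute_value.
    dense = tf.sparse.to_dense(
        tf.sparse.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
        default_value=impute_value)
    return tf.squeeze(dense, axis=1)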
def preprocessing_fn(inputs):
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = x_centered * y_normalized
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    outputs["id"] = inputs["id"]
    tokens = tokenize_reviews(_fill_in_missing(inputs["text"], ''))
    outputs["text_xf"] = tft.compute_and_apply_vocabulary(
        tokens, top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)
    outputs["label_xf"] = _fill_in_missing(inputs["label"], -1)
    return outputs
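
# tokenize_reviews is not shown here. A sketch, assuming simple whitespace
# tokenization into the SparseTensor form expected by
# tft.compute_and_apply_vocabulary (mirroring _tokenize_review above):
def tokenize_reviews(reviews):
    return tf.strings.split(tf.reshape(reviews, [-1])).to_sparse()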
def preprocessing_fn(inputs):
    # Names of the text fields to process; empty here, so the loop below is a
    # no-op until fields are added.
    text_fields = []
    # Keep the original data and add more to it.
    result = inputs.copy()
    # Figure out the vocabulary for our text fields.
    for field_name in text_fields:
        field = inputs[field_name]
        tokens = tf.strings.split(field, " ")
        bag_of_words = tft.bag_of_words(tokens, ngram_range=(1, 3), separator=" ")
        indices = tft.compute_and_apply_vocabulary(bag_of_words, top_k=VOCAB_SIZE)
        # VOCAB_SIZE + 1 accounts for the OOV bucket created by
        # compute_and_apply_vocabulary (VOCAB_SIZE is assumed defined elsewhere).
        bow_indices, weights = tft.tfidf(indices, VOCAB_SIZE + 1)
        result[f"{field_name}_bow_indices"] = bow_indices
        result[f"{field_name}_weight"] = weights
    return result
      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.compute_and_apply_vocabulary(
            review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by compute_and_apply_vocabulary.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }
def _preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in features.DENSE_FLOAT_FEATURE_KEYS:
        # Preserve this feature as a dense float, setting nan's to the mean.
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(inputs[key]))

    for key in features.VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        outputs[features.transformed_name(
            key)] = tft.compute_and_apply_vocabulary(
                _fill_in_missing(inputs[key]),
                top_k=features.VOCAB_SIZE,
                num_oov_buckets=features.OOV_SIZE)

    for key, num_buckets in zip(features.BUCKET_FEATURE_KEYS,
                                features.BUCKET_FEATURE_BUCKET_COUNT):
        outputs[features.transformed_name(key)] = tft.bucketize(
            _fill_in_missing(inputs[key]),
            num_buckets,
            always_return_num_quantiles=False)

    for key in features.CATEGORICAL_FEATURE_KEYS:
        outputs[features.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    fare_key = 'fare'
    taxi_fare = _fill_in_missing(inputs[fare_key])
    tips = _fill_in_missing(inputs[features.LABEL_KEY])
    outputs[features.transformed_name(
        features.LABEL_KEY)] = tf.compat.v1.where(
            tf.math.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

    return outputs
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in my_metadata.NUMERIC_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[my_metadata.transformed_name(key)] = transform.scale_to_z_score(_fill_in_missing(inputs[key]))

        for key in my_metadata.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[my_metadata.transformed_name(key)] = transform.compute_and_apply_vocabulary(
                _fill_in_missing(inputs[key]),
                vocab_filename=my_metadata.transformed_name(key),
                num_oov_buckets=my_metadata.OOV_SIZE,
                top_k=my_metadata.VOCAB_SIZE
            )

        for key, hash_buckets in my_metadata.HASH_STRING_FEATURE_KEYS.items():
            outputs[my_metadata.transformed_name(key)] = transform.hash_strings(
                _fill_in_missing(inputs[key]),
                hash_buckets=hash_buckets
            )

        for key, nb_buckets in my_metadata.TO_BE_BUCKETIZED_FEATURE.items():
            outputs[my_metadata.transformed_name(key + '_bucketized')] = transform.bucketize(
                _fill_in_missing(inputs[key]), nb_buckets)


        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[my_metadata.FARE_KEY])
        tips = _fill_in_missing(inputs[my_metadata.LABEL_KEY])
        outputs[my_metadata.transformed_name(my_metadata.LABEL_KEY)] = tf.where(
            tf.math.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(
                tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                tf.int64))

        return outputs
def preprocessing_fn(inputs):
  """
  Perform feature reduction via `compute_and_apply_vocabulary`. An
  `indices` tensor should come in with values in (0,5e7) and should
  be transformed to (0,40000).
  """

  outputs = inputs

  outputs['indices'] = (
    tft.compute_and_apply_vocabulary(x=inputs['indices'],
                                     top_k=(MAX_IDX-5),
                                     num_oov_buckets=5,
                                     vocab_filename='my_vocab')
  )
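  # With top_k=(MAX_IDX - 5) vocabulary entries plus 5 OOV buckets, the
  # transformed ids fall in [0, MAX_IDX), matching the docstring.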
  
  return outputs
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(outputs[key])

        for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # This is a SparseTensor because it is optional. Here we fill in a default
            # value when it is missing.
            sparse = tf.sparse.SparseTensor(inputs[key].indices,
                                            inputs[key].values,
                                            [inputs[key].dense_shape[0], 1])
            dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
            # Reshaping from a batch of vectors of size 1 to a batch of scalars.
            dense = tf.squeeze(dense, axis=1)
            outputs[key] = tft.scale_to_0_1(dense)

        # For all categorical columns except the label column, we generate a
        # vocabulary but do not modify the feature.  This vocabulary is instead
        # used in the trainer, by means of a feature column, to convert the feature
        # from a string to an integer id.

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tft.compute_and_apply_vocabulary(inputs[key],
                                                            vocab_filename=key)

        # For the label column we provide the mapping from string to index.
        table_keys = ['>50K', '<=50K']
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=tf.cast(tf.range(len(table_keys)), tf.int64),
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        data_labels = table.lookup(inputs[LABEL_KEY])
        outputs[LABEL_KEY] = tf.one_hot(indices=data_labels,
                                        depth=len(table_keys),
                                        on_value=1.0,
                                        off_value=0.0)

        return outputs
        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

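            # tft.min and tft.mean below are scalar analyzers; adding
            # tf.zeros_like(...) broadcasts each scalar back to a per-example
            # tensor so it can be returned as a feature.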
            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }
def preprocessing_fn(inputs):
    """
    Preprocess input columns into transformed columns.
    Args:
        inputs (dict): dict of input columns
    Returns:
        output dict of transformed columns
    """
    outputs = {}
    # Encode categorical column:
    outputs['MixingSpeed'] = tft.compute_and_apply_vocabulary(
        inputs['MixingSpeed'])
    outputs['ButterMass'] = inputs['ButterMass']
    # Calculate Derived Features:
    outputs['TotalMass'] = (
        inputs['ButterMass'] + inputs['SugarMass'] + inputs['FlourMass'])
    for ingredient in ['Butter', 'Sugar', 'Flour']:
        ingredient_percentage = (
            inputs['{}Mass'.format(ingredient)] / outputs['TotalMass'])
        outputs['Norm{}perc'.format(ingredient)] = tft.scale_to_z_score(
            ingredient_percentage)
    # Keep absolute numeric columns
    for key in ['TotalVolume', 'Energy']:
        outputs[key] = inputs[key]
    # Normalize other numeric columns
    for key in [
            'ButterTemperature',
            'SugarHumidity',
            'FlourHumidity',
            'HeatingTime',
            'MixingTime',
            'Density',
            'Temperature',
            'Humidity',
    ]:
        outputs[key] = tft.scale_to_z_score(inputs[key])
    # Extract Specific Problems
    chunks_detected_str = tf.strings.regex_replace(input=inputs['Problems'],
                                                   pattern='.*chunk.*',
                                                   rewrite='chunk',
                                                   name='DetectChunk')
    outputs['Chunks'] = tf.cast(tf.equal(chunks_detected_str, 'chunk'),
                                tf.float32)
    return outputs
        def wide_preprocessing_fn(inputs):
            """TFT preprocessing function.

            Args:
              inputs: Map from feature keys to raw not-yet-transformed features.

            Returns:
              Map from string feature key to transformed feature operations.
            """
            outputs = {}
            # pylint: disable=protected-access
            for idx, key in enumerate(
                    itertools.islice(
                        itertools.cycle(taxi_utils._BUCKET_FEATURE_KEYS),
                        self._num_bucketize)):
                outputs["bucketized" + str(idx)] = tft.bucketize(
                    taxi_utils._fill_in_missing(inputs[key]),
                    taxi_utils._FEATURE_BUCKET_COUNT)

            for idx, key in enumerate(
                    itertools.islice(
                        itertools.cycle(taxi_utils._DENSE_FLOAT_FEATURE_KEYS),
                        self._num_scale)):
                # Preserve this feature as a dense float, setting nan's to the mean.
                outputs["scaled" + str(idx)] = tft.scale_to_z_score(
                    taxi_utils._fill_in_missing(inputs[key]))

            for idx, key in enumerate(
                    itertools.islice(
                        itertools.cycle(taxi_utils._VOCAB_FEATURE_KEYS),
                        self._num_vocabs)):
                outputs["vocab" + str(idx)] = tft.compute_and_apply_vocabulary(
                    taxi_utils._fill_in_missing(inputs[key]),
                    top_k=taxi_utils._VOCAB_SIZE,
                    num_oov_buckets=taxi_utils._OOV_SIZE)

            # Pass-through features.
            for key in taxi_utils._CATEGORICAL_FEATURE_KEYS + [
                    taxi_utils._LABEL_KEY
            ]:
                outputs[key] = inputs[key]

            return outputs
  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.math.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs
def preprocessing_fn(inputs):
  """Preprocesses Covertype Dataset."""

  outputs = {}

  # Scale numerical features
  for key in features.NUMERIC_FEATURE_KEYS:
    outputs[features.transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  # Generate vocabularies and map categorical features to integer ids
  for key in features.CATEGORICAL_FEATURE_KEYS:
    outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        x=_fill_in_missing(inputs[key]), num_oov_buckets=1, vocab_filename=key)

  # Convert Cover_Type to dense tensor
  outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing(
      inputs[features.LABEL_KEY])

  return outputs
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in _DENSE_FLOAT_FEATURE_KEYS:
        # Preserve this feature as a dense float, setting nan's to the mean.
        outputs[_transformed_name(key)] = tft.scale_to_z_score(inputs[key])

    for key in _VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
            inputs[key], top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE)

    outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

    return outputs
def preprocessing_fn(inputs, custom_config):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
      custom_config: additional properties for pre-processing.

    Returns:
      Map from string feature key to transformed features.
    """
    outputs = {}
    for key in _DENSE_FLOAT_FEATURE_KEYS:
        # Preserve this feature as a dense float, setting nan's to the mean.
        outputs[_transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(_identity(inputs[key])))

    for key in _VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=custom_config.get('VOCAB_SIZE', _VOCAB_SIZE),
            num_oov_buckets=custom_config.get('OOV_SIZE', _OOV_SIZE))

    for key in _BUCKET_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.bucketize(
            _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

    for key in _CATEGORICAL_FEATURE_KEYS:
        outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
    tips = _fill_in_missing(inputs[_LABEL_KEY])
    outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
        tf.math.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                tf.int64))

    return outputs
def preprocessing_fn(inputs: Dict[str, tf.Tensor]) -> Dict[str, tf.Tensor]:
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}

    for key in ONE_HOT_FEATURES.keys():
        dim = ONE_HOT_FEATURES[key]
        # fill_in_missing densifies the sparse feature before the vocab lookup.
        int_value = tft.compute_and_apply_vocabulary(
            fill_in_missing(inputs[key]), top_k=dim + 1)
        outputs[transformed_name(key)] = convert_num_to_one_hot(
            int_value, num_labels=dim + 1)

    for key, bucket_count in BUCKET_FEATURES.items():
        dense_feature = fill_in_missing(inputs[key])
        if key == 'zip_code' and dense_feature.dtype == tf.string:
            dense_feature = convert_zip_code(dense_feature)
        else:
            dense_feature = tf.cast(dense_feature, tf.float32)

        temp_feature = tft.bucketize(dense_feature, bucket_count,
                                     always_return_num_quantiles=False)
        outputs[transformed_name(key)] = convert_num_to_one_hot(
            temp_feature, num_labels=bucket_count + 1)
    
    for key in TEXT_FEATURES.keys():
        # Text features are passed through once missing values are filled in.
        outputs[transformed_name(key)] = fill_in_missing(inputs[key])

    outputs[transformed_name(LABEL_KEY)] = fill_in_missing(inputs[LABEL_KEY])

    return outputs
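
# fill_in_missing, convert_num_to_one_hot, and convert_zip_code come from the
# surrounding project and are not shown. Sketches consistent with their usage
# above (assumptions, not the project's code):
def convert_num_to_one_hot(label_tensor, num_labels=2):
    # One-hot encode integer ids into a [batch, num_labels] float matrix.
    one_hot_tensor = tf.one_hot(label_tensor, num_labels)
    return tf.reshape(one_hot_tensor, [-1, num_labels])


def convert_zip_code(zip_code):
    # Map partially redacted string zip codes (e.g. "123XX") to floats so
    # they can be bucketized. Empty strings become "00000" so to_number
    # does not fail.
    zip_code = tf.where(tf.equal(zip_code, ""), "00000", zip_code)
    zip_code = tf.strings.regex_replace(zip_code, "X", "0")
    zip_code = tf.strings.to_number(zip_code, out_type=tf.float32)
    return zip_code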