def def_preprocessing_fn(inputs):
    """Build the tf.transform preprocessing graph for the taxi features.

    Each feature group listed on the `taxi` module is first passed through
    `_fill_in_missing` and then transformed according to its group. The
    label is derived by comparing the tip against 20% of the fare.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature name to transformed feature operation.
    """
    transformed = {}

    # Dense floats: fill in missing values, then standardise to a z-score.
    for feature in taxi.DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[taxi.transformed_name(feature)] = (
            transform.scale_to_z_score(filled))

    # Vocabulary features: build a vocabulary and map values to indices.
    for feature in taxi.VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[taxi.transformed_name(feature)] = (
            transform.compute_and_apply_vocabulary(
                filled,
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE))

    # Bucket features: bucketize into taxi.FEATURE_BUCKET_COUNT buckets.
    for feature in taxi.BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[taxi.transformed_name(feature)] = transform.bucketize(
            filled, taxi.FEATURE_BUCKET_COUNT)

    # Categorical features: only missing-value filling is applied.
    for feature in taxi.CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[taxi.transformed_name(feature)] = filled

    # Label: was this passenger a big tipper?
    fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tip = _fill_in_missing(inputs[taxi.LABEL_KEY])
    fare_is_nan = tf.is_nan(fare)
    # A NaN fare always yields label 0.
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # Otherwise the label is 1 when the tip was > 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tipper)

    return transformed
Exemple #2
0
def preprocessing_fn(inputs):
    """Define the tf.transform preprocessing applied to the raw features.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature name to transformed feature operation.
    """
    transformed = {}

    # Dense floats: pass through _fill_in_missing, then scale to a z-score.
    for feature in _DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = tft.scale_to_z_score(filled)

    # Vocabulary features: build a vocabulary and map values to indices.
    for feature in _VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = (
            tft.compute_and_apply_vocabulary(
                filled, top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE))

    # Bucket features: bucketize into _FEATURE_BUCKET_COUNT buckets.
    for feature in _BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = tft.bucketize(
            filled, _FEATURE_BUCKET_COUNT)

    # Categorical features: only missing-value filling is applied.
    for feature in _CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = filled

    # Label: was this passenger a big tipper?
    fare = _fill_in_missing(inputs[_FARE_KEY])
    tip = _fill_in_missing(inputs[_LABEL_KEY])
    fare_is_nan = tf.math.is_nan(fare)
    # A NaN fare always yields label 0.
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # Otherwise the label is 1 when the tip was > 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
        fare_is_nan, zero_label, big_tipper)

    return transformed
Exemple #3
0
def preprocessing_fn(inputs):
  """Define the tf.transform preprocessing applied to the raw features.

  Transformed features are stored under the same key as their raw input.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  transformed = {}

  # Dense floats are standardised to a z-score.
  for feature in ts.DENSE_FLOAT_FEATURE_KEYS:
    raw = inputs[feature]
    transformed[feature] = transform.scale_to_z_score(raw)

  # Vocabulary features are mapped to integer ids.
  for feature in ts.VOCAB_FEATURE_KEYS:
    raw = inputs[feature]
    transformed[feature] = transform.string_to_int(
        raw, top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

  # Bucket features are bucketized into FEATURE_BUCKET_COUNT buckets.
  for feature in ts.BUCKET_FEATURE_KEYS:
    raw = inputs[feature]
    transformed[feature] = transform.bucketize(raw, FEATURE_BUCKET_COUNT)

  # Categorical features are passed through unchanged.
  for feature in ts.CATEGORICAL_FEATURE_KEYS:
    transformed[feature] = inputs[feature]

  def convert_label(label):
    """Return 1 where the tip exceeds 5% of the fare, and 0 otherwise."""
    fare = inputs[ts.FARE_KEY]
    fare_is_nan = tf.is_nan(fare)
    # A NaN fare always yields label 0.
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    threshold = tf.multiply(fare, tf.constant(0.05))
    over_threshold = tf.cast(tf.greater(label, threshold), tf.int64)
    return tf.where(fare_is_nan, zero_label, over_threshold)

  # Label: was this passenger a big tipper?
  raw_label = inputs[ts.LABEL_KEY]
  transformed[ts.LABEL_KEY] = transform.apply_function(convert_label,
                                                       raw_label)

  return transformed
  def preprocessing_fn(inputs):
    """Map raw input columns to their transformed columns.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from output column name to transformed feature operation.
    """
    transformed = {}

    # Numeric columns are scaled to the range [0, 1].
    for column in NUMERIC_FEATURE_KEYS:
      raw = inputs[column]
      transformed[column] = tft.scale_to_0_1(raw)

    # Bucketized copies are emitted under '<key>_bucketized'; the second
    # argument to tft.bucketize is the value stored for that key.
    for column in TO_BE_BUCKETIZED_FEATURE:
      raw = inputs[column]
      transformed[column+'_bucketized'] = tft.bucketize(
          raw, TO_BE_BUCKETIZED_FEATURE[column])

    # Categorical columns with a small vocabulary become integer ids; the
    # column key doubles as the vocabulary file name.
    for column in STRING_TO_INT_FEATURE_KEYS:
      raw = inputs[column]
      transformed[column] = tft.string_to_int(raw, vocab_filename=column)

    # Remaining string columns are hashed, configured by the value stored
    # for that key.
    for column in HASH_STRING_FEATURE_KEYS:
      raw = inputs[column]
      transformed[column] = tft.hash_strings(
          raw, HASH_STRING_FEATURE_KEYS[column])

    def convert_label(label):
      """Look up the index of each label string.

      Args:
        label: label tensor to look up in the ['<=50K', '>50K'] table.

      Returns:
        The result of the table lookup, i.e. the position of each label in
        ['<=50K', '>50K'].
      """
      label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
      return label_table.lookup(label)

    # The label column is converted to its index in the lookup table.
    raw_label = inputs[LABEL_KEY]
    transformed[LABEL_KEY] = tft.apply_function(convert_label, raw_label)
    return transformed
def preprocessing_fn(inputs):
    """Preprocess the Titanic dataset features with tf.transform.

    Args:
        inputs: map from feature keys to raw not-yet-transformed features.
            Numeric features are read through their `.values` attribute.

    Returns:
        Map from transformed feature name to transformed feature operation.
    """
    transformed = {}

    # Numeric features: impute with the NaN-ignoring mean, then z-score.
    for key in features.NUMERIC_FEATURE_KEYS:
        raw = inputs[key]
        mean_value = compute_mean_ignore_nan(raw.values)
        absl.logging.info(f'TFT preprocessing. Mean value for {key} = {mean_value}')
        imputed = _fill_in_missing_with_impute(raw, mean_value)
        transformed[features.transformed_name(key)] = (
            tft.scale_to_z_score(imputed))

    # Vocabulary features: the per-key vocabulary size falls back to
    # features.VOCAB_SIZE when the key has no entry in VOCAB_SIZE_MAP.
    for key in features.VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[key])
        vocab_size = features.VOCAB_SIZE_MAP.get(key, features.VOCAB_SIZE)
        transformed[features.transformed_name(key)] = (
            tft.compute_and_apply_vocabulary(
                filled,
                top_k=vocab_size,
                num_oov_buckets=features.OOV_SIZE))

    # Bucket features: use explicit boundaries when configured for the
    # key, otherwise bucketize with a (possibly per-key) bucket count.
    for key in features.BUCKET_FEATURE_KEYS:
        if key not in features.FEATURE_BUCKET_BOUNDARIES:
            filled = _fill_in_missing(inputs[key])
            bucket_count = features.FEATURE_BUCKET_COUNT_MAP.get(
                key, features.FEATURE_BUCKET_COUNT)
            transformed[features.transformed_name(key)] = tft.bucketize(
                filled, bucket_count)
        else:
            bucket_boundaries = tf.constant(
                features.FEATURE_BUCKET_BOUNDARIES.get(key))
            filled = _fill_in_missing(inputs[key])
            transformed[features.transformed_name(key)] = tft.apply_buckets(
                filled, bucket_boundaries)

    # Categorical features: build a vocabulary with a single OOV bucket;
    # the feature key doubles as the vocabulary file name.
    for key in features.CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[key])
        transformed[features.transformed_name(key)] = (
            tft.compute_and_apply_vocabulary(
                x=filled, num_oov_buckets=1, vocab_filename=key))

    # Label: only missing-value filling is applied.
    label = _fill_in_missing(inputs[features.LABEL_KEY])
    transformed[features.transformed_name(features.LABEL_KEY)] = label

    return transformed
Exemple #6
0
def preprocessing_fn(inputs: tf.Tensor) -> tf.Tensor:
    """Define the tf.transform preprocessing applied to the raw features.

    NOTE(review): the annotations say `tf.Tensor`, but the body indexes
    `inputs` by feature key and returns a dict; confirm and correct them.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature name to transformed feature operation.
    """
    transformed = {}

    # One-hot features: index with a vocabulary of dim + 1 entries, then
    # expand to a one-hot vector of the same width.
    for key, dim in ONE_HOT_FEATURES.items():
        filled = fill_in_missing(inputs[key])
        indexed = tft.compute_and_apply_vocabulary(filled, top_k=dim + 1)
        transformed[transformed_name(key)] = convert_num_to_one_hot(
            indexed, num_labels=dim + 1)

    # Bucket features: bucketize, then one-hot encode the bucket index.
    for key, bucket_count in BUCKET_FEATURES.items():
        filled = fill_in_missing(inputs[key])
        # String zip codes get their own conversion; every other bucket
        # feature is cast to float32.
        is_string_zip = key == "zip_code" and filled.dtype == tf.string
        if is_string_zip:
            numeric = convert_zip_code(filled)
        else:
            numeric = tf.cast(filled, tf.float32)

        bucket_index = tft.bucketize(
            numeric, bucket_count, always_return_num_quantiles=False)
        transformed[transformed_name(key)] = convert_num_to_one_hot(
            bucket_index, num_labels=bucket_count + 1)

    # Text features: only missing-value filling is applied.
    for key in TEXT_FEATURES:
        filled = fill_in_missing(inputs[key])
        transformed[transformed_name(key)] = filled

    # Label: only missing-value filling is applied.
    label = fill_in_missing(inputs[LABEL_KEY])
    transformed[transformed_name(LABEL_KEY)] = label

    return transformed
def preprocessing_fn(inputs):
    """Define the tf.transform preprocessing applied to the raw features.

    Args:
        inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
        Map from transformed feature name to transformed feature operation.
    """
    transformed = {}

    # Dense floats: pass through _fill_in_missing, then scale to a z-score.
    for feature in _DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = tft.scale_to_z_score(filled)

    # Vocabulary features: build a vocabulary and map values to indices.
    for feature in _VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = (
            tft.compute_and_apply_vocabulary(
                filled, top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE))

    # Bucket features: bucketize into _FEATURE_BUCKET_COUNT buckets.
    for feature in _BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = tft.bucketize(
            filled,
            num_buckets=_FEATURE_BUCKET_COUNT,
            always_return_num_quantiles=False)

    # Categorical features: only missing-value filling is applied.
    for feature in _CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[feature])
        transformed[_transformed_name(feature)] = filled

    # Label (the classification goal): was this passenger a big tipper?
    fare = _fill_in_missing(inputs[_FARE_KEY])
    tip = _fill_in_missing(inputs[_LABEL_KEY])
    fare_is_nan = tf.math.is_nan(fare)
    # A NaN fare always yields label 0.
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # Otherwise the label is 1 when the tip was > 20% of the fare.
    big_tipper = tf.cast(tf.greater(tip, fare * tf.constant(0.2)), tf.int64)
    transformed[_transformed_name(_LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tipper)

    return transformed
Exemple #8
0
    def wide_preprocessing_fn(inputs):
      """TFT preprocessing function producing a configurable number of outputs.

      The source key lists are cycled, so a key is reused when more outputs
      are requested than there are keys.

      Args:
        inputs: Map from feature keys to raw not-yet-transformed features.

      Returns:
        Map from string feature key to transformed feature operations.
      """

      def _numbered_keys(keys, count):
        """Yield (index, key) for the first `count` keys of cycled `keys`."""
        return enumerate(itertools.islice(itertools.cycle(keys), count))

      transformed = {}

      # Bucketized outputs: "bucketized0", "bucketized1", ...
      for idx, key in _numbered_keys(self._BUCKETIZE_KEYS,
                                     self._num_bucketize):
        filled = taxi_utils._fill_in_missing(inputs[key])
        transformed["bucketized" + str(idx)] = tft.bucketize(
            filled, taxi_utils._FEATURE_BUCKET_COUNT)

      # Z-scored outputs: "scaled0", "scaled1", ...
      for idx, key in _numbered_keys(self._SCALE_KEYS, self._num_scale):
        filled = taxi_utils._fill_in_missing(inputs[key])
        transformed["scaled" + str(idx)] = tft.scale_to_z_score(filled)

      # Vocabulary outputs: "vocab0", "vocab1", ...
      for idx, key in _numbered_keys(self._VOCABULARY_KEYS,
                                     self._num_vocabs):
        filled = taxi_utils._fill_in_missing(inputs[key])
        transformed["vocab" + str(idx)] = tft.compute_and_apply_vocabulary(
            filled,
            top_k=taxi_utils._VOCAB_SIZE,
            num_oov_buckets=taxi_utils._OOV_SIZE)

      # Categorical features and the label are passed through unchanged.
      for key in taxi_utils._CATEGORICAL_FEATURE_KEYS + [taxi_utils._LABEL_KEY]:
        transformed[key] = inputs[key]

      return transformed
Exemple #9
0
def preprocessing_fn(inputs):
  """Define the tf.transform preprocessing applied to the raw features.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from transformed feature name to transformed feature operation.
  """
  transformed = {}

  # Dense floats: pass through _fill_in_missing, then scale to a z-score.
  for feature in features.DENSE_FLOAT_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature])
    transformed[features.transformed_name(feature)] = (
        tft.scale_to_z_score(filled))

  # Vocabulary features: build a vocabulary and map values to indices.
  for feature in features.VOCAB_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature])
    transformed[features.transformed_name(feature)] = (
        tft.compute_and_apply_vocabulary(
            filled,
            top_k=features.VOCAB_SIZE,
            num_oov_buckets=features.OOV_SIZE))

  # Bucket features: each key is paired positionally with its bucket count.
  keys_and_counts = zip(features.BUCKET_FEATURE_KEYS,
                        features.BUCKET_FEATURE_BUCKET_COUNT)
  for feature, bucket_count in keys_and_counts:
    filled = _fill_in_missing(inputs[feature])
    transformed[features.transformed_name(feature)] = tft.bucketize(
        filled, bucket_count)

  # Categorical features: only missing-value filling is applied.
  for feature in features.CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature])
    transformed[features.transformed_name(feature)] = filled

  # TODO(b/157064428): Support label transformation for Keras.
  # The label is passed through untouched: applying a label transformation
  # would result in wrong evaluation.
  raw_label = inputs[features.LABEL_KEY]
  transformed[features.transformed_name(features.LABEL_KEY)] = raw_label

  return transformed
Exemple #10
0
def preprocess(inputs):
    """Define the tf.transform preprocessing applied to the raw features.

    Transformed features are stored under the same key as their raw input.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    transformed = {}

    # Dense floats are standardised to a z-score.
    for feature in DENSE_FLOAT_FEATURE_KEYS:
        raw = inputs[feature]
        transformed[feature] = transform.scale_to_z_score(raw)

    # Vocabulary features: non-string inputs are converted to strings
    # before being mapped to integer ids.
    for feature in VOCAB_FEATURE_KEYS:
        raw = inputs[feature]
        if raw.dtype != tf.string:
            raw = tf.as_string(raw)
        transformed[feature] = transform.string_to_int(
            raw,
            vocab_filename='vocab_' + feature,
            top_k=VOCAB_SIZE,
            num_oov_buckets=OOV_SIZE)

    # Bucket features are bucketized into FEATURE_BUCKET_COUNT buckets.
    for feature in BUCKET_FEATURE_KEYS:
        raw = inputs[feature]
        transformed[feature] = transform.bucketize(raw, FEATURE_BUCKET_COUNT)

    # Categorical features are cast to int64.
    for feature in CATEGORICAL_FEATURE_KEYS:
        raw = inputs[feature]
        transformed[feature] = tf.to_int64(raw)

    # Label: true when the fare is not NaN and the tip was > 20% of it.
    fare = inputs[FARE_KEY]
    tip = inputs[LABEL_KEY]
    threshold = tf.multiply(fare, tf.constant(0.2))
    fare_is_valid = tf.logical_not(tf.is_nan(fare))
    tip_over_threshold = tf.greater(tip, threshold)
    transformed[LABEL_KEY] = tf.logical_and(fare_is_valid, tip_over_threshold)

    return transformed
    def preprocessing_fn(inputs):
        """Map raw input columns to their transformed columns.

        Args:
            inputs: map from feature keys to raw not-yet-transformed
                features.

        Returns:
            Map from output column name to transformed feature operation.
        """
        transformed = {}

        # Numeric columns are scaled to the range [0, 1].
        for column in NUMERIC_FEATURE_KEYS:
            raw = inputs[column]
            transformed[column] = tft.scale_to_0_1(raw)

        # Bucketized copies are emitted under '<key>_bucketized'; the second
        # argument to tft.bucketize is the value stored for that key.
        for column in TO_BE_BUCKETIZED_FEATURE:
            raw = inputs[column]
            transformed[column + '_bucketized'] = tft.bucketize(
                raw, TO_BE_BUCKETIZED_FEATURE[column])

        # Categorical columns with a small vocabulary become integer ids;
        # the column key doubles as the vocabulary file name.
        for column in STRING_TO_INT_FEATURE_KEYS:
            raw = inputs[column]
            transformed[column] = tft.string_to_int(raw,
                                                    vocab_filename=column)

        # Remaining string columns are hashed, configured by the value
        # stored for that key.
        for column in HASH_STRING_FEATURE_KEYS:
            raw = inputs[column]
            transformed[column] = tft.hash_strings(
                raw, HASH_STRING_FEATURE_KEYS[column])

        def convert_label(label):
            """Look up the index of each label string.

            Args:
                label: label tensor to look up in the ['<=50K', '>50K']
                    table.

            Returns:
                The result of the table lookup, i.e. the position of each
                label in ['<=50K', '>50K'].
            """
            label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
            return label_table.lookup(label)

        # The label column is converted to its index in the lookup table.
        raw_label = inputs[LABEL_KEY]
        transformed[LABEL_KEY] = tft.apply_function(convert_label, raw_label)
        return transformed
Exemple #12
0
def preprocessing_fn(inputs):
    """Map raw input columns to their transformed columns.

    Args:
        inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
        Map from output column name to transformed feature operation.
    """
    transformed = {}

    # Both numeric key lists are scaled to the range [0, 1], in list order.
    for numeric_keys in (NUMERIC_FEATURE_KEYS, NUMERIC_FEATURE_KEYS_INT):
        for column in numeric_keys:
            raw = inputs[column]
            transformed[column] = tft.scale_to_0_1(raw)

    # Bucketized copies are emitted under '<key>_b'; the second argument to
    # tft.bucketize is the value stored for that key.
    for column in TO_BE_BUCKETIZED_FEATURE:
        raw = inputs[column]
        transformed[f'{column}_b'] = tft.bucketize(
            raw, TO_BE_BUCKETIZED_FEATURE[column])

    # String columns are hashed, configured by the value stored for that key.
    for column in HASH_STRING_FEATURE_KEYS:
        raw = inputs[column]
        transformed[column] = tft.hash_strings(
            raw, HASH_STRING_FEATURE_KEYS[column])

    # The label column is passed through unchanged.
    transformed[LABEL_KEY] = inputs[LABEL_KEY]

    return transformed
 def preprocessing_fn(inputs):
     """Bucketize input 'x' and return it under the key 'x_bucketized'.

     Calls tft.bucketize with num_buckets=3 and epsilon=0.00001.
     """
     x_bucketized = tft.bucketize(inputs['x'], num_buckets=3, epsilon=0.00001)
     return {'x_bucketized': x_bucketized}
Exemple #14
0
 def transform(self, name, values):
     """Transform the feature stored under `name` in `values`.

     The value is first passed through Feature.fill_in_missing. It is then
     bucketized with `self.buckets` when that is set, and scaled to a
     z-score otherwise.
     """
     filled = Feature.fill_in_missing(values[name])
     if self.buckets is not None:
         return tft.bucketize(filled, self.buckets)
     return tft.scale_to_z_score(filled)
Exemple #15
0
  def preprocessing_fn(inputs):
    """Build the tf.transform preprocessing graph, including ngram features.

    Background:
    https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature name to transformed feature operation.
    """
    transformed = {}

    # Dense floats: fill in missing values, then standardise to a z-score.
    for feature in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Debug output of the key and raw input being processed.
      print('processing key', feature)
      print('input:', inputs[feature])
      filled = _fill_in_missing(inputs[feature])
      transformed[taxi.transformed_name(feature)] = (
          transform.scale_to_z_score(filled))

    # Vocabulary features: build a vocabulary and map values to indices.
    for feature in taxi.VOCAB_FEATURE_KEYS:
      filled = _fill_in_missing(inputs[feature])
      transformed[taxi.transformed_name(feature)] = (
          transform.compute_and_apply_vocabulary(
              filled,
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE))

    # Ngram features: extract ngrams with the transform_ngrams helper and
    # build a vocabulary over them. (An earlier, disabled variant called
    # transform.ngrams on tf.string_split output with top_k=512.)
    for feature in taxi.FEATURE_NGRAM:
      filled = _fill_in_missing(inputs[feature])
      ngrams = transform_ngrams(filled, taxi.NGRAM_RANGE)
      transformed[taxi.transformed_name(feature)] = (
          transform.compute_and_apply_vocabulary(
              ngrams,
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE))

    # Bucket features: bucketize into taxi.FEATURE_BUCKET_COUNT buckets.
    for feature in taxi.BUCKET_FEATURE_KEYS:
      filled = _fill_in_missing(inputs[feature])
      transformed[taxi.transformed_name(feature)] = transform.bucketize(
          filled, taxi.FEATURE_BUCKET_COUNT)

    # Categorical features: only missing-value filling is applied.
    for feature in taxi.CATEGORICAL_FEATURE_KEYS:
      filled = _fill_in_missing(inputs[feature])
      transformed[taxi.transformed_name(feature)] = filled

    # Label: was this passenger a big tipper?
    fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tip = _fill_in_missing(inputs[taxi.LABEL_KEY])
    fare_is_nan = tf.is_nan(fare)
    # A NaN fare always yields label 0.
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # Otherwise the label is 1 when the tip was > 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tipper)

    return transformed