Example #1
def _build_keras_model(tf_transform_output, hidden_units, learning_rate):
    """Creates a DNN Keras model for classifying taxi data.
  Args:
    hidden_units: [int], the layer sizes of the DNN (input layer first).
  Returns:
    A keras Model.
  """

    numeric_columns = [
        tf.feature_column.numeric_column(key=features.transformed_name(key),
                                         shape=())
        for key in features.NUMERIC_FEATURE_KEYS
    ]

    categorical_columns = [
        tf.feature_column.categorical_column_with_identity(
            key=features.transformed_name(key),
            num_buckets=tf_transform_output.num_buckets_for_transformed_feature(
                features.transformed_name(key)),
            default_value=0)
        for key in features.CATEGORICAL_FEATURE_KEYS
    ]

    indicator_columns = [
        tf.feature_column.indicator_column(categorical_column)
        for categorical_column in categorical_columns
    ]

    model = _wide_and_deep_classifier(
        # TODO(b/139668410) replace with premade wide_and_deep keras model
        wide_columns=indicator_columns,
        deep_columns=numeric_columns,
        dnn_hidden_units=hidden_units,
        learning_rate=learning_rate)
    return model
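Note: `_wide_and_deep_classifier` is a user-defined helper that is not shown in this example. A minimal sketch of what it plausibly looks like, reconstructed from the inline wide-and-deep construction in Examples #3 and #12 (the layer wiring and the binary sigmoid head are assumptions):

def _wide_and_deep_classifier(wide_columns, deep_columns, dnn_hidden_units,
                              learning_rate):
    """Sketch of a wide-and-deep Keras classifier (assumed helper)."""
    # One Input per feature column; deep features are floats, wide are ints.
    input_layers = {
        column.key: tf.keras.layers.Input(name=column.key, shape=(),
                                          dtype=tf.float32)
        for column in deep_columns
    }
    input_layers.update({
        column.categorical_column.key: tf.keras.layers.Input(
            name=column.categorical_column.key, shape=(), dtype=tf.int32)
        for column in wide_columns
    })

    # Deep tower: stacked dense layers over the numeric features.
    deep = tf.keras.layers.DenseFeatures(deep_columns)(input_layers)
    for num_nodes in dnn_hidden_units:
        deep = tf.keras.layers.Dense(num_nodes)(deep)

    # Wide tower: linear path over the one-hot indicator columns.
    wide = tf.keras.layers.DenseFeatures(wide_columns)(input_layers)

    output = tf.keras.layers.Dense(1, activation='sigmoid')(
        tf.keras.layers.concatenate([deep, wide]))

    model = tf.keras.Model(input_layers, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=[tf.keras.metrics.BinaryAccuracy()])
    return model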
Example #2
def preprocessing_fn(inputs):
  """Preprocesses input columns into transformed columns.
  Preprocesses Covertype Dataset features using the TensorFlow Transform
  library.

  Args:
    inputs(dict): A `dict` of `string` to `Tensor` or `SparseTensor`, where
      each key is a feature key in the Example proto and each value is a
      Tensor containing that Feature's value.

  Returns:
    outputs(dict): A `dict` of `string` to `Tensor` or `SparseTensor`, where
      the keys are a new set of feature keys and the values are possibly
      transformed `Tensor`s or `SparseTensor`s.
  """

  outputs = {}

  # Scale numerical features
  for key in features.NUMERIC_FEATURE_KEYS:
    # Scale numeric features to z-scores (see the completed Example #8).
    outputs[features.transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  # Generate vocabularies and maps categorical features
  for key in features.CATEGORICAL_FEATURE_KEYS:
    # Integerize categorical features and write a vocabulary file
    # (see the completed Example #8).
    outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        x=_fill_in_missing(inputs[key]), num_oov_buckets=1, vocab_filename=key)

  # Convert Cover_Type to dense tensor
  outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing(
      inputs[features.LABEL_KEY])

  return outputs
Example #3
def _build_keras_model(hparams: kerastuner.HyperParameters, 
                       tf_transform_output: tft.TFTransformOutput) -> tf.keras.Model:
  """Creates a Keras WideDeep Classifier model.
  Args:
    hparams: Holds HyperParameters for tuning.
    tf_transform_output: A TFTransformOutput.
  Returns:
    A keras Model.
  """
  # Defines deep feature columns and input layers.
  deep_columns = [
      tf.feature_column.numeric_column(
          key=features.transformed_name(key), 
          shape=())
      for key in features.NUMERIC_FEATURE_KEYS
  ]
  
  input_layers = {
      column.key: tf.keras.layers.Input(name=column.key, shape=(), dtype=tf.float32)
      for column in deep_columns
  }    

  # Defines wide feature columns and input layers.
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key=features.transformed_name(key), 
          num_buckets=tf_transform_output.num_buckets_for_transformed_feature(features.transformed_name(key)), 
          default_value=0)
      for key in features.CATEGORICAL_FEATURE_KEYS
  ]

  wide_columns = [
      tf.feature_column.indicator_column(categorical_column)
      for categorical_column in categorical_columns
  ]
    
  input_layers.update({
      column.categorical_column.key: tf.keras.layers.Input(name=column.categorical_column.key, shape=(), dtype=tf.int32)
      for column in wide_columns
  })

  # Build Keras model using hparams.
  deep = tf.keras.layers.DenseFeatures(deep_columns)(input_layers)
  for n in range(int(hparams.get('n_layers'))):
    deep = tf.keras.layers.Dense(units=hparams.get('n_units_' + str(n + 1)))(deep)

  wide = tf.keras.layers.DenseFeatures(wide_columns)(input_layers)

  output = tf.keras.layers.Dense(features.NUM_CLASSES, activation='softmax')(
               tf.keras.layers.concatenate([deep, wide]))

  model = tf.keras.Model(input_layers, output)
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=tf.keras.optimizers.Adam(learning_rate=hparams.get('learning_rate')),
      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
  model.summary(print_fn=absl.logging.info)

  return model    
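The `hparams.get()` calls above assume a search space that defines `n_layers`, `n_units_<k>`, and `learning_rate`. A minimal sketch of such a space (the concrete ranges and defaults are assumptions):

def _get_hyperparameters() -> kerastuner.HyperParameters:
    """Sketch of a hyperparameter space matching the lookups above."""
    hp = kerastuner.HyperParameters()
    hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4], default=1e-3)
    hp.Int('n_layers', 1, 2, default=1)
    # Each n_units_k is only active for trials where that layer exists.
    with hp.conditional_scope('n_layers', [1]):
        hp.Int('n_units_1', min_value=8, max_value=128, step=8, default=8)
    with hp.conditional_scope('n_layers', [2]):
        hp.Int('n_units_1', min_value=8, max_value=128, step=8, default=8)
        hp.Int('n_units_2', min_value=8, max_value=128, step=8, default=8)
    return hp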
Example #4
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, the number of consecutive elements of the returned
      dataset to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
    dataset = data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            batch_size=batch_size,
            label_key=features.transformed_name(features.LABEL_KEY)),
        tf_transform_output.transformed_metadata.schema)

    return dataset
Example #5
def _input_fn(filenames, tf_transform_output, batch_size=200):
    """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
    transformed_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy())

    dataset = tf.data.experimental.make_batched_features_dataset(
        filenames,
        batch_size,
        transformed_feature_spec,
        reader=_gzip_reader_fn)

    transformed_features = tf.compat.v1.data.make_one_shot_iterator(
        dataset).get_next()
    # We pop the label because we do not want to use it as a feature while we're
    # training.
    return transformed_features, transformed_features.pop(
        features.transformed_name(features.LABEL_KEY))
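`_gzip_reader_fn` is referenced here (and in Example #10) but not shown. A one-line sketch, assuming the transformed examples are stored as gzipped TFRecord files, which is the standard TFX layout:

def _gzip_reader_fn(filenames):
    """Small utility returning a record reader that can read gzip'ed files."""
    return tf.data.TFRecordDataset(filenames, compression_type='GZIP')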
Example #6
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
    outputs = {}
    for key in features.DENSE_FLOAT_FEATURE_KEYS:
        # Preserve this feature as a dense float, setting nan's to the mean.
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(inputs[key]))

    for key in features.VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        outputs[features.transformed_name(
            key)] = tft.compute_and_apply_vocabulary(
                _fill_in_missing(inputs[key]),
                top_k=features.VOCAB_SIZE,
                num_oov_buckets=features.OOV_SIZE)

    for key, num_buckets in zip(features.BUCKET_FEATURE_KEYS,
                                features.BUCKET_FEATURE_BUCKET_COUNT):
        outputs[features.transformed_name(key)] = tft.bucketize(
            _fill_in_missing(inputs[key]),
            num_buckets,
            always_return_num_quantiles=False)

    for key in features.CATEGORICAL_FEATURE_KEYS:
        outputs[features.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    fare_key = 'fare'
    taxi_fare = _fill_in_missing(inputs[fare_key])
    tips = _fill_in_missing(inputs[features.LABEL_KEY])
    outputs[features.transformed_name(
        features.LABEL_KEY)] = tf.compat.v1.where(
            tf.math.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

    return outputs
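`_fill_in_missing` appears in most of these examples but is never defined. A sketch of the usual TFX helper of this name, which densifies a `SparseTensor` of shape `[batch_size, 1]`, filling missing entries with `''` or `0`:

def _fill_in_missing(x):
    """Replaces missing values in a SparseTensor and converts it to dense.

    Fills missing values of `x` with '' or 0, and converts a SparseTensor of
    shape [batch_size, 1] into a dense Tensor of shape [batch_size].
    """
    if not isinstance(x, tf.sparse.SparseTensor):
        return x
    default_value = '' if x.dtype == tf.string else 0
    return tf.squeeze(
        tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value),
        axis=1)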
Example #7
    def serve_tf_examples_fn(serialized_tf_examples):
        """Returns the output to be used in the serving signature."""
        feature_spec = tf_transform_output.raw_feature_spec()
        feature_spec.pop(features.LABEL_KEY)
        parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)

        transformed_features = model.tft_layer(parsed_features)
        transformed_features.pop(features.transformed_name(features.LABEL_KEY))

        return model(transformed_features)
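This inner function is normally produced by a closure that captures `model` and `tf_transform_output`, and is exported as the model's serving signature. A sketch of that wrapping, following the common TFX pattern in which `model.tft_layer` is obtained from `tf_transform_output.transform_features_layer()`:

def _get_serve_tf_examples_fn(model, tf_transform_output):
    """Sketch: builds a serving function that applies TFT, then the model."""
    # Keep a reference so the transform graph is tracked and saved with the model.
    model.tft_layer = tf_transform_output.transform_features_layer()

    @tf.function(input_signature=[
        tf.TensorSpec(shape=[None], dtype=tf.string, name='examples')
    ])
    def serve_tf_examples_fn(serialized_tf_examples):
        """Returns the output to be used in the serving signature."""
        feature_spec = tf_transform_output.raw_feature_spec()
        feature_spec.pop(features.LABEL_KEY)
        parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
        transformed_features = model.tft_layer(parsed_features)
        transformed_features.pop(features.transformed_name(features.LABEL_KEY))
        return model(transformed_features)

    return serve_tf_examples_fn

# Typical export (the serving_model_dir path is a placeholder):
# model.save(serving_model_dir, save_format='tf',
#            signatures={'serving_default':
#                        _get_serve_tf_examples_fn(model, tf_transform_output)})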
Example #8
def preprocessing_fn(inputs):
    """Preprocesses Covertype Dataset."""

    outputs = {}

    # Scale numerical features
    for key in features.NUMERIC_FEATURE_KEYS:
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(inputs[key]))

    # Generate vocabularies and maps categorical features
    for key in features.CATEGORICAL_FEATURE_KEYS:
        outputs[features.transformed_name(
            key)] = tft.compute_and_apply_vocabulary(x=_fill_in_missing(
                inputs[key]),
                                                     num_oov_buckets=1,
                                                     vocab_filename=key)

    # Convert Cover_Type to dense tensor
    outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing(
        inputs[features.LABEL_KEY])

    return outputs
Example #9
def _eval_input_receiver_fn(tf_transform_output, schema):
    """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
    # Notice that the inputs are raw features, not transformed features here.
    raw_feature_spec = _get_raw_feature_spec(schema)

    serialized_tf_example = tf.compat.v1.placeholder(
        dtype=tf.string, shape=[None], name='input_example_tensor')

    # Add a parse_example operator to the tensorflow graph, which will parse
    # raw, untransformed, tf examples.
    raw_features = tf.io.parse_example(serialized=serialized_tf_example,
                                       features=raw_feature_spec)

    # Now that we have our raw examples, process them through the tf-transform
    # function computed during the preprocessing step.
    transformed_features = tf_transform_output.transform_raw_features(
        raw_features)

    # The key name MUST be 'examples'.
    receiver_tensors = {'examples': serialized_tf_example}

    # NOTE: Model is driven by transformed features (since training works on
    # the materialized output of TFT), but slicing will happen on raw features.
    raw_features.update(transformed_features)

    return tfma.export.EvalInputReceiver(
        features=raw_features,
        receiver_tensors=receiver_tensors,
        labels=transformed_features[features.transformed_name(
            features.LABEL_KEY)])
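`_get_raw_feature_spec` is not shown here; in the standard taxi example it is a thin wrapper around `schema_utils` (a sketch):

from tensorflow_transform.tf_metadata import schema_utils

def _get_raw_feature_spec(schema):
    """Derives a raw feature spec from the data schema."""
    return schema_utils.schema_as_feature_spec(schema).feature_spec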
Example #10
def _input_fn(file_pattern, tf_transform_output, batch_size=200):
    """Generates features and label for tuning/training.
  Args:
    file_pattern: input tfrecord file pattern.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, the number of consecutive elements of the returned
      dataset to combine in a single batch.
  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
    transformed_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy())

    dataset = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=transformed_feature_spec,
        reader=_gzip_reader_fn,
        label_key=features.transformed_name(features.LABEL_KEY))

    return dataset
Example #11
def preprocessing_fn(inputs):
    """Preprocesses Titanic Dataset."""

    outputs = {}

    # Scale numerical features
    for key in features.NUMERIC_FEATURE_KEYS:
        mean_value = compute_mean_ignore_nan(inputs[key].values)
        absl.logging.info(f'TFT preprocessing. Mean value for {key} = {mean_value}')
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing_with_impute(inputs[key], mean_value))

    for key in features.VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=features.VOCAB_SIZE_MAP.get(key, features.VOCAB_SIZE),
            num_oov_buckets=features.OOV_SIZE)

    for key in features.BUCKET_FEATURE_KEYS:
        if key in features.FEATURE_BUCKET_BOUNDARIES:
            bucket_boundaries = tf.constant(features.FEATURE_BUCKET_BOUNDARIES.get(key))
            # tf.print("bucket_boundaries:", bucket_boundaries, output_stream=absl.logging.info)
            outputs[features.transformed_name(key)] = tft.apply_buckets(_fill_in_missing(inputs[key]),
                                                                        bucket_boundaries)
        else:
            outputs[features.transformed_name(key)] = tft.bucketize(
                _fill_in_missing(inputs[key]),
                features.FEATURE_BUCKET_COUNT_MAP.get(key, features.FEATURE_BUCKET_COUNT))

    # Generate vocabularies and maps categorical features
    for key in features.CATEGORICAL_FEATURE_KEYS:
        outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
            x=_fill_in_missing(inputs[key]), num_oov_buckets=1, vocab_filename=key)

    # Convert the label to a dense tensor.
    outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing(
        inputs[features.LABEL_KEY])

    return outputs
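`compute_mean_ignore_nan` and `_fill_in_missing_with_impute` are custom helpers that this example does not show. A rough sketch of plausible implementations inferred from the call sites (both bodies, and the use of `tft.mean` over finite values, are assumptions):

def compute_mean_ignore_nan(values):
    """Sketch: full-pass mean over the finite (non-NaN) values."""
    finite_values = tf.boolean_mask(values, tf.math.is_finite(values))
    return tft.mean(finite_values)

def _fill_in_missing_with_impute(x, impute_value):
    """Sketch: like _fill_in_missing, but fills gaps with an imputed value."""
    if not isinstance(x, tf.sparse.SparseTensor):
        return x
    return tf.squeeze(
        tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value=tf.cast(impute_value, x.dtype)),
        axis=1)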
Example #12
def _build_keras_model(
        hparams: kerastuner.HyperParameters,
        tf_transform_output: tft.TFTransformOutput) -> tf.keras.Model:
    """Creates a Keras WideDeep Classifier model.
    Args:
      hparams: Holds HyperParameters for tuning.
      tf_transform_output: A TFTransformOutput.
    Returns:
      A keras Model.
    """

    real_keys = features.NUMERIC_FEATURE_KEYS
    sparse_keys = features.VOCAB_FEATURE_KEYS + features.BUCKET_FEATURE_KEYS + features.CATEGORICAL_FEATURE_KEYS

    # Defines deep feature columns and input layers.
    deep_columns = [
        tf.feature_column.numeric_column(key=features.transformed_name(key),
                                         shape=())
        for key in features.NUMERIC_FEATURE_KEYS
    ]

    input_layers = {
        column.key: tf.keras.layers.Input(name=column.key,
                                          shape=(),
                                          dtype=tf.float32)
        for column in deep_columns
    }

    # Defines wide feature columns and input layers.
    categorical_columns = [
        tf.feature_column.categorical_column_with_identity(
            key=features.transformed_name(key),
            num_buckets=tf_transform_output.num_buckets_for_transformed_feature(
                features.transformed_name(key)),
            default_value=0)
        for key in features.CATEGORICAL_FEATURE_KEYS
    ]

    categorical_columns += [
        tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
            key,
            num_buckets=features.VOCAB_SIZE + features.OOV_SIZE,
            default_value=0)
        for key in features.transformed_names(features.VOCAB_FEATURE_KEYS)
    ]

    categorical_columns += [
        tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
            key,
            num_buckets=num_buckets,
            default_value=0) for key, num_buckets in zip(
                features.transformed_names(features.BUCKET_FEATURE_KEYS),
                features.BUCKET_FEATURE_BUCKET_COUNT)
    ]

    wide_columns = [
        tf.feature_column.indicator_column(categorical_column)
        for categorical_column in categorical_columns
    ]

    input_layers.update({
        column.categorical_column.key:
        tf.keras.layers.Input(name=column.categorical_column.key,
                              shape=(),
                              dtype=tf.int32)
        for column in wide_columns
    })

    # Build Keras model using hparams.
    deep = tf.keras.layers.DenseFeatures(deep_columns)(input_layers)
    for n in range(int(hparams.get('n_layers'))):
        deep = tf.keras.layers.Dense(
            units=hparams.get('n_units_' + str(n + 1)))(deep)

    wide = tf.keras.layers.DenseFeatures(wide_columns)(input_layers)

    # output = tf.keras.layers.Dense(features.NUM_CLASSES, activation='softmax')(
    #             tf.keras.layers.concatenate([deep, wide]))

    output = tf.keras.layers.Dense(1, activation='sigmoid')(
        tf.keras.layers.concatenate([deep, wide]))
    output = tf.squeeze(output, -1)

    model = tf.keras.Model(input_layers, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=hparams.get('learning_rate')),
        # metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
        metrics=[
            tf.keras.metrics.TruePositives(name='tp'),
            tf.keras.metrics.FalsePositives(name='fp'),
            tf.keras.metrics.TrueNegatives(name='tn'),
            tf.keras.metrics.FalseNegatives(name='fn'),
            tf.keras.metrics.BinaryAccuracy(name='binary_accuracy'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(name='auc'),
        ])
    model.summary(print_fn=absl.logging.info)

    return model
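For completeness, a sketch of how a builder like this is usually wired into a TFX Tuner component. The `tuner_fn` shape follows the standard TFX pattern, but import paths vary across TFX versions, and the trial budget and objective here are assumptions:

from tfx.components.trainer.fn_args_utils import FnArgs
from tfx.components.tuner.component import TunerFnResult

def tuner_fn(fn_args: FnArgs) -> TunerFnResult:
    """Sketch: builds a RandomSearch tuner around _build_keras_model."""
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_graph_path)

    tuner = kerastuner.RandomSearch(
        lambda hp: _build_keras_model(hp, tf_transform_output),
        max_trials=6,  # assumed trial budget
        hyperparameters=_get_hyperparameters(),  # e.g. the sketch after Example #3
        objective=kerastuner.Objective('binary_accuracy', 'max'),
        directory=fn_args.working_dir,
        project_name='wide_deep_tuning')

    train_dataset = _input_fn(fn_args.train_files, tf_transform_output)
    eval_dataset = _input_fn(fn_args.eval_files, tf_transform_output)

    return TunerFnResult(
        tuner=tuner,
        fit_kwargs={
            'x': train_dataset,
            'validation_data': eval_dataset,
            'steps_per_epoch': fn_args.train_steps,
            'validation_steps': fn_args.eval_steps,
        })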