Exemple #1
0
  def testBucketizedFeatures(self):
    """Trains a LinearSDCA-backed LinearClassifier on bucketized columns.

    Both numeric inputs ('price' and 'sq_footage') are wrapped in bucketized
    columns. The classifier is trained for 100 steps on three equally weighted
    examples and its evaluation loss must end up below 0.2.
    """
    # Every raw numeric feature is discretized before it reaches the model.
    bucketized_columns = [
        feature_column_v2.bucketized_column(
            feature_column_v2.numeric_column('price'),
            boundaries=[500.0, 700.0]),
        feature_column_v2.bucketized_column(
            feature_column_v2.numeric_column('sq_footage'),
            boundaries=[650.0]),
    ]

    def make_inputs():
      """Returns the (features, labels) pair shared by train and evaluate."""
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([[600.0], [1000.0], [400.0]]),
          'sq_footage': constant_op.constant([[1000.0], [600.0], [700.0]]),
          'weights': constant_op.constant([[1.0], [1.0], [1.0]]),
      }
      labels = constant_op.constant([[1], [0], [1]])
      return features, labels

    sdca = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.01)
    classifier = linear.LinearClassifier(
        feature_columns=bucketized_columns,
        weight_column='weights',
        optimizer=sdca)

    classifier.train(input_fn=make_inputs, steps=100)
    metrics = classifier.evaluate(input_fn=make_inputs, steps=1)
    self.assertLess(metrics['loss'], 0.2)
Exemple #2
0
def embedding_varlen(batch_size, max_length):
  """Benchmark bucketizing dense float data: Keras layer vs. feature column.

  Despite its name, this function does not time an embedding. It compares the
  Keras `Discretization` layer against `bucketized_column`, both configured
  with the same bin boundaries.

  Args:
    batch_size: Batch size handed to both benchmark runners; the generated
      data is sized with `batch_size * NUM_REPEATS`.
    max_length: Width of the dense input. It is used as the Keras input shape
      and is also forwarded to `fc_bm.create_data`.

  Returns:
    A `(k_avg_time, fc_avg_time)` tuple holding the values returned by
    `fc_bm.run_keras` and `fc_bm.run_fc` respectively (presumably average
    times, judging by the variable names).
  """
  # Data and constants.
  max_value = 25.0
  bins = np.arange(1.0, max_value)
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, 100000, dtype=float)

  # Keras implementation
  model = keras.Sequential()
  model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.float32))
  model.add(discretization.Discretization(bins))

  # FC implementation
  fc = fcv2.bucketized_column(
      fcv2.numeric_column("data"), boundaries=list(bins))

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    # Return the transformed tensor: graph execution only runs ops that feed
    # a function's outputs or side effects, so dropping the result would let
    # the bucketization be pruned and leave nothing to time.
    return fc.transform_feature(
        fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {"data": data.to_tensor(default_value=0.0)}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_tensor(default_value=0.0)}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
Exemple #3
0
  def testPartitionedVariables(self):
    """Trains a LinearSDCA-backed LinearClassifier with sharded variables.

    The model mixes a numeric column, a bucketized column, a hashed
    categorical column and a crossed column, and is built with a fixed-size
    partitioner (two shards along axis 0). After 100 training steps the
    evaluation loss must be below 0.2.
    """

    def make_inputs():
      """Returns the (features, labels) pair shared by train and evaluate."""
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
      }
      labels = constant_op.constant([[1], [0], [1]])
      return features, labels

    price = feature_column_v2.numeric_column('price')
    sq_footage_bucket = feature_column_v2.bucketized_column(
        feature_column_v2.numeric_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_v2.crossed_column(
        [sq_footage_bucket, 'country'], hash_bucket_size=10)
    columns = [price, sq_footage_bucket, country, sq_footage_country]

    sdca = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.01)
    # Split the model variables into two shards along the first axis.
    two_way_partitioner = partitioned_variables.fixed_size_partitioner(
        num_shards=2, axis=0)

    classifier = linear.LinearClassifier(
        feature_columns=columns,
        weight_column='weights',
        partitioner=two_way_partitioner,
        optimizer=sdca)
    classifier.train(input_fn=make_inputs, steps=100)
    metrics = classifier.evaluate(input_fn=make_inputs, steps=1)
    self.assertLess(metrics['loss'], 0.2)
    def testFeatureColumns(self):
        """Round-trips a DenseFeatures-based Sequential model through save/load.

        A model built from bucketized, indicator and embedding columns is
        saved in the 'tf' format, reloaded, and the reloaded model must
        produce the same predictions as the original. The check only runs
        when executing eagerly.
        """
        # TODO(b/120099662): Error with table initialization with Keras models in
        # graph mode.
        if not context.executing_eagerly():
            return

        bucketized = fc.bucketized_column(
            fc.numeric_column('a'), boundaries=[5, 10, 15])
        cat_vocab = fc.categorical_column_with_vocabulary_list(
            'b', ['1', '2', '3'])
        # The same vocabulary column feeds both a one-hot and an embedding view.
        columns = [
            bucketized,
            fc.indicator_column(cat_vocab),
            fc.embedding_column(cat_vocab, dimension=8),
        ]
        model = keras.models.Sequential(DenseFeatures(columns))

        features = {'a': np.array([13, 15]), 'b': np.array(['1', '2'])}
        expected = model.predict(features)

        export_dir = self._save_model_dir()
        model.save(export_dir, save_format='tf')
        restored_model = keras_load.load(export_dir)
        self.assertAllClose(expected, restored_model.predict(features))
Exemple #5
0
  def test_deserialization_deduping(self):
    """Checks that deserialization rebuilds a shared source column only once.

    A numeric column and a bucketized column derived from it are serialized
    together. After deserialization the new columns must be distinct objects
    that compare equal to the originals, and the new bucketized column must
    reference the new numeric column itself as its source.
    """
    price = fc.numeric_column('price')
    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
    originals = [price, bucketized_price]

    restored = serialization.deserialize_feature_columns(
        serialization.serialize_feature_columns(originals))
    self.assertLen(restored, 2)
    new_price, new_bucketized_price = restored[0], restored[1]
    round_tripped = ((price, new_price),
                     (bucketized_price, new_bucketized_price))

    # Ensure these are not the original objects:
    for before, after in round_tripped:
      self.assertIsNot(before, after)
    # But they are equivalent:
    for before, after in round_tripped:
      self.assertEqual(before, after)

    # Check that deduping worked:
    self.assertIs(new_bucketized_price.source_column, new_price)
Exemple #6
0
def _get_categorical_column(params: dict) -> fc.CategoricalColumn:
    """Build the feature column described by ``params``.

    The column type is selected by the first of these keys found in
    ``params``, checked in this order:

    * ``'vocabulary'``: vocabulary-list column; the value is passed through
      ``_parse_vocabulary`` and ``default_value`` is 0.
    * ``'bucket_size'``: hash-bucket column with that ``hash_bucket_size``.
    * ``'file'``: vocabulary-file column with ``default_value`` 0.
    * ``'num_buckets'``: identity column with that ``num_buckets``.
    * ``'boundaries'``: bucketized column over a numeric column.

    Args:
        params: Column description. ``params['key']`` is used as the feature
            name for whichever column is built.

    Returns:
        The constructed feature column.

    Raises:
        ValueError: If ``params`` contains none of the selector keys above.
        KeyError: If a selector key is present but ``'key'`` is missing.
    """
    if 'vocabulary' in params:
        return fc.categorical_column_with_vocabulary_list(
            params['key'],
            vocabulary_list=_parse_vocabulary(params['vocabulary']),
            default_value=0)
    if 'bucket_size' in params:
        return fc.categorical_column_with_hash_bucket(
            params['key'], hash_bucket_size=params['bucket_size'])
    if 'file' in params:
        return fc.categorical_column_with_vocabulary_file(
            params['key'],
            vocabulary_file=params['file'],
            default_value=0)
    if 'num_buckets' in params:
        return fc.categorical_column_with_identity(
            params['key'], num_buckets=params['num_buckets'])
    if 'boundaries' in params:
        return fc.bucketized_column(
            fc.numeric_column(params['key']),
            boundaries=params['boundaries'])

    # ValueError is still caught by callers using ``except Exception``, but it
    # tells the reader that the argument itself is what is wrong.
    raise ValueError(
        "params error: expected one of 'vocabulary', 'bucket_size', 'file', "
        f"'num_buckets' or 'boundaries', got keys {list(params)!r}")
Exemple #7
0
  def testMixedFeaturesArbitraryWeights(self):
    """Trains a LinearSDCA-backed LinearRegressor on mixed, weighted features.

    The model combines a numeric column, a bucketized column, a hashed
    categorical column and a crossed column. The three examples carry
    different weights (3.0, 5.0 and 7.0). After 20 training steps the
    evaluation loss must be below 0.05.
    """

    def make_inputs():
      """Returns the (features, targets) pair shared by train and evaluate."""
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([0.6, 0.8, 0.3]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [5.0], [7.0]]),
      }
      targets = constant_op.constant([[1.55], [-1.25], [-3.0]])
      return features, targets

    price = feature_column_v2.numeric_column('price')
    sq_footage_bucket = feature_column_v2.bucketized_column(
        feature_column_v2.numeric_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_v2.crossed_column(
        [sq_footage_bucket, 'country'], hash_bucket_size=10)
    columns = [price, sq_footage_bucket, country, sq_footage_country]

    sdca = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.1)
    regressor = linear.LinearRegressor(
        feature_columns=columns,
        weight_column='weights',
        optimizer=sdca)

    regressor.train(input_fn=make_inputs, steps=20)
    metrics = regressor.evaluate(input_fn=make_inputs, steps=1)
    self.assertLess(metrics['loss'], 0.05)