def create_feature_columns():
    """Build the wide and deep feature columns for a wide&deep-style model.

    Returns:
        A ``(wide_columns, deep_columns)`` tuple. Wide columns are one-hot
        indicator columns (including the age x gender cross); deep columns
        are low-dimensional embedding columns.
    """
    # Categorical vocabularies. `list(range(...))` replaces the redundant
    # `[c for c in range(...)]` comprehension of the original.
    age = vocabulary_column('age_level', list(range(1, 7)))
    gender = vocabulary_column('gender', [-1, 1])

    # Cross age x gender so the linear ("wide") part can memorize
    # interaction terms between the two features.
    all_cat_cross = crossed_column([age, gender], hash_bucket_size=100)

    categorical_column = [indicator_column(age), indicator_column(gender)]

    crossed_columns = [indicator_column(all_cat_cross)]

    # No raw numeric features yet; the empty list keeps the
    # wide/deep assembly below symmetric.
    numerical_column = []

    range_0_20 = list(range(0, 20))

    embedding_columns = [
        embedding_column(vocabulary_column("order_cnt", range_0_20),
                         dimension=1),
        embedding_column(age, dimension=1),
        embedding_column(gender, dimension=1),
        embedding_column(all_cat_cross, dimension=10),
    ]

    wide_columns = categorical_column + crossed_columns
    deep_columns = numerical_column + embedding_columns
    return wide_columns, deep_columns
Example #2
0
  def testCrossedFeatures(self):
    """Tests LinearClassifier with LinearSDCA and crossed features."""

    def input_fn():
      # Three single-token examples; only the third row is labeled positive.
      labels = constant_op.constant([[0], [0], [1]])
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'language': sparse_tensor.SparseTensor(
              values=['english', 'italian', 'spanish'],
              indices=[[0, 0], [1, 0], [2, 0]],
              dense_shape=[3, 1]),
          'country': sparse_tensor.SparseTensor(
              values=['US', 'IT', 'MX'],
              indices=[[0, 0], [1, 0], [2, 0]],
              dense_shape=[3, 1]),
      }
      return features, labels

    # The model sees only the hashed (language, country) cross.
    cross = feature_column_v2.crossed_column(
        ['language', 'country'], hash_bucket_size=100)
    sdca = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.01)
    classifier = linear.LinearClassifier(
        feature_columns=[cross], optimizer=sdca)
    classifier.train(input_fn=input_fn, steps=100)
    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertLess(metrics['loss'], 0.2)
Example #3
0
    def test_crossed_column(self):
        """A DenseFeatures layer holding a crossed indicator column must
        survive a get_config / from_config round trip intact."""
        col_a = fc.categorical_column_with_vocabulary_list(
            'a', vocabulary_list=['1', '2', '3'])
        col_b = fc.categorical_column_with_vocabulary_list(
            'b', vocabulary_list=['1', '2', '3'])
        crossed = fc.crossed_column([col_a, col_b], hash_bucket_size=2)

        original = df.DenseFeatures([fc.indicator_column(crossed)])
        restored = df.DenseFeatures.from_config(original.get_config())

        # The restored layer keeps exactly one column with the derived name.
        self.assertLen(restored._feature_columns, 1)
        self.assertEqual(restored._feature_columns[0].name, 'a_X_b_indicator')
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length categorical cross: Keras layers vs FC.

    Builds the same cross-then-hash transform twice — once with Keras
    preprocessing layers, once with a `crossed_column` — and times both on
    identical dense string inputs.

    Args:
        batch_size: examples per benchmark step.
        max_length: padded sequence length of each string feature.

    Returns:
        A ``(keras_avg_time, fc_avg_time)`` tuple of average step times.
    """
    # Data and constants.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    data_a = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)
    data_b = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

    # Keras implementation: cross the two inputs, then hash into buckets.
    input_1 = keras.Input(shape=(None,), name="data_a", dtype=dt.string)
    input_2 = keras.Input(shape=(None,), name="data_b", dtype=dt.string)
    crossed_data = category_crossing.CategoryCrossing()([input_1, input_2])
    hashed_data = hashing.Hashing(num_buckets)(crossed_data)
    model = keras.Model([input_1, input_2], hashed_data)

    # FC implementation.
    fc = fcv2.crossed_column(["data_a", "data_b"], num_buckets)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    def dense_inputs():
        # Both benchmark paths consume identical padded dense tensors; the
        # original built this exact dict twice (keras_data / fc_data).
        return {
            "data_a": data_a.to_tensor(default_value="",
                                       shape=(batch_size, max_length)),
            "data_b": data_b.to_tensor(default_value="",
                                       shape=(batch_size, max_length)),
        }

    # Benchmark runs.
    k_avg_time = fc_bm.run_keras(dense_inputs(), model, batch_size,
                                 NUM_REPEATS)
    fc_avg_time = fc_bm.run_fc(dense_inputs(), fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
Example #5
0
  def testPartitionedVariables(self):
    """Tests LinearClassifier with LinearSDCA with partitioned variables."""

    def input_fn():
      labels = constant_op.constant([[1], [0], [1]])
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
      }
      return features, labels

    # Real-valued, bucketized, hashed, and crossed feature columns.
    price = feature_column_v2.numeric_column('price')
    sq_footage_bucket = feature_column_v2.bucketized_column(
        feature_column_v2.numeric_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_v2.crossed_column(
        [sq_footage_bucket, 'country'], hash_bucket_size=10)

    sdca = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.01)

    # Shard the model weights into two fixed-size partitions along axis 0.
    sharder = partitioned_variables.fixed_size_partitioner(
        num_shards=2, axis=0)
    classifier = linear.LinearClassifier(
        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
        weight_column='weights',
        partitioner=sharder,
        optimizer=sdca)
    classifier.train(input_fn=input_fn, steps=100)
    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertLess(metrics['loss'], 0.2)
Example #6
0
  def testMixedFeaturesArbitraryWeights(self):
    """Tests LinearRegressor with LinearSDCA and a mix of features."""

    def input_fn():
      # Regression targets with per-example weights of 3, 5 and 7.
      targets = constant_op.constant([[1.55], [-1.25], [-3.0]])
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([0.6, 0.8, 0.3]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [5.0], [7.0]]),
      }
      return features, targets

    # Numeric, bucketized, hashed, and crossed feature columns.
    price = feature_column_v2.numeric_column('price')
    sq_footage_bucket = feature_column_v2.bucketized_column(
        feature_column_v2.numeric_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_v2.crossed_column(
        [sq_footage_bucket, 'country'], hash_bucket_size=10)

    sdca = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.1)
    regressor = linear.LinearRegressor(
        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
        weight_column='weights',
        optimizer=sdca)
    regressor.train(input_fn=input_fn, steps=20)
    metrics = regressor.evaluate(input_fn=input_fn, steps=1)
    self.assertLess(metrics['loss'], 0.05)