def create_feature_columns():
  """Build the (wide, deep) feature-column lists for a wide-and-deep model.

  Wide side: indicator columns over the raw categoricals (age, gender) plus
  an indicator over their hashed cross. Deep side: embedding columns over
  order count, age, gender, and the cross.

  Returns:
    Tuple (wide_columns, deep_columns) of feature-column lists.
  """
  # Categorical bases: age buckets 1..6 and gender encoded as {-1, 1}.
  # list(range(...)) replaces the redundant [c for c in range(...)] copies.
  age = vocabulary_column('age_level', list(range(1, 7)))
  gender = vocabulary_column('gender', [-1, 1])
  # Hash the (age, gender) cross into 100 buckets.
  all_cat_cross = crossed_column([age, gender], hash_bucket_size=100)

  categorical_column = [indicator_column(age), indicator_column(gender)]
  crossed_columns = [indicator_column(all_cat_cross)]
  numerical_column = []  # no purely numeric columns in this model

  range_0_20 = list(range(0, 20))
  embedding_columns = [
      embedding_column(vocabulary_column("order_cnt", range_0_20), dimension=1),
      embedding_column(age, dimension=1),
      embedding_column(gender, dimension=1),
      embedding_column(all_cat_cross, dimension=10),
  ]

  wide_columns = categorical_column + crossed_columns
  deep_columns = numerical_column + embedding_columns
  return wide_columns, deep_columns
def testCrossedFeatures(self):
  """Tests LinearClassifier with LinearSDCA and crossed features."""

  def input_fn():
    # Three examples; the label is 1 only for the Spanish/MX row.
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'language': sparse_tensor.SparseTensor(
            values=['english', 'italian', 'spanish'],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 1]),
        'country': sparse_tensor.SparseTensor(
            values=['US', 'IT', 'MX'],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 1]),
    }
    labels = constant_op.constant([[0], [0], [1]])
    return features, labels

  # Cross language x country into 100 hash buckets.
  country_language = feature_column_v2.crossed_column(
      ['language', 'country'], hash_bucket_size=100)
  sdca = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.01)
  model = linear.LinearClassifier(
      feature_columns=[country_language], optimizer=sdca)
  model.train(input_fn=input_fn, steps=100)

  eval_metrics = model.evaluate(input_fn=input_fn, steps=1)
  self.assertLess(eval_metrics['loss'], 0.2)
def test_crossed_column(self):
  """Round-trips a DenseFeatures layer holding a crossed indicator column
  through get_config()/from_config() and checks the column survives."""
  col_a = fc.categorical_column_with_vocabulary_list(
      'a', vocabulary_list=['1', '2', '3'])
  col_b = fc.categorical_column_with_vocabulary_list(
      'b', vocabulary_list=['1', '2', '3'])
  crossed = fc.crossed_column([col_a, col_b], hash_bucket_size=2)

  layer = df.DenseFeatures([fc.indicator_column(crossed)])
  rebuilt = df.DenseFeatures.from_config(layer.get_config())

  # The deserialized layer must contain the single crossed indicator column.
  self.assertLen(rebuilt._feature_columns, 1)
  self.assertEqual(rebuilt._feature_columns[0].name, 'a_X_b_indicator')
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  num_buckets = 10000
  vocab = fc_bm.create_vocabulary(32768)
  data_a = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)
  data_b = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

  # Keras implementation: category-cross the two inputs, then hash.
  input_1 = keras.Input(shape=(None,), name="data_a", dtype=dt.string)
  input_2 = keras.Input(shape=(None,), name="data_b", dtype=dt.string)
  crossed_data = category_crossing.CategoryCrossing()([input_1, input_2])
  hashed_data = hashing.Hashing(num_buckets)(crossed_data)
  model = keras.Model([input_1, input_2], hashed_data)

  # Feature-column implementation of the same cross.
  crossed_fc = fcv2.crossed_column(["data_a", "data_b"], num_buckets)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    crossed_fc.transform_feature(
        fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs: densify the ragged data for each implementation.
  keras_data = {
      "data_a": data_a.to_tensor(
          default_value="", shape=(batch_size, max_length)),
      "data_b": data_b.to_tensor(
          default_value="", shape=(batch_size, max_length)),
  }
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {
      "data_a": data_a.to_tensor(
          default_value="", shape=(batch_size, max_length)),
      "data_b": data_b.to_tensor(
          default_value="", shape=(batch_size, max_length)),
  }
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
def testPartitionedVariables(self):
  """Tests LinearClassifier with LinearSDCA with partitioned variables."""

  def input_fn():
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.6], [0.8], [0.3]]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
    }
    return features, constant_op.constant([[1], [0], [1]])

  # Numeric, bucketized, hashed-categorical, and crossed columns.
  price = feature_column_v2.numeric_column('price')
  sq_footage_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('sq_footage'),
      boundaries=[650.0, 800.0])
  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country = feature_column_v2.crossed_column(
      [sq_footage_bucket, 'country'], hash_bucket_size=10)

  sdca = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.01)
  # Shard the model variables into two fixed-size partitions along axis 0.
  classifier = linear.LinearClassifier(
      feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
      weight_column='weights',
      partitioner=partitioned_variables.fixed_size_partitioner(
          num_shards=2, axis=0),
      optimizer=sdca)
  classifier.train(input_fn=input_fn, steps=100)

  eval_metrics = classifier.evaluate(input_fn=input_fn, steps=1)
  self.assertLess(eval_metrics['loss'], 0.2)
def testMixedFeaturesArbitraryWeights(self):
  """Tests LinearRegressor with LinearSDCA and a mix of features."""

  def input_fn():
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([0.6, 0.8, 0.3]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [5.0], [7.0]]),
    }
    return features, constant_op.constant([[1.55], [-1.25], [-3.0]])

  # Numeric, bucketized, hashed-categorical, and crossed columns.
  price = feature_column_v2.numeric_column('price')
  sq_footage_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('sq_footage'),
      boundaries=[650.0, 800.0])
  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country = feature_column_v2.crossed_column(
      [sq_footage_bucket, 'country'], hash_bucket_size=10)

  sdca = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.1)
  regressor = linear.LinearRegressor(
      feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
      weight_column='weights',
      optimizer=sdca)
  regressor.train(input_fn=input_fn, steps=20)

  eval_metrics = regressor.evaluate(input_fn=input_fn, steps=1)
  self.assertLess(eval_metrics['loss'], 0.05)