def movielen_get_fc():
  sparse_features = ["movieId", "userId", "imdbId", "tmdbId", "genres"]
  # Cardinalities: movieId 59047, userId 162541, imdbId 59047,
  # tmdbId 61342, genres 21.
  ln_embedding = [59047, 162541, 59047, 61342, 21]
  param.max_bucket = find_bucket_size(ln_embedding)
  print("TensorFlow, max_bucket_size = ", param.max_bucket)
  if param.max_bucket is not None:
    ln_embedding = [
        each if each < param.max_bucket else param.max_bucket
        for each in ln_embedding
    ]
  dnn_feature_columns = []
  linear_feature_columns = []
  for i, feat in enumerate(sparse_features):
    dnn_feature_columns.append(
        fc.embedding_column(
            fc.categorical_column_with_hash_bucket(
                feat, ln_embedding[i], dtype=tf.string), 4))
    linear_feature_columns.append(
        fc.categorical_column_with_hash_bucket(
            feat, ln_embedding[i], dtype=tf.string))
  return dnn_feature_columns, linear_feature_columns
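# NOTE: movielen_get_fc above (and criteo_get_fc / avazu_get_fc below) call a
# find_bucket_size helper that is not defined in this section. A minimal
# sketch of what it could look like, assuming it reads a user-configured cap
# from the shared `param` namespace and reports it only when some column
# cardinality would actually be truncated; the exact policy is an assumption,
# not the original implementation.
def find_bucket_size(ln_embedding):
  # Hypothetical: param.max_bucket holds the configured cap, or None.
  cap = getattr(param, "max_bucket", None)
  if cap is None:
    return None
  # Only report a cap when it would truncate at least one column.
  return cap if any(each > cap for each in ln_embedding) else None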
def test_should_be_dense_column(self):
  with self.assertRaisesRegexp(ValueError, 'must be a .*DenseColumn'):
    df.DenseFeatures(feature_columns=[
        fc.categorical_column_with_hash_bucket('wire_cast', 4)
    ])(features={'a': [[0]]})
def test_from_config(self, trainable, name):
  cols = [
      fc.numeric_column('a'),
      fc.embedding_column(
          fc.categorical_column_with_vocabulary_list(
              'b', vocabulary_list=['1', '2', '3']),
          dimension=2),
      fc.indicator_column(
          fc.categorical_column_with_hash_bucket(key='c', hash_bucket_size=3))
  ]
  orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name)
  config = orig_layer.get_config()

  new_layer = df.DenseFeatures.from_config(config)

  self.assertEqual(new_layer.name, orig_layer.name)
  self.assertEqual(new_layer.trainable, trainable)
  self.assertLen(new_layer._feature_columns, 3)
  self.assertEqual(new_layer._feature_columns[0].name, 'a')
  self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0)
  self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b')
  self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__)
  self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__)
  self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__)
def test_from_config(self, units, sparse_combiner, trainable, name):
  cols = [
      fc.numeric_column('a'),
      fc.categorical_column_with_vocabulary_list(
          'b', vocabulary_list=('1', '2', '3')),
      fc.categorical_column_with_hash_bucket(key='c', hash_bucket_size=3)
  ]
  orig_layer = fc._LinearModelLayer(
      cols, units=units, sparse_combiner=sparse_combiner,
      trainable=trainable, name=name)
  config = orig_layer.get_config()

  new_layer = fc._LinearModelLayer.from_config(config)

  self.assertEqual(new_layer.name, orig_layer.name)
  self.assertEqual(new_layer._units, units)
  self.assertEqual(new_layer._sparse_combiner, sparse_combiner)
  self.assertEqual(new_layer.trainable, trainable)
  self.assertLen(new_layer._feature_columns, 3)
  self.assertEqual(new_layer._feature_columns[0].name, 'a')
  self.assertEqual(new_layer._feature_columns[1].vocabulary_list,
                   ('1', '2', '3'))
  self.assertEqual(new_layer._feature_columns[2].num_buckets, 3)
def testWeightedSparseFeatures(self):
  """LinearClassifier with LinearSDCA and weighted sparse features."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': sparse_tensor.SparseTensor(
            values=[2., 3., 1.],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 5]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 5])
    }, constant_op.constant([[1], [0], [1]])

  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  country_weighted_by_price = feature_column_v2.weighted_categorical_column(
      country, 'price')
  optimizer = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.01)
  classifier = linear.LinearClassifier(
      feature_columns=[country_weighted_by_price], optimizer=optimizer)
  classifier.train(input_fn=input_fn, steps=100)
  loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
  self.assertLess(loss, 0.2)
def criteo_get_fc():
  # Cardinalities of the 26 Criteo categorical features C0..C25.
  ln_embedding = [
      1461, 584, 10131227, 2202608, 306, 24, 12518, 634, 4, 93146, 5684,
      8351593, 3195, 28, 14993, 5461306, 11, 5653, 2173, 4, 7046547, 18,
      16, 286181, 105, 142572
  ]
  param.max_bucket = find_bucket_size(ln_embedding)
  print("TensorFlow, max_bucket_size = ", param.max_bucket)
  if param.max_bucket is not None:
    ln_embedding = [
        each if each < param.max_bucket else param.max_bucket
        for each in ln_embedding
    ]

  # The 13 integer features I0..I12 are shared by both towers.
  dense = []
  for i in range(13):
    dense.append(
        fc.numeric_column("I{}".format(i), dtype=tf.int64, default_value=0))
  dnn_feature_columns = []
  linear_feature_columns = []
  dnn_feature_columns += dense
  linear_feature_columns += dense

  sparse_emb = []
  for i in range(26):
    ids = fc.categorical_column_with_hash_bucket(
        "C{}".format(i), ln_embedding[i], dtype=tf.int64)
    # The linear tower consumes the raw hashed ids; the DNN tower consumes
    # their embeddings.
    linear_feature_columns.append(ids)
    sparse_emb.append(fc.embedding_column(ids, param.embedding_size))
  dnn_feature_columns += sparse_emb
  return dnn_feature_columns, linear_feature_columns
def avazu_get_fc():
  # 32747463 samples in total.
  # Cardinalities of the 22 Avazu categorical features, in the order of
  # sparse_features below.
  ln_embedding = [
      40428967, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486,
      8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60
  ]
  param.max_bucket = find_bucket_size(ln_embedding)
  print("TensorFlow, max_bucket_size = ", param.max_bucket)
  if param.max_bucket is not None:
    ln_embedding = [
        each if each < param.max_bucket else param.max_bucket
        for each in ln_embedding
    ]

  dnn_feature_columns = []
  linear_feature_columns = []
  sparse_features = [
      'id', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
      'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
      'device_model', 'device_type', 'device_conn_type',
  ] + ['C' + str(i) for i in range(14, 22)]

  sparse_emb = []
  for i in range(len(sparse_features)):
    ids = fc.categorical_column_with_hash_bucket(
        sparse_features[i], ln_embedding[i], dtype=tf.string)
    linear_feature_columns.append(ids)
    sparse_emb.append(fc.embedding_column(ids, param.embedding_size))
  dnn_feature_columns += sparse_emb
  return dnn_feature_columns, linear_feature_columns
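# Usage sketch: the (dnn_feature_columns, linear_feature_columns) pairs built
# above are shaped for TensorFlow's wide-and-deep estimator. A minimal,
# hypothetical driver; the hidden-unit sizes are arbitrary assumptions:
import tensorflow as tf

def build_wide_deep_estimator(get_fc):
  dnn_cols, linear_cols = get_fc()
  return tf.estimator.DNNLinearCombinedClassifier(
      linear_feature_columns=linear_cols,
      dnn_feature_columns=dnn_cols,
      dnn_hidden_units=[256, 128, 64])

# e.g. estimator = build_wide_deep_estimator(criteo_get_fc)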
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such
  as RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_hash_bucket(
          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
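# The hashing behind categorical_column_with_hash_bucket (and hence the
# sequence variant above) is TensorFlow's fast string hash, so bucket ids can
# be reproduced directly. A small self-contained sketch:
import tensorflow as tf

tokens = tf.constant(["the", "quick", "brown", "fox"])
# Deterministically maps each token to an id in [0, 1000); distinct tokens
# may collide, which is the usual trade-off of hash-bucket columns.
bucket_ids = tf.strings.to_hash_bucket_fast(tokens, num_buckets=1000)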
def testPartitionedVariables(self):
  """Tests LinearClassifier with LinearSDCA with partitioned variables."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.6], [0.8], [0.3]]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [1.0], [1.0]])
    }, constant_op.constant([[1], [0], [1]])

  price = feature_column_v2.numeric_column('price')
  sq_footage_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('sq_footage'),
      boundaries=[650.0, 800.0])
  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country = feature_column_v2.crossed_column(
      [sq_footage_bucket, 'country'], hash_bucket_size=10)

  optimizer = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.01)
  classifier = linear.LinearClassifier(
      feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
      weight_column='weights',
      partitioner=partitioned_variables.fixed_size_partitioner(
          num_shards=2, axis=0),
      optimizer=optimizer)
  classifier.train(input_fn=input_fn, steps=100)
  loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
  self.assertLess(loss, 0.2)
def _get_categorical_column(params: dict) -> fc.CategoricalColumn:
  """Builds a categorical column from a parameter dict.

  Exactly one of the keys 'vocabulary', 'bucket_size', 'file', 'num_buckets'
  or 'boundaries' selects the construction path.
  """
  if 'vocabulary' in params:
    feature = fc.categorical_column_with_vocabulary_list(
        params['key'],
        vocabulary_list=_parse_vocabulary(params['vocabulary']),
        default_value=0)
  elif 'bucket_size' in params:
    feature = fc.categorical_column_with_hash_bucket(
        params['key'], hash_bucket_size=params['bucket_size'])
  elif 'file' in params:
    feature = fc.categorical_column_with_vocabulary_file(
        params['key'], vocabulary_file=params['file'], default_value=0)
  elif 'num_buckets' in params:
    feature = fc.categorical_column_with_identity(
        params['key'], num_buckets=params['num_buckets'])
  elif 'boundaries' in params:
    feature = fc.bucketized_column(
        fc.numeric_column(params['key']), boundaries=params['boundaries'])
  else:
    raise ValueError(
        "params must contain one of 'vocabulary', 'bucket_size', 'file', "
        "'num_buckets' or 'boundaries'; got keys: {}".format(list(params)))
  return feature
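# Usage sketch for _get_categorical_column: each params dict selects exactly
# one construction path via the key it carries. Illustrative inputs only; the
# feature names and values below are invented for the example:
country = _get_categorical_column({'key': 'country', 'bucket_size': 100})
user_id = _get_categorical_column({'key': 'user_id', 'num_buckets': 10000})
age = _get_categorical_column({'key': 'age', 'boundaries': [18, 35, 65]})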
def test_multiple_layers_with_same_embedding_column(self):
  some_sparse_column = fc.categorical_column_with_hash_bucket(
      'sparse_feature', hash_bucket_size=5)
  some_embedding_column = fc.embedding_column(
      some_sparse_column, dimension=10)

  with ops.Graph().as_default():
    features = {
        'sparse_feature': [['a'], ['x']],
    }
    all_cols = [some_embedding_column]
    df.DenseFeatures(all_cols)(features)
    df.DenseFeatures(all_cols)(features)
    # Make sure that 2 variables get created in this case.
    self.assertEqual(
        2, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
    expected_var_names = [
        'dense_features/sparse_feature_embedding/embedding_weights:0',
        'dense_features_1/sparse_feature_embedding/embedding_weights:0'
    ]
    self.assertItemsEqual(
        expected_var_names,
        [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
def testMixedFeaturesArbitraryWeights(self):
  """Tests LinearRegressor with LinearSDCA and a mix of features."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([0.6, 0.8, 0.3]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [5.0], [7.0]])
    }, constant_op.constant([[1.55], [-1.25], [-3.0]])

  price = feature_column_v2.numeric_column('price')
  sq_footage_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('sq_footage'),
      boundaries=[650.0, 800.0])
  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country = feature_column_v2.crossed_column(
      [sq_footage_bucket, 'country'], hash_bucket_size=10)
  optimizer = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.1)
  regressor = linear.LinearRegressor(
      feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
      weight_column='weights',
      optimizer=optimizer)
  regressor.train(input_fn=input_fn, steps=20)
  loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
  self.assertLess(loss, 0.05)
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  num_buckets = 10000
  vocab = fc_bm.create_vocabulary(32768)
  data = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

  # Keras implementation
  model = keras.Sequential()
  model.add(
      keras.Input(
          shape=(max_length,), name="data", ragged=True, dtype=dt.string))
  model.add(hashing.Hashing(num_buckets))

  # FC implementation
  fc = fcv2.categorical_column_with_hash_bucket("data", num_buckets)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {"data": data}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)
  fc_data = {"data": data.to_sparse()}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
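# Usage sketch: a hypothetical driver for the benchmark above; NUM_REPEATS
# and the fc_bm helpers come from the surrounding harness, which is not shown
# in this section.
if __name__ == "__main__":
  keras_time, fc_time = embedding_varlen(batch_size=128, max_length=20)
  print("keras: {:.4f}s  fc: {:.4f}s".format(keras_time, fc_time))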
def testSparseFeaturesWithL1Reg(self):
  """Tests LinearRegressor with LinearSDCA and sparse features."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.4], [0.6], [0.3]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[10.0], [10.0], [10.0]])
    }, constant_op.constant([[1.4], [-0.8], [2.6]])

  price = feature_column_v2.numeric_column('price')
  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)

  # Regressor with no L1 regularization.
  optimizer = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.1)
  regressor = linear.LinearRegressor(
      feature_columns=[price, country],
      weight_column='weights',
      optimizer=optimizer)
  regressor.train(input_fn=input_fn, steps=20)
  no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
  variable_names = regressor.get_variable_names()
  self.assertIn('linear/linear_model/price/weights', variable_names)
  self.assertIn('linear/linear_model/country/weights', variable_names)
  no_l1_reg_weights = {
      'linear/linear_model/price/weights': regressor.get_variable_value(
          'linear/linear_model/price/weights'),
      'linear/linear_model/country/weights': regressor.get_variable_value(
          'linear/linear_model/country/weights'),
  }

  # Regressor with L1 regularization.
  optimizer = linear.LinearSDCA(
      example_id_column='example_id',
      symmetric_l1_regularization=1.0,
      symmetric_l2_regularization=0.1)
  regressor = linear.LinearRegressor(
      feature_columns=[price, country],
      weight_column='weights',
      optimizer=optimizer)
  regressor.train(input_fn=input_fn, steps=20)
  l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
  l1_reg_weights = {
      'linear/linear_model/price/weights': regressor.get_variable_value(
          'linear/linear_model/price/weights'),
      'linear/linear_model/country/weights': regressor.get_variable_value(
          'linear/linear_model/country/weights'),
  }

  # Unregularized loss is lower when there is no L1 regularization.
  self.assertLess(no_l1_reg_loss, l1_reg_loss)
  self.assertLess(no_l1_reg_loss, 0.05)

  # But weights returned by the regressor with L1 regularization have
  # smaller L1 norm.
  l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0
  for var_name in sorted(l1_reg_weights):
    l1_reg_weights_norm += sum(
        np.absolute(l1_reg_weights[var_name].flatten()))
    no_l1_reg_weights_norm += sum(
        np.absolute(no_l1_reg_weights[var_name].flatten()))
    print('Var name: %s, value: %s' %
          (var_name, no_l1_reg_weights[var_name].flatten()))
  self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)