def create_feature_columns():
  """Builds the (wide, deep) feature-column lists for a wide & deep model.

  Returns:
    A `(wide_columns, deep_columns)` tuple: wide columns are indicator
    (one-hot) and crossed columns; deep columns are embedding columns.
  """
  # `list(range(...))` instead of a redundant identity comprehension.
  age = vocabulary_column('age_level', list(range(1, 7)))
  gender = vocabulary_column('gender', [-1, 1])
  # Cross age x gender, hashed into 100 buckets.
  all_cat_cross = crossed_column([age, gender], hash_bucket_size=100)
  categorical_column = [indicator_column(age), indicator_column(gender)]
  crossed_columns = [indicator_column(all_cat_cross)]
  numerical_column = []  # no numeric features yet; kept for symmetry
  range_0_20 = list(range(0, 20))
  embedding_columns = [
      embedding_column(vocabulary_column("order_cnt", range_0_20), dimension=1),
      embedding_column(age, dimension=1),
      embedding_column(gender, dimension=1),
      embedding_column(all_cat_cross, dimension=10),
  ]
  wide_columns = categorical_column + crossed_columns
  deep_columns = numerical_column + embedding_columns
  return wide_columns, deep_columns
def test_embedding_column(
    self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
    expected_sequence_length):
  """SequenceFeatures with two embedding columns produces expected output.

  Parameterized: the sparse inputs and expected dense layer/sequence-length
  values come from the test decorator.
  """
  sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
  sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
  vocabulary_size = 3
  embedding_dimension_a = 2
  embedding_values_a = (
      (1., 2.),  # id 0
      (3., 4.),  # id 1
      (5., 6.)  # id 2
  )
  embedding_dimension_b = 3
  embedding_values_b = (
      (11., 12., 13.),  # id 0
      (14., 15., 16.),  # id 1
      (17., 18., 19.)  # id 2
  )

  def _get_initializer(embedding_dimension, embedding_values):
    # Returns an initializer that also verifies the requested shape/dtype.
    def _initializer(shape, dtype, partition_info=None):
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertEqual(dtypes.float32, dtype)
      self.assertIsNone(partition_info)
      return embedding_values

    return _initializer

  categorical_column_a = sfc.sequence_categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  embedding_column_a = fc.embedding_column(
      categorical_column_a,
      dimension=embedding_dimension_a,
      initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
  categorical_column_b = sfc.sequence_categorical_column_with_identity(
      key='bbb', num_buckets=vocabulary_size)
  embedding_column_b = fc.embedding_column(
      categorical_column_b,
      dimension=embedding_dimension_b,
      initializer=_get_initializer(embedding_dimension_b, embedding_values_b))

  # Test that columns are reordered alphabetically.
  sequence_input_layer = ksfc.SequenceFeatures(
      [embedding_column_b, embedding_column_a])
  input_layer, sequence_length = sequence_input_layer({
      'aaa': sparse_input_a,
      'bbb': sparse_input_b,
  })

  self.evaluate(variables_lib.global_variables_initializer())
  weights = sequence_input_layer.weights
  self.assertCountEqual(
      ('sequence_features/aaa_embedding/embedding_weights:0',
       'sequence_features/bbb_embedding/embedding_weights:0'),
      tuple([v.name for v in weights]))
  self.assertAllEqual(embedding_values_a, self.evaluate(weights[0]))
  self.assertAllEqual(embedding_values_b, self.evaluate(weights[1]))
  self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
  self.assertAllEqual(
      expected_sequence_length, self.evaluate(sequence_length))
def movielen_get_fc():
  """Builds (dnn_feature_columns, linear_feature_columns) for MovieLens.

  Each sparse feature becomes a hash-bucket column for the linear tower and
  an embedding of that column for the DNN tower.
  """
  sparse_features = ["movieId", "userId", 'imdbId', 'tmdbId', 'genres']
  # Observed cardinalities:
  # movieId 59047, userId 162541, imdbId 59047, tmdbId 61342, genres 21
  ln_embedding = [59047, 162541, 59047, 61342, 21]
  param.max_bucket = find_bucket_size(ln_embedding)
  print("TensorFlow, max_bucket_size = ", param.max_bucket)
  if param.max_bucket is not None:
    # Clamp every vocabulary size to the global bucket cap.
    ln_embedding = [min(size, param.max_bucket) for size in ln_embedding]

  dnn_feature_columns = []
  linear_feature_columns = []
  for feat, bucket in zip(sparse_features, ln_embedding):
    dnn_feature_columns.append(
        fc.embedding_column(
            fc.categorical_column_with_hash_bucket(
                feat, bucket, dtype=tf.string), 4))
    linear_feature_columns.append(
        fc.categorical_column_with_hash_bucket(feat, bucket, dtype=tf.string))
  return dnn_feature_columns, linear_feature_columns
def test_wide_deep_model_with_two_feature_columns(self):
  """Trains a WideDeepModel whose wide/deep towers use different columns.

  Wide tower: indicator (one-hot) column into a linear model.
  Deep tower: 5-dim embedding of the same vocab column into a dense layer.
  """
  vocab_list = ['alpha', 'beta', 'gamma']
  vocab_val = [0.4, 0.6, 0.9]
  data = np.random.choice(vocab_list, size=256)
  y = np.zeros_like(data, dtype=np.float32)
  # Target: per-vocab value plus small noise, so the model has a signal.
  for vocab, val in zip(vocab_list, vocab_val):
    indices = np.where(data == vocab)
    y[indices] = val + np.random.uniform(
        low=-0.01, high=0.01, size=indices[0].shape)
  cat_column = fc.categorical_column_with_vocabulary_list(
      key='symbol', vocabulary_list=vocab_list)
  ind_column = fc.indicator_column(cat_column)
  emb_column = fc.embedding_column(cat_column, dimension=5)
  linear_feature_layer = dense_features_v2.DenseFeatures([ind_column])
  linear_model = linear.LinearModel(
      use_bias=False, kernel_initializer='zeros')
  combined_linear = sequential.Sequential(
      [linear_feature_layer, linear_model])
  dnn_model = sequential.Sequential([core.Dense(units=1)])
  dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column])
  combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model])
  wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn)
  opt = gradient_descent.SGD(learning_rate=0.1)
  wide_deep_model.compile(
      opt,
      'mse', [],
      run_eagerly=testing_utils.should_run_eagerly(),
      experimental_run_tf_function=testing_utils.should_run_tf_function())
  wide_deep_model.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
  # Input widths: 3 (one-hot of 3 vocab entries) for the linear model,
  # 5 (embedding dimension) for the DNN.
  self.assertEqual(3, linear_model.inputs[0].shape[1])
  self.assertEqual(5, dnn_model.inputs[0].shape[1])
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  vocab_size = 32768
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, vocab_size - 1, dtype=int)

  # Keras implementation: embed ragged ids, then mean over the last axis.
  model = keras.Sequential()
  model.add(
      keras.Input(shape=(None,), ragged=True, name="data", dtype=dt.int64))
  model.add(keras.layers.Embedding(vocab_size, 256))
  model.add(keras.layers.Lambda(lambda t: math_ops.reduce_mean(t, axis=-1)))

  # Feature-column implementation of the same lookup.
  fc = fcv2.embedding_column(
      fcv2.categorical_column_with_identity(
          "data", num_buckets=vocab_size - 1),
      dimension=256)

  # Wrap the FC implementation in a tf.function for a fair comparison.
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs.
  keras_data = {"data": data}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_sparse()}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
def test_from_config(self, trainable, name):
  """DenseFeatures round-trips through get_config()/from_config()."""
  cols = [
      fc.numeric_column('a'),
      fc.embedding_column(
          fc.categorical_column_with_vocabulary_list(
              'b', vocabulary_list=['1', '2', '3']),
          dimension=2),
      fc.indicator_column(
          fc.categorical_column_with_hash_bucket(key='c', hash_bucket_size=3))
  ]
  orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name)
  config = orig_layer.get_config()

  new_layer = df.DenseFeatures.from_config(config)

  # Layer-level attributes survive serialization.
  self.assertEqual(new_layer.name, orig_layer.name)
  self.assertEqual(new_layer.trainable, trainable)
  self.assertLen(new_layer._feature_columns, 3)
  self.assertEqual(new_layer._feature_columns[0].name, 'a')
  # Embedding initializer (default truncated normal, mean 0) is preserved.
  self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0)
  self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b')
  self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__)
  self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__)
  self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__)
def test_sequence_length_with_empty_rows(self):
  """Tests _sequence_length when some examples do not have ids."""
  vocabulary_size = 3
  sparse_input = sparse_tensor.SparseTensorValue(
      # example 0, ids []
      # example 1, ids [2]
      # example 2, ids [0, 1]
      # example 3, ids []
      # example 4, ids [1]
      # example 5, ids []
      indices=((1, 0), (2, 0), (2, 1), (4, 0)),
      values=(2, 0, 1, 1),
      dense_shape=(6, 2))
  # Rows with no ids must report length 0 rather than being dropped.
  expected_sequence_length = [0, 1, 2, 0, 1, 0]

  categorical_column = sfc.sequence_categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  embedding_column = fc.embedding_column(
      categorical_column, dimension=2)

  _, sequence_length, _ = _get_sequence_dense_tensor_state(
      embedding_column, {'aaa': sparse_input})

  self.assertAllEqual(
      expected_sequence_length, self.evaluate(sequence_length))
def test_with_1d_unknown_shape_sparse_tensor(self):
  """DenseFeatures handles placeholders of unknown rank (graph mode)."""
  embedding_values = (
      (1., 2.),  # id 0
      (6., 7.),  # id 1
      (11., 12.)  # id 2
  )

  def _initializer(shape, dtype, partition_info=None):
    del shape, dtype, partition_info
    return embedding_values

  # price has 1 dimension in dense_features
  price = fc.numeric_column('price')

  # one_hot_body_style has 3 dims in dense_features.
  body_style = fc.categorical_column_with_vocabulary_list(
      'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
  one_hot_body_style = fc.indicator_column(body_style)

  # embedded_country has 2 dims in dense_features.
  country = fc.categorical_column_with_vocabulary_list(
      'country', vocabulary_list=['US', 'JP', 'CA'])
  embedded_country = fc.embedding_column(
      country, dimension=2, initializer=_initializer)

  # Provides 1-dim tensor and dense tensor.
  features = {
      'price': array_ops.placeholder(dtypes.float32),
      'body-style': array_ops.sparse_placeholder(dtypes.string),
      # This is dense tensor for the categorical_column.
      'country': array_ops.placeholder(dtypes.string),
  }
  # All placeholders deliberately have unknown rank.
  self.assertIsNone(features['price'].shape.ndims)
  self.assertIsNone(features['body-style'].get_shape().ndims)
  self.assertIsNone(features['country'].shape.ndims)

  price_data = np.array([11., 12.])
  body_style_data = sparse_tensor.SparseTensorValue(
      indices=((0,), (1,)),
      values=('sedan', 'hardtop'),
      dense_shape=(2,))
  country_data = np.array([['US'], ['CA']])

  net = df.DenseFeatures([price, one_hot_body_style,
                          embedded_country])(features)
  self.assertEqual(1 + 3 + 2, net.shape[1])
  with _initialized_session() as sess:
    # Each row is formed by concatenating `one_hot_body_style`,
    # `embedded_country`, and `price` in order.
    self.assertAllEqual(
        [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
        sess.run(
            net,
            feed_dict={
                features['price']: price_data,
                features['body-style']: body_style_data,
                features['country']: country_data
            }))
def test_get_sequence_dense_tensor(self, inputs_args, expected):
  """Sequence embedding lookup matches the parameterized expectation."""
  inputs = sparse_tensor.SparseTensorValue(**inputs_args)
  vocabulary_size = 3
  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.)  # id 2
  )

  def _initializer(shape, dtype, partition_info=None):
    # Also verifies the shape/dtype the layer requests for its weights.
    self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
    self.assertEqual(dtypes.float32, dtype)
    self.assertIsNone(partition_info)
    return embedding_values

  categorical_column = sfc.sequence_categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  embedding_column = fc.embedding_column(
      categorical_column,
      dimension=embedding_dimension,
      initializer=_initializer)

  embedding_lookup, _, state_manager = _get_sequence_dense_tensor_state(
      embedding_column, {'aaa': inputs})

  variables = state_manager._layer.weights
  self.evaluate(variables_lib.global_variables_initializer())
  self.assertCountEqual(('embedding_weights:0',),
                        tuple([v.name for v in variables]))
  self.assertAllEqual(embedding_values, self.evaluate(variables[0]))
  self.assertAllEqual(expected, self.evaluate(embedding_lookup))
def _build_feature_columns(self):
  """Returns (context_columns, sequence_columns) used by the tests."""
  int_ctx = fc.categorical_column_with_identity('int_ctx', num_buckets=100)
  ctx_cols = [
      fc.embedding_column(int_ctx, dimension=10),
      fc.numeric_column('float_ctx'),
  ]

  seq_identity = sfc.sequence_categorical_column_with_identity(
      'int_list', num_buckets=10)
  seq_bucket = sfc.sequence_categorical_column_with_hash_bucket(
      'bytes_list', hash_bucket_size=100)
  seq_cols = [
      fc.embedding_column(seq_identity, dimension=10),
      fc.embedding_column(seq_bucket, dimension=20),
  ]
  return ctx_cols, seq_cols
def _build_feature_columns(self):
  """Builds the context and sequence feature columns for parsing tests."""
  ctx_cols = [
      # 10-dim embedding of a 100-bucket identity column.
      fc.embedding_column(
          fc.categorical_column_with_identity('int_ctx', num_buckets=100),
          dimension=10),
      fc.numeric_column('float_ctx'),
  ]
  seq_cols = [
      fc.embedding_column(
          sfc.sequence_categorical_column_with_identity(
              'int_list', num_buckets=10),
          dimension=10),
      fc.embedding_column(
          sfc.sequence_categorical_column_with_hash_bucket(
              'bytes_list', hash_bucket_size=100),
          dimension=20),
  ]
  return ctx_cols, seq_cols
def test_with_1d_sparse_tensor(self):
  """DenseFeatures with 1-D dense and sparse constant inputs (graph mode)."""
  embedding_values = (
      (1., 2., 3., 4., 5.),  # id 0
      (6., 7., 8., 9., 10.),  # id 1
      (11., 12., 13., 14., 15.)  # id 2
  )

  def _initializer(shape, dtype, partition_info=None):
    del shape, dtype, partition_info
    return embedding_values

  # price has 1 dimension in dense_features
  price = fc.numeric_column('price')

  # one_hot_body_style has 3 dims in dense_features.
  body_style = fc.categorical_column_with_vocabulary_list(
      'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
  one_hot_body_style = fc.indicator_column(body_style)

  # embedded_country has 5 dims in dense_features.
  country = fc.categorical_column_with_vocabulary_list(
      'country', vocabulary_list=['US', 'JP', 'CA'])
  embedded_country = fc.embedding_column(
      country, dimension=5, initializer=_initializer)

  with ops.Graph().as_default():
    # Provides 1-dim tensor and dense tensor.
    features = {
        'price': constant_op.constant([
            11.,
            12.,
        ]),
        'body-style': sparse_tensor.SparseTensor(
            indices=((0,), (1,)),
            values=('sedan', 'hardtop'),
            dense_shape=(2,)),
        # This is dense tensor for the categorical_column.
        'country': constant_op.constant(['CA', 'US']),
    }
    self.assertEqual(1, features['price'].shape.ndims)
    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
    self.assertEqual(1, features['country'].shape.ndims)

    net = df.DenseFeatures(
        [price, one_hot_body_style, embedded_country])(features)
    self.assertEqual(1 + 3 + 5, net.shape[1])
    with _initialized_session() as sess:
      # Each row is formed by concatenating `one_hot_body_style`,
      # `embedded_country`, and `price` in order.
      self.assertAllEqual(
          [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
          sess.run(net))
def test_dense_feature_with_partitioner(self):
  """A fixed-size partitioner splits the embedding variable into two parts."""
  with context.eager_mode():
    sparse_input = sparse_tensor.SparseTensor(
        indices=((0, 0), (1, 0), (2, 0), (3, 0)),
        values=(0, 1, 3, 2),
        dense_shape=(4, 4))

    # Create feature columns (categorical and embedding).
    categorical_column = fc.categorical_column_with_identity(
        key='a', num_buckets=4)
    embedding_dimension = 2

    def _embedding_column_initializer(shape, dtype, partition_info=None):
      # Each of the two partitions supplies its own 2x2 weight slice.
      offset = partition_info._var_offset[0]
      del shape  # unused
      del dtype  # unused
      if offset == 0:
        embedding_values = (
            (1, 0),  # id 0
            (0, 1))  # id 1
      else:
        embedding_values = (
            (1, 1),  # id 2
            (2, 2))  # id 3
      return embedding_values

    embedding_column = fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_embedding_column_initializer)

    dense_features = df.DenseFeatures(
        [embedding_column],
        partitioner=partitioned_variables.fixed_size_partitioner(2))
    features = {'a': sparse_input}

    inputs = dense_features(features)
    variables = dense_features.variables

    # Sanity check: test that the inputs are correct.
    self.assertAllEqual([[1, 0], [0, 1], [2, 2], [1, 1]], inputs)

    # Check that the embedding variable was split into two partitions.
    self.assertEqual(2, len(variables))
    # Check that invoking dense_features on the same features does not create
    # additional variables
    _ = dense_features(features)
    self.assertEqual(2, len(variables))
    self.assertIs(variables[0], dense_features.variables[0])
    self.assertIs(variables[1], dense_features.variables[1])
def _replace_edl_embedding_column_with_tf(dense_features_layer):
    """Rebuilds a DenseFeatures layer, swapping ElasticDL embedding columns
    for their native TF equivalents; all other columns are kept as-is."""
    new_feature_columns = []
    for column in dense_features_layer._feature_columns:
        if not isinstance(column, EmbeddingColumn):
            # Not an ElasticDL embedding column; keep unchanged.
            new_feature_columns.append(column)
            continue
        logger.info("Replace embedding_column {} from ElasticDL "
                    "version to TF version".format(column.name))
        new_feature_columns.append(
            fc_lib.embedding_column(
                column.categorical_column, dimension=column.dimension))
    return tf.keras.layers.DenseFeatures(
        feature_columns=new_feature_columns,
        name=dense_features_layer.name)
def test_sequence_length(self, inputs_args, expected_sequence_length):
  """Verifies sequence-length values and dtype for parameterized inputs."""
  sparse_in = sparse_tensor.SparseTensorValue(**inputs_args)
  seq_column = sfc.sequence_categorical_column_with_identity(
      key='aaa', num_buckets=3)
  emb_column = fc.embedding_column(seq_column, dimension=2)

  _, sequence_length, _ = _get_sequence_dense_tensor_state(
      emb_column, {'aaa': sparse_in})

  actual = self.evaluate(sequence_length)
  self.assertAllEqual(expected_sequence_length, actual)
  # Sequence lengths are always emitted as int64.
  self.assertEqual(np.int64, actual.dtype)
def test_get_config(self, trainable, name):
  """get_config() captures name, trainable flag, and column configs."""
  columns = [
      fc.numeric_column('a'),
      fc.embedding_column(
          fc.categorical_column_with_identity(key='b', num_buckets=3),
          dimension=2),
  ]
  layer = fc.DenseFeatures(columns, trainable=trainable, name=name)
  config = layer.get_config()

  self.assertEqual(config['name'], layer.name)
  self.assertEqual(config['trainable'], trainable)

  serialized_cols = config['feature_columns']
  self.assertLen(serialized_cols, 2)
  self.assertEqual(serialized_cols[0]['class_name'], 'NumericColumn')
  self.assertEqual(serialized_cols[0]['config']['shape'], (1,))
  self.assertEqual(serialized_cols[1]['class_name'], 'EmbeddingColumn')
def test_feature_column_dense_features_gradient(self):
  """Gradients flow through a DenseFeatures embedding lookup (eager)."""
  with context.eager_mode():
    sparse_input = sparse_tensor.SparseTensor(
        indices=((0, 0), (1, 0), (2, 0)),
        values=(0, 1, 2),
        dense_shape=(3, 3))

    # Create feature columns (categorical and embedding).
    categorical_column = fc.categorical_column_with_identity(
        key='a', num_buckets=3)
    embedding_dimension = 2

    def _embedding_column_initializer(shape, dtype, partition_info=None):
      del shape  # unused
      del dtype  # unused
      del partition_info  # unused
      embedding_values = (
          (1, 0),  # id 0
          (0, 1),  # id 1
          (1, 1))  # id 2
      return embedding_values

    embedding_column = fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_embedding_column_initializer)

    dense_features = df.DenseFeatures([embedding_column])
    features = {'a': sparse_input}

    def scale_matrix():
      matrix = dense_features(features)
      return 2 * matrix

    # Sanity check: Verify that scale_matrix returns the correct output.
    self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())

    # Check that the returned gradient is correct.
    grad_function = backprop.implicit_grad(scale_matrix)
    grads_and_vars = grad_function()
    indexed_slice = grads_and_vars[0][0]
    gradient = grads_and_vars[0][0].values

    self.assertAllEqual([0, 1, 2], indexed_slice.indices)
    # d(2x)/dx == 2 for every looked-up embedding row.
    self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
def test_reuses_variables(self):
  """Repeated calls to a DenseFeatures layer reuse the embedding variable."""
  with context.eager_mode():
    sparse_input = sparse_tensor.SparseTensor(
        indices=((0, 0), (1, 0), (2, 0)),
        values=(0, 1, 2),
        dense_shape=(3, 3))

    # Create feature columns (categorical and embedding).
    categorical_column = fc.categorical_column_with_identity(
        key='a', num_buckets=3)
    embedding_dimension = 2

    def _embedding_column_initializer(shape, dtype, partition_info=None):
      del shape  # unused
      del dtype  # unused
      del partition_info  # unused
      embedding_values = (
          (1, 0),  # id 0
          (0, 1),  # id 1
          (1, 1))  # id 2
      return embedding_values

    embedding_column = fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_embedding_column_initializer)

    dense_features = df.DenseFeatures([embedding_column])
    features = {'a': sparse_input}

    inputs = dense_features(features)
    variables = dense_features.variables

    # Sanity check: test that the inputs are correct.
    self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)

    # Check that only one variable was created.
    self.assertEqual(1, len(variables))

    # Check that invoking dense_features on the same features does not create
    # additional variables
    _ = dense_features(features)
    self.assertEqual(1, len(variables))
    self.assertEqual(variables[0], dense_features.variables[0])
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  vocab_size = 32768
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, vocab_size - 1, dtype=int)
  weight = array_ops.ones_like_v2(data, dtype=dt.float32)

  # Keras implementation: weighted sum of embedded ragged ids.
  data_input = keras.Input(
      shape=(None,), ragged=True, name="data", dtype=dt.int64)
  weight_input = keras.Input(
      shape=(None,), ragged=True, name="weight", dtype=dt.float32)
  embedded = keras.layers.Embedding(vocab_size, 256)(data_input)
  weighted = math_ops.multiply(
      embedded, array_ops.expand_dims(weight_input, -1))
  summed = math_ops.reduce_sum(weighted, axis=1)
  model = keras.Model([data_input, weight_input], summed)

  # Feature-column implementation of the same weighted lookup.
  fc = fcv2.embedding_column(
      fcv2.weighted_categorical_column(
          fcv2.categorical_column_with_identity(
              "data", num_buckets=vocab_size - 1),
          weight_feature_key="weight"),
      dimension=256)

  # Wrap the FC implementation in a tf.function for a fair comparison.
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs.
  keras_data = {"data": data, "weight": weight}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_sparse(), "weight": weight.to_sparse()}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
def DISABLED_test_train_with_dense_features_v2(self):
  """Trains an estimator converted from a Keras model with DenseFeatures v2.

  Verifies that embedding weights are tracked per-column (name-scoped)
  rather than under a single shared 'embedding_weights' dependency.
  """
  feature_dict = {
      'sex': np.int64([1, 1, 1, 1, 0]),
      'cp': np.int64([0, 3, 3, 2, 1]),
      'slope': np.int64([3, 2, 0, 3, 1]),
  }
  label = np.int64([0, 1, 0, 0, 0])
  train_input_fn = numpy_io.numpy_input_fn(
      x=feature_dict, y=label, num_epochs=1, shuffle=False)
  feature_columns = list()
  input_features = dict()
  for feature_name, data_array in feature_dict.items():
    # One bucket per distinct observed value for each feature.
    feature_columns.append(
        feature_column.embedding_column(
            feature_column.categorical_column_with_identity(
                key=feature_name,
                num_buckets=np.size(np.unique(data_array))),
            dimension=3))
    input_features[feature_name] = keras.layers.Input(
        name=feature_name,
        shape=(np.size(np.unique(data_array)),),
        dtype=dtypes.int64)

  df = dense_features_v2.DenseFeatures(feature_columns)
  x = df(input_features)
  x = keras.layers.Dense(16, activation='relu')(x)
  logits = keras.layers.Dense(1, activation='linear')(x)
  model = keras.Model(inputs=input_features, outputs=logits)
  model.compile(
      optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
  estimator_model = keras_lib.model_to_estimator(keras_model=model)
  estimator_model.train(input_fn=train_input_fn, steps=5)

  # We assert that we find the embedding_weights variables in the dependencies
  # for the DenseFeatures layer.
  dependency_names = [x.name for x in df._checkpoint_dependencies]
  self.assertNotIn('embedding_weights', dependency_names)
  self.assertIn('cp_embedding/embedding_weights', dependency_names)
  self.assertIn('sex_embedding/embedding_weights', dependency_names)
  self.assertIn('slope_embedding/embedding_weights', dependency_names)
def test_embedding_column_with_non_sequence_categorical(self):
  """Tests that error is raised for non-sequence embedding column."""
  sparse_input = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      indices=((0, 0), (1, 0), (1, 1)),
      values=(2, 0, 1),
      dense_shape=(2, 2))

  # Deliberately a *non*-sequence categorical column: SequenceFeatures
  # must reject it.
  non_seq_column = fc.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  emb_column = fc.embedding_column(non_seq_column, dimension=2)
  layer = ksfc.SequenceFeatures([emb_column])

  with self.assertRaisesRegex(
      ValueError,
      r'In embedding_column: aaa_embedding\. categorical_column must be of '
      r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
    _, _ = layer({'aaa': sparse_input})
def testFeatureColumns(self):
  """Saves and reloads a Keras model built on feature columns (eager only)."""
  # TODO(b/120099662): Error with table initialization with Keras models in
  # graph mode.
  if context.executing_eagerly():
    numeric = fc.numeric_column('a')
    bucketized = fc.bucketized_column(numeric, boundaries=[5, 10, 15])
    cat_vocab = fc.categorical_column_with_vocabulary_list(
        'b', ['1', '2', '3'])
    one_hot = fc.indicator_column(cat_vocab)
    embedding = fc.embedding_column(cat_vocab, dimension=8)
    feature_layer = DenseFeatures([bucketized, one_hot, embedding])
    model = keras.models.Sequential(feature_layer)

    features = {'a': np.array([13, 15]), 'b': np.array(['1', '2'])}
    predictions = model.predict(features)

    saved_model_dir = self._save_model_dir()
    model.save(saved_model_dir, save_format='tf')
    loaded = keras_load.load(saved_model_dir)
    loaded_predictions = loaded.predict(features)
    # The reloaded SavedModel must reproduce the original predictions.
    self.assertAllClose(predictions, loaded_predictions)
def criteo_get_fc():
    """Builds (dnn_feature_columns, linear_feature_columns) for Criteo.

    The 13 numeric features I0..I12 are shared by both towers; each of the
    26 categorical features C0..C25 contributes its raw hash-bucket column
    to the linear tower and an embedding of that same column to the DNN.
    """
    # Per-feature vocabulary sizes for C0..C25.
    ln_embedding = [
        1461, 584, 10131227, 2202608, 306, 24, 12518, 634, 4, 93146, 5684,
        8351593, 3195, 28, 14993, 5461306, 11, 5653, 2173, 4, 7046547, 18, 16,
        286181, 105, 142572
    ]
    param.max_bucket = find_bucket_size(ln_embedding)
    print("TensorFlow, max_bucket_size = ", param.max_bucket)
    if param.max_bucket is not None:
        # Clamp each vocabulary to the global bucket cap.
        ln_embedding = [min(each, param.max_bucket) for each in ln_embedding]

    # 13 numeric ("dense") features, shared by both towers.
    dense = [
        fc.numeric_column("I{}".format(i), dtype=tf.int64, default_value=0)
        for i in range(13)
    ]
    dnn_feature_columns = list(dense)
    linear_feature_columns = list(dense)

    # Build each hash-bucket column once and share it between towers
    # (the original constructed an identical column twice per feature).
    for i in range(26):
        ids = fc.categorical_column_with_hash_bucket(
            "C{}".format(i), ln_embedding[i], dtype=tf.int64)
        linear_feature_columns.append(ids)
        dnn_feature_columns.append(
            fc.embedding_column(ids, param.embedding_size))
    return dnn_feature_columns, linear_feature_columns
def test_multiple_layers_with_same_embedding_column(self):
  """Two DenseFeatures layers sharing a column create separate variables."""
  some_sparse_column = fc.categorical_column_with_hash_bucket(
      'sparse_feature', hash_bucket_size=5)
  some_embedding_column = fc.embedding_column(
      some_sparse_column, dimension=10)

  with ops.Graph().as_default():
    features = {
        'sparse_feature': [['a'], ['x']],
    }
    all_cols = [some_embedding_column]
    df.DenseFeatures(all_cols)(features)
    df.DenseFeatures(all_cols)(features)
    # Make sure that 2 variables get created in this case.
    self.assertEqual(
        2, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
    # Each layer gets its own name scope, hence its own weights.
    expected_var_names = [
        'dense_features/sparse_feature_embedding/embedding_weights:0',
        'dense_features_1/sparse_feature_embedding/embedding_weights:0'
    ]
    self.assertItemsEqual(
        expected_var_names,
        [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
def avazu_get_fc():
    """Builds (dnn_feature_columns, linear_feature_columns) for Avazu.

    Every sparse feature contributes its raw hash-bucket column to the
    linear tower and an embedding of that same column to the DNN tower.
    """
    # nr samples 32747463
    ln_embedding = [
        40428967, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251,
        5, 4, 2626, 8, 9, 435, 4, 68, 172, 60
    ]
    param.max_bucket = find_bucket_size(ln_embedding)
    print("TensorFlow, max_bucket_size = ", param.max_bucket)
    if param.max_bucket is not None:
        # Clamp each vocabulary to the global bucket cap.
        ln_embedding = [min(each, param.max_bucket) for each in ln_embedding]

    sparse_features = [
        'id', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
        'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
        'device_model', 'device_type', 'device_conn_type',
    ] + ['C' + str(i) for i in range(14, 22)]

    dnn_feature_columns = []
    linear_feature_columns = []
    # Build each hash-bucket column once and share it between towers
    # (the original constructed an identical column twice per feature).
    for feat, bucket in zip(sparse_features, ln_embedding):
        ids = fc.categorical_column_with_hash_bucket(
            feat, bucket, dtype=tf.string)
        linear_feature_columns.append(ids)
        dnn_feature_columns.append(
            fc.embedding_column(ids, param.embedding_size))
    return dnn_feature_columns, linear_feature_columns
def make_feature_config(num_players):
    """Builds the FeatureConfig describing all poker model inputs/targets.

    Args:
        num_players: number of seats at the table; sizes the per-player
            vector features (stack sizes, masks, amount-to-call, ...).

    Returns:
        A FeatureConfig with context/sequence features and targets.
    """
    return FeatureConfig(
        # Per-hand (non-sequence) input features.
        context_features=[
            fc.numeric_column(
                "public_context__starting_stack_sizes",
                shape=num_players,
                dtype=tf.int64,
            ),
            # 1326 = number of distinct two-card starting hands.
            fc.embedding_column(
                tf.feature_column.categorical_column_with_vocabulary_list(
                    "private_context__hand_encoded", range(1326)),
                dimension=4,
            ),
        ],
        # Per-timestep (sequence) input features; default_value=-1 marks
        # missing steps, and make_float casts the ints for the model.
        sequence_features=[
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "last_action__action_encoded", 22)),
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "last_action__move", 5)),
            sfc.sequence_numeric_column(
                "last_action__amount_added",
                dtype=tf.int64,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_added_percent_of_remaining",
                dtype=tf.float32,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_raised",
                dtype=tf.int64,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_raised_percent_of_pot",
                dtype=tf.float32,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__all_in_player_mask",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__stack_sizes",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__amount_to_call",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__current_player_mask",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__min_raise_amount",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__pot_size",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__street",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__is_current_player",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__current_player_offset",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "player_state__current_hand_type", 9)),
            sfc.sequence_numeric_column(
                "player_state__win_odds",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_better",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_tied",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_worse",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_better_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_tied_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_worse_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
        ],
        # Per-hand training targets.
        context_targets=[
            fc.numeric_column("public_context__num_players",
                              shape=1,
                              dtype=tf.int64),
        ],
        # Per-timestep training targets.
        sequence_targets=[
            sfc.sequence_numeric_column("next_action__action_encoded",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("reward__cumulative_reward",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("public_state__pot_size",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("player_state__is_current_player",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("public_state__num_players_remaining",
                                        dtype=tf.int64,
                                        default_value=-1),
        ],
    )
def test_dense_features(self, use_safe_embedding_lookup, partition_variables):
  """End-to-end check of fc.embedding_column through df.DenseFeatures.

  Parameterized over use_safe_embedding_lookup (whether empty rows are
  filled via SparseFillEmptyRows) and partition_variables (whether the
  embedding table is split across a fixed-size partitioner).
  """
  # Inputs: a 4x5 sparse id batch over a 4-entry vocabulary.
  vocabulary_size = 4
  sparse_input = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      # example 2, ids []
      # example 3, ids [1]
      indices=((0, 0), (1, 0), (1, 4), (3, 0)),
      values=(2, 0, 1, 1),
      dense_shape=(4, 5))

  # Embedding variable: one fixed row per vocabulary id so the expected
  # lookups below can be computed by hand.
  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.),  # id 2
      (9., 13.)  # id 3
  )

  def _initializer(shape, dtype, partition_info=None):
    # Doubles as an assertion hook: verifies the shape the layer requests.
    # Under partitioning each of the 2 shards asks for 2 rows.
    if partition_variables:
      self.assertEqual([vocabulary_size, embedding_dimension],
                       partition_info.full_shape)
      self.assertAllEqual((2, embedding_dimension), shape)
    else:
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertIsNone(partition_info)
    self.assertEqual(dtypes.float32, dtype)
    return embedding_values

  # Expected lookup result, using combiner='mean'.
  expected_lookups = (
      # example 0, ids [2], embedding = [7, 11]
      (7., 11.),
      # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
      (2., 3.5),
      # example 2, ids [], embedding = [0, 0]
      (0., 0.),
      # example 3, ids [1], embedding = [3, 5]
      (3., 5.),
  )

  # Build columns.
  categorical_column = fc.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  partitioner = None
  if partition_variables:
    partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0)
  # The layer call must happen inside the 'vars' scope: the embedding
  # variables are created during the call, and the name assertions below
  # expect the 'vars/' prefix.
  with variable_scope.variable_scope('vars', partitioner=partitioner):
    embedding_column = fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer,
        use_safe_embedding_lookup=use_safe_embedding_lookup)

    # Provide sparse input and get dense result.
    l = df.DenseFeatures((embedding_column, ))
    dense_features = l({'aaa': sparse_input})

  # Assert expected embedding variable and lookups.
  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
  if partition_variables:
    self.assertCountEqual((
        'vars/dense_features/aaa_embedding/embedding_weights/part_0:0',
        'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'
    ), tuple([v.name for v in global_vars]))
  else:
    self.assertCountEqual(
        ('vars/dense_features/aaa_embedding/embedding_weights:0', ),
        tuple([v.name for v in global_vars]))
  for v in global_vars:
    self.assertIsInstance(v, variables_lib.Variable)
  # The embedding weights must also be trainable (contrast with the
  # trainable=False test below... elsewhere in this file).
  trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
  if partition_variables:
    self.assertCountEqual((
        'vars/dense_features/aaa_embedding/embedding_weights/part_0:0',
        'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'
    ), tuple([v.name for v in trainable_vars]))
  else:
    self.assertCountEqual(
        ('vars/dense_features/aaa_embedding/embedding_weights:0', ),
        tuple([v.name for v in trainable_vars]))

  self.evaluate(variables_lib.global_variables_initializer())
  self.evaluate(lookup_ops.tables_initializer())

  # NOTE(review): when partition_variables is True, trainable_vars[0] is
  # only the first 2-row shard while embedding_values has 4 rows — this
  # comparison looks like it would fail in the partitioned case; confirm
  # whether the partitioned variant is actually exercised here.
  self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0]))
  self.assertAllEqual(expected_lookups, self.evaluate(dense_features))

  # The safe lookup path is implemented via SparseFillEmptyRows; its
  # presence in the graph distinguishes the two modes.
  if use_safe_embedding_lookup:
    self.assertIn(
        'SparseFillEmptyRows',
        [x.type for x in ops.get_default_graph().get_operations()])
  else:
    self.assertNotIn(
        'SparseFillEmptyRows',
        [x.type for x in ops.get_default_graph().get_operations()])
def test_dense_features_not_trainable(self): # Inputs. vocabulary_size = 3 sparse_input = sparse_tensor.SparseTensorValue( # example 0, ids [2] # example 1, ids [0, 1] # example 2, ids [] # example 3, ids [1] indices=((0, 0), (1, 0), (1, 4), (3, 0)), values=(2, 0, 1, 1), dense_shape=(4, 5)) # Embedding variable. embedding_dimension = 2 embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 (7., 11.) # id 2 ) def _initializer(shape, dtype, partition_info=None): self.assertAllEqual((vocabulary_size, embedding_dimension), shape) self.assertEqual(dtypes.float32, dtype) self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. expected_lookups = ( # example 0, ids [2], embedding = [7, 11] (7., 11.), # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] (2., 3.5), # example 2, ids [], embedding = [0, 0] (0., 0.), # example 3, ids [1], embedding = [3, 5] (3., 5.), ) # Build columns. categorical_column = fc.categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) embedding_column = fc.embedding_column(categorical_column, dimension=embedding_dimension, initializer=_initializer, trainable=False) # Provide sparse input and get dense result. dense_features = df.DenseFeatures((embedding_column, ))({ 'aaa': sparse_input }) # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) self.assertCountEqual( ('dense_features/aaa_embedding/embedding_weights:0', ), tuple([v.name for v in global_vars])) self.assertCountEqual([], ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES)) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) self.assertAllEqual(expected_lookups, self.evaluate(dense_features))