def test_multiple_layers_with_same_shared_embedding_column(self): categorical_column_a = fc.categorical_column_with_identity( key='aaa', num_buckets=3) categorical_column_b = fc.categorical_column_with_identity( key='bbb', num_buckets=3) embedding_dimension = 2 embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2( [categorical_column_b, categorical_column_a], dimension=embedding_dimension) with ops.Graph().as_default(): features = { 'aaa': sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2)), 'bbb': sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (1, 1)), values=(1, 2, 1), dense_shape=(2, 2)), } all_cols = [embedding_column_a, embedding_column_b] df.DenseFeatures(all_cols)(features) df.DenseFeatures(all_cols)(features) # Make sure that only 1 variable gets created in this case. self.assertEqual( 1, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) self.assertItemsEqual(['aaa_bbb_shared_embedding:0'], [ v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) ])
def test_from_config(self, trainable, name): cols = [ fc.numeric_column('a'), fc.embedding_column(fc.categorical_column_with_vocabulary_list( 'b', vocabulary_list=['1', '2', '3']), dimension=2), fc.indicator_column( fc.categorical_column_with_hash_bucket(key='c', hash_bucket_size=3)) ] orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name) config = orig_layer.get_config() new_layer = df.DenseFeatures.from_config(config) self.assertEqual(new_layer.name, orig_layer.name) self.assertEqual(new_layer.trainable, trainable) self.assertLen(new_layer._feature_columns, 3) self.assertEqual(new_layer._feature_columns[0].name, 'a') self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__) self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__) self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__)
def test_saving_with_dense_features(self): cols = [ feature_column_lib.numeric_column('a'), feature_column_lib.indicator_column( feature_column_lib.categorical_column_with_vocabulary_list( 'b', ['one', 'two'])) ] input_layers = { 'a': keras.layers.Input(shape=(1,), name='a'), 'b': keras.layers.Input(shape=(1,), name='b', dtype='string') } fc_layer = dense_features.DenseFeatures(cols)(input_layers) output = keras.layers.Dense(10)(fc_layer) model = keras.models.Model(input_layers, output) model.compile( loss=keras.losses.MSE, optimizer='rmsprop', metrics=[keras.metrics.categorical_accuracy]) config = model.to_json() loaded_model = model_config.model_from_json(config) inputs_a = np.arange(10).reshape(10, 1) inputs_b = np.arange(10).reshape(10, 1).astype('str') with self.cached_session(): # Initialize tables for V1 lookup. if not context.executing_eagerly(): self.evaluate(lookup_ops.tables_initializer()) self.assertLen(loaded_model.predict({'a': inputs_a, 'b': inputs_b}), 10)
def test_shared_sequence_non_sequence_into_input_layer(self): non_seq = fc.categorical_column_with_identity('non_seq', num_buckets=10) seq = sfc.sequence_categorical_column_with_identity('seq', num_buckets=10) shared_non_seq, shared_seq = fc.shared_embedding_columns_v2( [non_seq, seq], dimension=4, combiner='sum', initializer=init_ops_v2.Ones(), shared_embedding_collection_name='shared') seq = sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]], values=[0, 1, 2], dense_shape=[2, 2]) non_seq = sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]], values=[0, 1, 2], dense_shape=[2, 2]) features = {'seq': seq, 'non_seq': non_seq} # Tile the context features across the sequence features seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features) non_seq_input = dense_features.DenseFeatures([shared_non_seq ])(features) with self.cached_session() as sess: sess.run(variables.global_variables_initializer()) output_seq, output_seq_length, output_non_seq = sess.run( [seq_input, seq_length, non_seq_input]) self.assertAllEqual( output_seq, [[[1, 1, 1, 1], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0]]]) self.assertAllEqual(output_seq_length, [2, 1]) self.assertAllEqual(output_non_seq, [[2, 2, 2, 2], [1, 1, 1, 1]])
def test_with_1d_unknown_shape_sparse_tensor(self): embedding_values = ( (1., 2.), # id 0 (6., 7.), # id 1 (11., 12.) # id 2 ) def _initializer(shape, dtype, partition_info=None): del shape, dtype, partition_info return embedding_values # price has 1 dimension in dense_features price = fc.numeric_column('price') # one_hot_body_style has 3 dims in dense_features. body_style = fc.categorical_column_with_vocabulary_list( 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) one_hot_body_style = fc.indicator_column(body_style) # embedded_body_style has 5 dims in dense_features. country = fc.categorical_column_with_vocabulary_list( 'country', vocabulary_list=['US', 'JP', 'CA']) embedded_country = fc.embedding_column(country, dimension=2, initializer=_initializer) # Provides 1-dim tensor and dense tensor. features = { 'price': array_ops.placeholder(dtypes.float32), 'body-style': array_ops.sparse_placeholder(dtypes.string), # This is dense tensor for the categorical_column. 'country': array_ops.placeholder(dtypes.string), } self.assertIsNone(features['price'].shape.ndims) self.assertIsNone(features['body-style'].get_shape().ndims) self.assertIsNone(features['country'].shape.ndims) price_data = np.array([11., 12.]) body_style_data = sparse_tensor.SparseTensorValue(indices=((0, ), (1, )), values=('sedan', 'hardtop'), dense_shape=(2, )) country_data = np.array([['US'], ['CA']]) net = df.DenseFeatures([price, one_hot_body_style, embedded_country])(features) self.assertEqual(1 + 3 + 2, net.shape[1]) with _initialized_session() as sess: # Each row is formed by concatenating `embedded_body_style`, # `one_hot_body_style`, and `price` in order. self.assertAllEqual( [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]], sess.run(net, feed_dict={ features['price']: price_data, features['body-style']: body_style_data, features['country']: country_data }))
def test_should_be_dense_column(self): with self.assertRaisesRegex(ValueError, 'must be a .*DenseColumn'): df.DenseFeatures(feature_columns=[ fc.categorical_column_with_hash_bucket('wire_cast', 4) ])(features={ 'a': [[0]] })
def test_column_order(self): price_a = fc.numeric_column('price_a') price_b = fc.numeric_column('price_b') with ops.Graph().as_default(): features = { 'price_a': [[1.]], 'price_b': [[3.]], } net1 = df.DenseFeatures([price_a, price_b])(features) net2 = df.DenseFeatures([price_b, price_a])(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 3.]], self.evaluate(net1)) self.assertAllClose([[1., 3.]], self.evaluate(net2))
def test_does_not_support_dict_columns(self): with self.assertRaisesRegex( ValueError, 'Expected feature_columns to be iterable, found dict.'): df.DenseFeatures(feature_columns={'a': fc.numeric_column('a')})( features={ 'a': [[0]] })
def test_raises_if_shape_mismatch(self): price = fc.numeric_column('price', shape=2) with ops.Graph().as_default(): features = {'price': [[1.], [5.]]} with self.assertRaisesRegex( Exception, r'Cannot reshape a tensor with 2 elements to shape \[2,2\]' ): df.DenseFeatures([price])(features)
def test_raises_if_duplicate_name(self): with self.assertRaisesRegex( ValueError, 'Duplicate feature column name found for columns'): df.DenseFeatures(feature_columns=[ fc.numeric_column('a'), fc.numeric_column('a') ])(features={ 'a': [[0]] })
def test_bare_column(self): with ops.Graph().as_default(): features = features = {'a': [0.]} net = df.DenseFeatures(fc.numeric_column('a'))(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[0.]], self.evaluate(net))
def test_save_load_with_dense_features(self, tmpdir, api, loss, optimizer, metrics): if optimizer is None: pytest.skip() cols = [ feature_column_lib.numeric_column("a"), feature_column_lib.indicator_column( feature_column_lib.categorical_column_with_vocabulary_list( "b", ["one", "two"])), ] input_layers = { "a": keras.layers.Input(shape=(1, ), name="a"), "b": keras.layers.Input(shape=(1, ), name="b", dtype="string"), } fc_layer = dense_features.DenseFeatures(cols)(input_layers) output = keras.layers.Dense(10)(fc_layer) model = keras.models.Model(input_layers, output) model.compile( loss=loss, optimizer=optimizer, metrics=[metrics], ) tiledb_uri = os.path.join(tmpdir, "model_array") tiledb_model_obj = TensorflowKerasTileDBModel(uri=tiledb_uri, model=model) tiledb_model_obj.save(include_optimizer=True) loaded_model = tiledb_model_obj.load(compile_model=True) model_opt_weights = batch_get_value(getattr(model.optimizer, "weights")) loaded_opt_weights = batch_get_value( getattr(loaded_model.optimizer, "weights")) # Assert optimizer weights are equal for weight_model, weight_loaded_model in zip(model_opt_weights, loaded_opt_weights): np.testing.assert_array_equal(weight_model, weight_loaded_model) inputs_a = np.arange(10).reshape(10, 1) inputs_b = np.arange(10).reshape(10, 1).astype("str") # Assert model predictions are equal np.testing.assert_array_equal( loaded_model.predict({ "a": inputs_a, "b": inputs_b }), model.predict({ "a": inputs_a, "b": inputs_b }), )
def test_reshaping(self): price = fc.numeric_column('price', shape=[1, 2]) with ops.Graph().as_default(): features = {'price': [[[1., 2.]], [[5., 6.]]]} net = df.DenseFeatures([price])(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
def test_column_generator(self): with ops.Graph().as_default(): features = features = {'a': [0.], 'b': [1.]} columns = (fc.numeric_column(key) for key in features) net = df.DenseFeatures(columns)(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[0., 1.]], self.evaluate(net))
def test_with_1d_sparse_tensor(self): embedding_values = ( (1., 2., 3., 4., 5.), # id 0 (6., 7., 8., 9., 10.), # id 1 (11., 12., 13., 14., 15.) # id 2 ) def _initializer(shape, dtype, partition_info=None): del shape, dtype, partition_info return embedding_values # price has 1 dimension in dense_features price = fc.numeric_column('price') # one_hot_body_style has 3 dims in dense_features. body_style = fc.categorical_column_with_vocabulary_list( 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) one_hot_body_style = fc.indicator_column(body_style) # embedded_body_style has 5 dims in dense_features. country = fc.categorical_column_with_vocabulary_list( 'country', vocabulary_list=['US', 'JP', 'CA']) embedded_country = fc.embedding_column(country, dimension=5, initializer=_initializer) # Provides 1-dim tensor and dense tensor. features = { 'price': constant_op.constant([ 11., 12., ]), 'body-style': sparse_tensor.SparseTensor(indices=((0, ), (1, )), values=('sedan', 'hardtop'), dense_shape=(2, )), # This is dense tensor for the categorical_column. 'country': constant_op.constant(['CA', 'US']), } self.assertEqual(1, features['price'].shape.ndims) self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0]) self.assertEqual(1, features['country'].shape.ndims) net = df.DenseFeatures([price, one_hot_body_style, embedded_country])(features) self.assertEqual(1 + 3 + 5, net.shape[1]) with _initialized_session() as sess: # Each row is formed by concatenating `embedded_body_style`, # `one_hot_body_style`, and `price` in order. self.assertAllEqual([[0., 0., 1., 11., 12., 13., 14., 15., 11.], [1., 0., 0., 1., 2., 3., 4., 5., 12.]], sess.run(net))
def test_fails_for_categorical_column(self): animal = fc.categorical_column_with_identity('animal', num_buckets=4) with ops.Graph().as_default(): features = { 'animal': sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) } with self.assertRaisesRegex(Exception, 'must be a .*DenseColumn'): df.DenseFeatures([animal])(features)
def test_dense_feature_with_partitioner(self): with context.eager_mode(): sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (2, 0), (3, 0)), values=(0, 1, 3, 2), dense_shape=(4, 4)) # Create feature columns (categorical and embedding). categorical_column = fc.categorical_column_with_identity( key='a', num_buckets=4) embedding_dimension = 2 def _embedding_column_initializer(shape, dtype, partition_info=None): offset = partition_info._var_offset[0] del shape # unused del dtype # unused if offset == 0: embedding_values = ( (1, 0), # id 0 (0, 1)) # id 1 else: embedding_values = ( (1, 1), # id 2 (2, 2)) # id 3 return embedding_values embedding_column = fc.embedding_column( categorical_column, dimension=embedding_dimension, initializer=_embedding_column_initializer) dense_features = df.DenseFeatures( [embedding_column], partitioner=partitioned_variables.fixed_size_partitioner(2)) features = {'a': sparse_input} inputs = dense_features(features) variables = dense_features.variables # Sanity check: test that the inputs are correct. self.assertAllEqual([[1, 0], [0, 1], [2, 2], [1, 1]], inputs) # Check that only one variable was created. self.assertEqual(2, len(variables)) # Check that invoking dense_features on the same features does not create # additional variables _ = dense_features(features) self.assertEqual(2, len(variables)) self.assertIs(variables[0], dense_features.variables[0]) self.assertIs(variables[1], dense_features.variables[1])
def test_multi_column(self): price1 = fc.numeric_column('price1', shape=2) price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} net = df.DenseFeatures([price1, price2])(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
def test_static_batch_size_mismatch(self): price1 = fc.numeric_column('price1') price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = { 'price1': [[1.], [5.], [7.]], # batchsize = 3 'price2': [[3.], [4.]] # batchsize = 2 } with self.assertRaisesRegex( ValueError, r'Batch size \(first dimension\) of each feature must be same.' ): # pylint: disable=anomalous-backslash-in-string df.DenseFeatures([price1, price2])(features)
def test_with_rank_0_feature(self): # price has 1 dimension in dense_features price = fc.numeric_column('price') features = { 'price': constant_op.constant(0), } self.assertEqual(0, features['price'].shape.ndims) # Static rank 0 should fail with self.assertRaisesRegex(ValueError, 'Feature .* cannot have rank 0'): df.DenseFeatures([price])(features) # Dynamic rank 0 should fail features = { 'price': array_ops.placeholder(dtypes.float32), } net = df.DenseFeatures([price])(features) self.assertEqual(1, net.shape[1]) with _initialized_session() as sess: with self.assertRaisesOpError('Feature .* cannot have rank 0'): sess.run(net, feed_dict={features['price']: np.array(1)})
def test_trace_features_layer(self): columns = [feature_column_lib.numeric_column('x')] model = sequential.Sequential([dense_features.DenseFeatures(columns)]) model_input = {'x': constant_op.constant([[1.]])} model.predict(model_input, steps=1) fn = saving_utils.trace_model_call(model) self.assertAllClose({'output_1': [[1.]]}, fn({'x': [[1.]]})) columns = [ feature_column_lib.numeric_column('x'), feature_column_lib.numeric_column('y') ] model = sequential.Sequential([dense_features.DenseFeatures(columns)]) model_input = { 'x': constant_op.constant([[1.]]), 'y': constant_op.constant([[2.]]) } model.predict(model_input, steps=1) fn = saving_utils.trace_model_call(model) self.assertAllClose({'output_1': [[1., 2.]]}, fn({ 'x': [[1.]], 'y': [[2.]] }))
def test_crossed_column(self): a = fc.categorical_column_with_vocabulary_list( 'a', vocabulary_list=['1', '2', '3']) b = fc.categorical_column_with_vocabulary_list( 'b', vocabulary_list=['1', '2', '3']) ab = fc.crossed_column([a, b], hash_bucket_size=2) cols = [fc.indicator_column(ab)] orig_layer = df.DenseFeatures(cols) config = orig_layer.get_config() new_layer = df.DenseFeatures.from_config(config) self.assertLen(new_layer._feature_columns, 1) self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator')
def test_multiple_layers_with_same_embedding_column(self): some_sparse_column = fc.categorical_column_with_hash_bucket( 'sparse_feature', hash_bucket_size=5) some_embedding_column = fc.embedding_column(some_sparse_column, dimension=10) with ops.Graph().as_default(): features = { 'sparse_feature': [['a'], ['x']], } all_cols = [some_embedding_column] df.DenseFeatures(all_cols)(features) df.DenseFeatures(all_cols)(features) # Make sure that 2 variables get created in this case. self.assertEqual( 2, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) expected_var_names = [ 'dense_features/sparse_feature_embedding/embedding_weights:0', 'dense_features_1/sparse_feature_embedding/embedding_weights:0' ] self.assertItemsEqual(expected_var_names, [ v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) ])
def test_dense_features(self): animal = fc.indicator_column( fc.categorical_column_with_identity('animal', num_buckets=4)) with ops.Graph().as_default(): features = { 'animal': sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) } net = df.DenseFeatures([animal])(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
def test_runtime_batch_size_mismatch(self): price1 = fc.numeric_column('price1') price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = { 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 'price2': [[3.], [4.]] # batchsize = 2 } net = df.DenseFeatures([price1, price2])(features) with _initialized_session() as sess: with self.assertRaisesRegex( errors.OpError, 'Dimensions of inputs should match'): sess.run( net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
def test_subset_of_static_batch_size_mismatch(self): price1 = fc.numeric_column('price1') price2 = fc.numeric_column('price2') price3 = fc.numeric_column('price3') with ops.Graph().as_default(): features = { 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 'price2': [[3.], [4.]], # batchsize = 2 'price3': [[3.], [4.], [5.]] # batchsize = 3 } with self.assertRaisesRegex( ValueError, r'Batch size \(first dimension\) of each feature must be same.' ): # pylint: disable=anomalous-backslash-in-string df.DenseFeatures([price1, price2, price3])(features)
def test_feature_column_dense_features_gradient(self): with context.eager_mode(): sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (2, 0)), values=(0, 1, 2), dense_shape=(3, 3)) # Create feature columns (categorical and embedding). categorical_column = fc.categorical_column_with_identity( key='a', num_buckets=3) embedding_dimension = 2 def _embedding_column_initializer(shape, dtype, partition_info=None): del shape # unused del dtype # unused del partition_info # unused embedding_values = ( (1, 0), # id 0 (0, 1), # id 1 (1, 1)) # id 2 return embedding_values embedding_column = fc.embedding_column( categorical_column, dimension=embedding_dimension, initializer=_embedding_column_initializer) dense_features = df.DenseFeatures([embedding_column]) features = {'a': sparse_input} def scale_matrix(): matrix = dense_features(features) return 2 * matrix # Sanity check: Verify that scale_matrix returns the correct output. self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix()) # Check that the returned gradient is correct. grad_function = backprop.implicit_grad(scale_matrix) grads_and_vars = grad_function() indexed_slice = grads_and_vars[0][0] gradient = grads_and_vars[0][0].values self.assertAllEqual([0, 1, 2], indexed_slice.indices) self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
def test_runtime_batch_size_matches(self): price1 = fc.numeric_column('price1') price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = { 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 'price2': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 } net = df.DenseFeatures([price1, price2])(features) with _initialized_session() as sess: sess.run(net, feed_dict={ features['price1']: [[1.], [5.]], features['price2']: [[1.], [5.]], })
def test_cols_to_output_tensors(self): price1 = fc.numeric_column('price1', shape=2) price2 = fc.numeric_column('price2') with ops.Graph().as_default(): cols_dict = {} features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} dense_features = df.DenseFeatures([price1, price2]) net = dense_features(features, cols_dict) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(cols_dict[price1])) self.assertAllClose([[3.], [4.]], self.evaluate(cols_dict[price2])) self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
def test_sequential_model(self): columns = [fc.numeric_column('a')] model = keras.models.Sequential([ df.DenseFeatures(columns), keras.layers.Dense(64, activation='relu'), keras.layers.Dense(20, activation='softmax') ]) model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'], run_eagerly=testing_utils.should_run_eagerly()) x = {'a': np.random.random((10, 1))} y = np.random.randint(20, size=(10, 1)) y = np_utils.to_categorical(y, num_classes=20) model.fit(x, y, epochs=1, batch_size=5) model.fit(x, y, epochs=1, batch_size=5) model.evaluate(x, y, batch_size=5) model.predict(x, batch_size=5)