def test_multiple_layers_with_same_shared_embedding_column(self):
  categorical_column_a = tf.feature_column.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  categorical_column_b = tf.feature_column.categorical_column_with_identity(
      key='bbb', num_buckets=3)
  embedding_dimension = 2

  # feature_column.shared_embeddings is not supported in eager.
  with tf.Graph().as_default():
    embedding_column_b, embedding_column_a = tf.feature_column.shared_embeddings(
        [categorical_column_b, categorical_column_a],
        dimension=embedding_dimension)
    features = {
        'aaa':
            tf.SparseTensor(
                indices=((0, 0), (1, 0), (1, 1)),
                values=(0, 1, 0),
                dense_shape=(2, 2)),
        'bbb':
            tf.SparseTensor(
                indices=((0, 0), (1, 0), (1, 1)),
                values=(1, 2, 1),
                dense_shape=(2, 2)),
    }
    all_cols = [embedding_column_a, embedding_column_b]
    df.DenseFeatures(all_cols)(features)
    df.DenseFeatures(all_cols)(features)
    # Make sure that only 1 variable gets created in this case.
    self.assertEqual(
        1,
        len(
            tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)))
    self.assertItemsEqual(
        ['aaa_bbb_shared_embedding:0'],
        [
            v.name for v in tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
        ])
def test_shared_sequence_non_sequence_into_input_layer(self):
  non_seq = tf.feature_column.categorical_column_with_identity(
      'non_seq', num_buckets=10)
  seq = tf.feature_column.sequence_categorical_column_with_identity(
      'seq', num_buckets=10)
  shared_non_seq, shared_seq = tf.feature_column.shared_embeddings(
      [non_seq, seq],
      dimension=4,
      combiner='sum',
      initializer=tf.ones_initializer(),
      shared_embedding_collection_name='shared')

  seq = tf.SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]], values=[0, 1, 2], dense_shape=[2, 2])
  non_seq = tf.SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]], values=[0, 1, 2], dense_shape=[2, 2])
  features = {'seq': seq, 'non_seq': non_seq}

  # Tile the context features across the sequence features.
  seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features)
  non_seq_input = dense_features.DenseFeatures([shared_non_seq])(features)

  with self.cached_session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    output_seq, output_seq_length, output_non_seq = sess.run(
        [seq_input, seq_length, non_seq_input])
    self.assertAllEqual(output_seq, [[[1, 1, 1, 1], [1, 1, 1, 1]],
                                     [[1, 1, 1, 1], [0, 0, 0, 0]]])
    self.assertAllEqual(output_seq_length, [2, 1])
    self.assertAllEqual(output_non_seq, [[2, 2, 2, 2], [1, 1, 1, 1]])
def test_crossing_sparse_inputs_depth_tuple(self):
  layer = category_crossing.CategoryCrossing(depth=(2, 3))
  inputs_0 = tf.SparseTensor(
      indices=[[0, 0], [1, 0], [2, 0]],
      values=['a', 'b', 'c'],
      dense_shape=[3, 1])
  inputs_1 = tf.SparseTensor(
      indices=[[0, 0], [1, 0], [2, 0]],
      values=['d', 'e', 'f'],
      dense_shape=[3, 1])
  inputs_2 = tf.SparseTensor(
      indices=[[0, 0], [1, 0], [2, 0]],
      values=['g', 'h', 'i'],
      dense_shape=[3, 1])
  inp_0_t = input_layer.Input(shape=(1,), sparse=True, dtype=tf.string)
  inp_1_t = input_layer.Input(shape=(1,), sparse=True, dtype=tf.string)
  inp_2_t = input_layer.Input(shape=(1,), sparse=True, dtype=tf.string)
  out_t = layer([inp_0_t, inp_1_t, inp_2_t])
  model = training.Model([inp_0_t, inp_1_t, inp_2_t], out_t)
  output = model.predict([inputs_0, inputs_1, inputs_2])
  self.assertIsInstance(output, tf.SparseTensor)
  output = tf.sparse.to_dense(output)
  expected_outputs_0 = [[b'a_X_d', b'a_X_g', b'd_X_g', b'a_X_d_X_g']]
  expected_outputs_1 = [[b'b_X_e', b'b_X_h', b'e_X_h', b'b_X_e_X_h']]
  expected_outputs_2 = [[b'c_X_f', b'c_X_i', b'f_X_i', b'c_X_f_X_i']]
  expected_out = tf.concat(
      [expected_outputs_0, expected_outputs_1, expected_outputs_2], axis=0)
  self.assertAllEqual(expected_out, output)
def test_works_with_registered(self):

  class CustomClass:

    def value(self):
      return tf.convert_to_tensor(42.)

  tf.register_tensor_conversion_function(
      CustomClass, lambda value, **_: value.value())
  tf_utils.register_symbolic_tensor_type(CustomClass)

  if tf.executing_eagerly():
    self.assertFalse(
        tf_utils.is_symbolic_tensor(
            tf.Variable(name='blah', initial_value=0.)))
    self.assertFalse(
        tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.)))
    self.assertFalse(
        tf_utils.is_symbolic_tensor(
            tf.SparseTensor(
                indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
    self.assertFalse(tf_utils.is_symbolic_tensor(CustomClass()))
  else:
    self.assertTrue(
        tf_utils.is_symbolic_tensor(
            tf.Variable(name='blah', initial_value=0.)))
    self.assertTrue(
        tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.)))
    self.assertTrue(
        tf_utils.is_symbolic_tensor(
            tf.SparseTensor(
                indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
    self.assertTrue(tf_utils.is_symbolic_tensor(CustomClass()))
def test_compute_output_shape(self):
  price1 = tf.feature_column.sequence_numeric_column('price1', shape=2)
  price2 = tf.feature_column.sequence_numeric_column('price2')
  features = {
      'price1':
          tf.SparseTensor(
              indices=[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
                       [1, 0, 1], [2, 0, 0], [2, 0, 1], [3, 0, 0], [3, 0, 1]],
              values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.],
              dense_shape=(4, 3, 2)),
      'price2':
          tf.SparseTensor(
              indices=[[0, 0], [0, 1], [1, 0], [2, 0], [3, 0]],
              values=[10., 11., 20., 30., 40.],
              dense_shape=(4, 3)),
  }
  sequence_features = ksfc.SequenceFeatures([price1, price2])
  seq_input, seq_len = sequence_features(features)
  self.assertEqual(
      sequence_features.compute_output_shape((None, None)), (None, None, 3))
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(tf.compat.v1.tables_initializer())

  self.assertAllClose(
      [[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]],
       [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]],
       [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]],
       [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]],
      self.evaluate(seq_input))
  self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len))
def test_sparse_input_sparse_output_with_weights(self):
  indices = [[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]]
  sp_inp = tf.SparseTensor(
      indices=indices, values=[0, 2, 1, 1, 0], dense_shape=[4, 2])
  input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True)
  sp_weight = tf.SparseTensor(
      indices=indices, values=[.1, .2, .4, .3, .2], dense_shape=[4, 2])
  weight_data = keras.Input(shape=(None,), dtype=tf.float32, sparse=True)

  # The expected output should be (X for missing value):
  # [[1, X, X, X]
  #  [X, X, 1, X]
  #  [X, 2, X, X]
  #  [1, X, X, X]]
  expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]]
  expected_values = [.1, .2, .7, .2]
  num_tokens = 6

  layer = category_encoding.CategoryEncoding(
      num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True)
  int_data = layer(input_data, count_weights=weight_data)

  model = keras.Model(inputs=[input_data, weight_data], outputs=int_data)
  sp_output_dataset = model.predict([sp_inp, sp_weight], steps=1)
  self.assertAllClose(expected_values, sp_output_dataset.values)
  self.assertAllEqual(expected_indices, sp_output_dataset.indices)
def test_default_behavior(self):
    if tf.executing_eagerly():
        self.assertFalse(
            tf_utils.is_symbolic_tensor(
                tf.Variable(name="blah", initial_value=0.0)))
        self.assertFalse(
            tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.0)))
        self.assertFalse(
            tf_utils.is_symbolic_tensor(
                tf.SparseTensor(
                    indices=[[0, 0], [1, 2]],
                    values=[1, 2],
                    dense_shape=[3, 4],
                )))
    else:
        self.assertTrue(
            tf_utils.is_symbolic_tensor(
                tf.Variable(name="blah", initial_value=0.0)))
        self.assertTrue(
            tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.0)))
        self.assertTrue(
            tf_utils.is_symbolic_tensor(
                tf.SparseTensor(
                    indices=[[0, 0], [1, 2]],
                    values=[1, 2],
                    dense_shape=[3, 4],
                )))
def test_saving_with_sequence_features(self):
  cols = [
      tf.feature_column.sequence_numeric_column('a'),
      tf.feature_column.indicator_column(
          tf.feature_column.sequence_categorical_column_with_vocabulary_list(
              'b', ['one', 'two']))
  ]
  input_layers = {
      'a':
          keras.layers.Input(shape=(None, 1), sparse=True, name='a'),
      'b':
          keras.layers.Input(
              shape=(None, 1), sparse=True, name='b', dtype='string')
  }

  fc_layer, _ = ksfc.SequenceFeatures(cols)(input_layers)
  # TODO(tibell): Figure out the right dtype and apply masking.
  # sequence_length_mask = array_ops.sequence_mask(sequence_length)
  # x = keras.layers.GRU(32)(fc_layer, mask=sequence_length_mask)
  x = keras.layers.GRU(32)(fc_layer)
  output = keras.layers.Dense(10)(x)

  model = keras.models.Model(input_layers, output)

  model.compile(
      loss=keras.losses.MSE,
      optimizer='rmsprop',
      metrics=[keras.metrics.categorical_accuracy])

  config = model.to_json()
  loaded_model = model_config.model_from_json(config)

  batch_size = 10
  timesteps = 1

  values_a = np.arange(10, dtype=np.float32)
  indices_a = np.zeros((10, 3), dtype=np.int64)
  indices_a[:, 0] = np.arange(10)
  inputs_a = tf.SparseTensor(indices_a, values_a, (batch_size, timesteps, 1))

  values_b = np.zeros(10, dtype=str)
  indices_b = np.zeros((10, 3), dtype=np.int64)
  indices_b[:, 0] = np.arange(10)
  inputs_b = tf.SparseTensor(indices_b, values_b, (batch_size, timesteps, 1))

  with self.cached_session():
    # Initialize tables for V1 lookup.
    if not tf.executing_eagerly():
      self.evaluate(tf.compat.v1.tables_initializer())

    self.assertLen(
        loaded_model.predict({
            'a': inputs_a,
            'b': inputs_b
        }, steps=1), batch_size)
def test_sparse_concatenation(self):
  tensor_1 = tf.SparseTensor([[0, 0]], [1], [1, 1])
  tensor_2 = tf.SparseTensor([[0, 0]], [2], [1, 1])
  concatenated_tensor = training_utils_v1._append_composite_tensor(
      tensor_1, tensor_2)
  evaluated_tensor = self.evaluate(concatenated_tensor)
  self.assertAllEqual(evaluated_tensor.indices, [[0, 0], [1, 0]])
  self.assertAllEqual(evaluated_tensor.values, [1, 2])
  self.assertAllEqual(evaluated_tensor.dense_shape, [2, 1])
def _make_sparse_tensor_dict():
  rel_name1 = 'real_stuff'
  # Note, these matrices are transposed.
  sparse_tensor1 = tf.SparseTensor(
      indices=[[0, 0], [99, 1]], values=[1., 2.], dense_shape=[100, 2])
  rel_name2 = 'other_stuff'
  sparse_tensor2 = tf.SparseTensor(
      indices=[[100, 0]], values=[3.], dense_shape=[1000, 2])
  return {rel_name1: sparse_tensor1, rel_name2: sparse_tensor2}
def test_crossing_sparse_inputs_empty_sep(self):
  layer = category_crossing.CategoryCrossing(separator='')
  inputs_0 = tf.SparseTensor(
      indices=[[0, 0], [1, 0], [1, 1]],
      values=['a', 'b', 'c'],
      dense_shape=[2, 2])
  inputs_1 = tf.SparseTensor(
      indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3])
  output = layer([inputs_0, inputs_1])
  self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices)
  self.assertAllEqual([b'ad', b'be', b'ce'], output.values)
def test_crossing_sparse_inputs_depth_int(self):
  layer = category_crossing.CategoryCrossing(depth=1)
  inputs_0 = tf.SparseTensor(
      indices=[[0, 0], [1, 0], [2, 0]],
      values=['a', 'b', 'c'],
      dense_shape=[3, 1])
  inputs_1 = tf.SparseTensor(
      indices=[[0, 0], [1, 0], [2, 0]],
      values=['d', 'e', 'f'],
      dense_shape=[3, 1])
  output = layer([inputs_0, inputs_1])
  self.assertIsInstance(output, tf.SparseTensor)
  output = tf.sparse.to_dense(output)
  expected_out = [[b'a', b'd'], [b'b', b'e'], [b'c', b'f']]
  self.assertAllEqual(expected_out, output)
def test_sparse_tensors(self, use_dict, use_dataset, action):
    data = [
        (
            tf.SparseTensor([[0, 0, 0], [1, 0, 0], [1, 0, 1]], [1, 2, 3],
                            [2, 1, 3]),
            np.array([[[1, -1, -1]], [[2, 3, -1]]]),
        ),
        (
            tf.SparseTensor(
                [[0, 0, 0], [1, 0, 0], [1, 0, 1], [2, 0, 1]],
                [5, 6, 7, 8],
                [3, 1, 4],
            ),
            np.array([[[5, -1, -1, -1]], [[6, 7, -1, -1]],
                      [[-1, 8, -1, -1]]]),
        ),
    ]
    # Prepare the model to test.
    input_name = get_input_name(use_dict)
    model_input = input_layer.Input(
        shape=(1, None), sparse=True, name=input_name, dtype=tf.int32)
    layers = [ToDense(default_value=-1)]
    model = get_model_from_layers_with_input(layers, model_input=model_input)
    model.compile(
        optimizer="sgd",
        loss="mse",
        metrics=["accuracy"],
        **get_test_mode_kwargs())
    kwargs = get_kwargs(use_dataset, action)

    # Prepare the input data.
    for data_element in data:
        input_data, expected_output = prepare_inputs(
            data_element, use_dict, use_dataset, action, input_name)
        # Perform the action.
        if action == "predict":
            result = model.predict(input_data, **kwargs)
            self.assertAllEqual(expected_output, result)
        if action == "evaluate":
            result = model.evaluate(input_data, expected_output, **kwargs)
            self.assertAllEqual(1.0, result[-1])
        if action == "fit":
            # TODO(momernick): What's the best way of validating that fit
            # happened?
            _ = model.fit(
                input_data, expected_output, shuffle=False, **kwargs)
def test_multiple_layers_with_same_shared_embedding_column(self):
    categorical_column_a = (
        tf.feature_column.categorical_column_with_identity(
            key="aaa", num_buckets=3))
    categorical_column_b = (
        tf.feature_column.categorical_column_with_identity(
            key="bbb", num_buckets=3))
    embedding_dimension = 2
    (
        embedding_column_b,
        embedding_column_a,
    ) = tf.feature_column.shared_embeddings(
        [categorical_column_b, categorical_column_a],
        dimension=embedding_dimension,
    )
    with tf.Graph().as_default():
        features = {
            "aaa": tf.SparseTensor(
                indices=((0, 0), (1, 0), (1, 1)),
                values=(0, 1, 0),
                dense_shape=(2, 2),
            ),
            "bbb": tf.SparseTensor(
                indices=((0, 0), (1, 0), (1, 1)),
                values=(1, 2, 1),
                dense_shape=(2, 2),
            ),
        }
        all_cols = [embedding_column_a, embedding_column_b]
        df.DenseFeatures(all_cols)(features)
        df.DenseFeatures(all_cols)(features)
        # Make sure that only 1 variable gets created in this case.
        self.assertEqual(
            1,
            len(
                tf.compat.v1.get_collection(
                    tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)),
        )
        self.assertCountEqual(
            ["aaa_bbb_shared_embedding:0"],
            [
                v.name for v in tf.compat.v1.get_collection(
                    tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
            ],
        )
def test_sparse_int_input_multi_bucket(self):
  vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
  input_array = tf.SparseTensor(
      indices=[[0, 0], [1, 2]],
      values=np.array([13, 133], dtype=np.int64),
      dense_shape=[3, 4])

  expected_indices = [[0, 0], [1, 2]]
  expected_values = [6, 2]
  expected_dense_shape = [3, 4]

  input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True)
  layer = get_layer_class()(
      max_values=None,
      dtype=tf.int64,
      num_oov_indices=2,
      mask_value=0,
      oov_value=-1)
  layer.set_vocabulary(vocab_data)
  int_data = layer(input_data)
  model = keras.Model(inputs=input_data, outputs=int_data)
  output_data = model.predict(input_array, steps=1)

  self.assertAllEqual(expected_indices, output_data.indices)
  self.assertAllEqual(expected_values, output_data.values)
  self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
def call(self, inputs):
  self._maybe_freeze_vocab_size()

  inputs = self._standardize_inputs(inputs, self._key_dtype)
  original_shape = inputs.shape
  # Some ops will not handle scalar input, so uprank to rank 1.
  if inputs.shape.rank == 0:
    inputs = self._expand_dims(inputs, -1)

  if tf_utils.is_sparse(inputs):
    lookups = tf.SparseTensor(inputs.indices,
                              self._lookup_dense(inputs.values),
                              inputs.dense_shape)
  elif tf_utils.is_ragged(inputs):
    lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs)
  else:
    lookups = self._lookup_dense(inputs)

  if self.output_mode == INT:
    # If we received a scalar input, downrank back to a scalar.
    if original_shape.rank == 0:
      lookups = tf.squeeze(lookups, -1)
    return lookups

  depth = (
      self.max_tokens if self.pad_to_max_tokens else self._frozen_vocab_size)
  idf_weights = (
      self.idf_weights_const if self.output_mode == TF_IDF else None)
  return utils.encode_categorical_inputs(
      lookups,
      output_mode=self.output_mode,
      depth=depth,
      dtype=self.compute_dtype,
      sparse=self.sparse,
      idf_weights=idf_weights)
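# Hedged usage sketch (not part of the layer above): the public
# tf.keras.layers.StringLookup layer is assumed here to route sparse inputs
# through a call method like the one above, preserving indices and
# dense_shape and only translating the values. The vocabulary and input
# values below are made up for illustration.
import tensorflow as tf

lookup = tf.keras.layers.StringLookup(vocabulary=["a", "b", "c"])
sp = tf.SparseTensor(
    indices=[[0, 0], [1, 1]], values=["c", "a"], dense_shape=[2, 2])
out = lookup(sp)
# out is a SparseTensor with the same indices/dense_shape; with no mask token
# and a single OOV token at index 0 (the newer defaults), out.values would be
# [3, 1].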
def test_with_1d_sparse_tensor(self):
    embedding_values = (
        (1.0, 2.0, 3.0, 4.0, 5.0),  # id 0
        (6.0, 7.0, 8.0, 9.0, 10.0),  # id 1
        (11.0, 12.0, 13.0, 14.0, 15.0),  # id 2
    )

    def _initializer(shape, dtype, partition_info=None):
        del shape, dtype, partition_info
        return embedding_values

    # price has 1 dimension in dense_features
    price = tf.feature_column.numeric_column("price")

    # one_hot_body_style has 3 dims in dense_features.
    body_style = tf.feature_column.categorical_column_with_vocabulary_list(
        "body-style", vocabulary_list=["hardtop", "wagon", "sedan"])
    one_hot_body_style = tf.feature_column.indicator_column(body_style)

    # embedded_body_style has 5 dims in dense_features.
    country = tf.feature_column.categorical_column_with_vocabulary_list(
        "country", vocabulary_list=["US", "JP", "CA"])
    embedded_country = tf.feature_column.embedding_column(
        country, dimension=5, initializer=_initializer)

    # Provides 1-dim tensor and dense tensor.
    features = {
        "price": tf.constant([
            11.0,
            12.0,
        ]),
        "body-style": tf.SparseTensor(
            indices=((0,), (1,)),
            values=("sedan", "hardtop"),
            dense_shape=(2,),
        ),
        # This is dense tensor for the categorical_column.
        "country": tf.constant(["CA", "US"]),
    }
    self.assertEqual(1, features["price"].shape.ndims)
    self.assertEqual(1, features["body-style"].dense_shape.get_shape()[0])
    self.assertEqual(1, features["country"].shape.ndims)

    net = df.DenseFeatures([price, one_hot_body_style,
                            embedded_country])(features)
    self.assertEqual(1 + 3 + 5, net.shape[1])
    with _initialized_session() as sess:
        # Each row is formed by concatenating `embedded_body_style`,
        # `one_hot_body_style`, and `price` in order.
        self.assertAllEqual(
            [
                [0.0, 0.0, 1.0, 11.0, 12.0, 13.0, 14.0, 15.0, 11.0],
                [1.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 12.0],
            ],
            sess.run(net),
        )
def test_dense_output(self):
  dense_inputs = tf.convert_to_tensor(
      np.random.uniform(size=(10, 10)).astype('f'))
  # Create some sparse data where multiple rows and columns are missing.
  sparse_inputs = tf.SparseTensor(
      indices=np.random.randint(low=0, high=10, size=(5, 2)),
      values=np.random.uniform(size=(5,)).astype('f'),
      dense_shape=[10, 10])
  sparse_inputs = tf.sparse.reorder(sparse_inputs)

  layer = keras.layers.Dense(
      5,
      kernel_initializer=keras.initializers.RandomUniform(),
      bias_initializer=keras.initializers.RandomUniform(),
      dtype='float32')
  dense_outputs = layer(dense_inputs)
  sparse_outputs = layer(sparse_inputs)

  expected_dense = tf.add(
      tf.matmul(dense_inputs, keras.backend.get_value(layer.kernel)),
      keras.backend.get_value(layer.bias))
  expected_sparse = tf.add(
      tf.matmul(
          tf.sparse.to_dense(sparse_inputs),
          keras.backend.get_value(layer.kernel)),
      keras.backend.get_value(layer.bias))

  self.assertAllClose(dense_outputs, expected_dense)
  self.assertAllClose(sparse_outputs, expected_sparse)
def test_sparse_tensor_model_predict(self):
    # Create a model that accepts a sparse input and runs a "Dense" layer on
    # it.
    model_input = input_layer.Input(
        shape=(3,), sparse=True, dtype=tf.float32)
    self.assertEqual([None, 3], model_input.shape.as_list())

    layers = [Dense(2)]
    model = get_model_from_layers_with_input(layers, model_input=model_input)

    sparse_input = tf.SparseTensor(
        # A two-row matrix
        indices=[(0, 0), (0, 1), (0, 2), (5, 0), (5, 1), (5, 2)],
        values=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        dense_shape=(6, 3),
    )

    shape = model(sparse_input).shape
    self.assertEqual((6, 2), self._normalize_shape(shape))

    shape = model.predict(sparse_input, steps=1).shape
    self.assertEqual((6, 2), self._normalize_shape(shape))
def dense_to_sparse(x, ignore_value=None, name=None):
  """Converts dense `Tensor` to `SparseTensor`, dropping `ignore_value` cells.

  Args:
    x: A `Tensor`.
    ignore_value: Entries in `x` equal to this value will be absent from the
      returned `SparseTensor`. If `None`, the default value of `x`'s dtype
      will be used (e.g. '' for `str`, 0 for `int`).
    name: Python `str` prefix for ops created by this function.

  Returns:
    sparse_x: A `tf.SparseTensor` with the same shape as `x`.

  Raises:
    ValueError: when `x`'s rank is `None`.
  """
  # Copied (with modifications) from:
  # tensorflow/contrib/layers/python/ops/sparse_ops.py.
  with tf.name_scope(name or 'dense_to_sparse'):
    x = tf.convert_to_tensor(x, name='x')
    if ignore_value is None:
      if dtype_util.base_dtype(x.dtype) == tf.string:
        # Special-cased because TF strings are converted to numpy objects by
        # default.
        ignore_value = ''
      else:
        ignore_value = dtype_util.as_numpy_dtype(x.dtype)(0)
    ignore_value = tf.cast(ignore_value, x.dtype, name='ignore_value')
    indices = tf.where(tf.not_equal(x, ignore_value), name='indices')
    return tf.SparseTensor(
        indices=indices,
        values=tf.gather_nd(x, indices, name='values'),
        dense_shape=tf.shape(x, out_type=tf.int64, name='dense_shape'))
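# A minimal usage sketch for dense_to_sparse above, assuming eager TensorFlow;
# the input values are made up for illustration.
import tensorflow as tf

x = tf.constant([[0., 3., 0.],
                 [4., 0., 5.]])
sp = dense_to_sparse(x)  # ignore_value defaults to 0. for float inputs
# sp.indices      -> [[0, 1], [1, 0], [1, 2]]
# sp.values       -> [3., 4., 5.]
# sp.dense_shape  -> [2, 3]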
def dense(inputs, kernel, bias=None, activation=None, dtype=None):
  """Densely connected NN layer op.

  Args:
    inputs: `tf.Tensor` or `tf.SparseTensor`. Inputs to operation.
    kernel: `tf.Variable`. Matrix kernel.
    bias: (Optional) `tf.Variable`. Bias to add to outputs.
    activation: (Optional) 1-argument callable. Activation function to apply
      to outputs.
    dtype: (Optional) `tf.DType`. Dtype to cast `inputs` to.

  Returns:
    `tf.Tensor`. Output of dense connection.
  """
  if dtype:
    if inputs.dtype.base_dtype != dtype.base_dtype:
      inputs = tf.cast(inputs, dtype=dtype)

  rank = inputs.shape.rank
  if rank == 2 or rank is None:
    # We use embedding_lookup_sparse as a more efficient matmul operation for
    # large sparse input tensors. The op will result in a sparse gradient, as
    # opposed to sparse_ops.sparse_tensor_dense_matmul which results in dense
    # gradients. This can lead to significant speedups, see b/171762937.
    if isinstance(inputs, tf.SparseTensor):
      # We need to fill empty rows, as the op assumes at least one id per row.
      inputs, _ = tf.sparse.fill_empty_rows(inputs, 0)
      # We need to do some munging of our input to use the embedding lookup as
      # a matrix multiply. We split our input matrix into separate ids and
      # weights tensors. The values of the ids tensor should be the column
      # indices of our input matrix and the values of the weights tensor can
      # continue to be the actual matrix weights. The column arrangement of
      # ids and weights will be summed over and does not matter. See the
      # documentation for sparse_ops.sparse_tensor_dense_matmul for a more
      # detailed explanation of the inputs to both ops.
      ids = tf.SparseTensor(
          indices=inputs.indices,
          values=inputs.indices[:, 1],
          dense_shape=inputs.dense_shape)
      weights = inputs
      outputs = tf.nn.embedding_lookup_sparse(
          kernel, ids, weights, combiner="sum")
    else:
      outputs = tf.raw_ops.MatMul(a=inputs, b=kernel)
  # Broadcast kernel to inputs.
  else:
    outputs = tf.tensordot(inputs, kernel, [[rank - 1], [0]])
    # Reshape the output back to the original ndim of the input.
    if not tf.executing_eagerly():
      shape = inputs.shape.as_list()
      output_shape = shape[:-1] + [kernel.shape[-1]]
      outputs.set_shape(output_shape)
  if bias is not None:
    outputs = tf.nn.bias_add(outputs, bias)
  if activation is not None:
    outputs = activation(outputs)
  return outputs
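# A minimal sketch of the ids/weights split described in the comments above,
# assuming eager TensorFlow; the kernel and sparse input values are made up.
# It shows that the embedding-lookup path and a plain dense matmul agree.
import tensorflow as tf

kernel = tf.constant([[1., 0.], [0., 1.], [1., 1.]])  # 3 input dims -> 2 units
sp = tf.SparseTensor(
    indices=[[0, 0], [0, 2], [1, 1]], values=[2., 3., 4.], dense_shape=[2, 3])
sp, _ = tf.sparse.fill_empty_rows(sp, 0.)

# ids carry the column index of each nonzero; the weights carry its value.
ids = tf.SparseTensor(sp.indices, sp.indices[:, 1], sp.dense_shape)
via_lookup = tf.nn.embedding_lookup_sparse(kernel, ids, sp, combiner="sum")
via_matmul = tf.matmul(tf.sparse.to_dense(sp), kernel)
# Both evaluate to [[5., 3.], [0., 4.]].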
def call(self, inputs):
  bins = [tf.cast(tf.compat.v1.squeeze(self.bins), tf.float32)]

  def _bucketize_fn(inputs):
    return tf.raw_ops.BoostedTreesBucketize(
        float_values=[tf.cast(inputs, tf.float32)],
        bucket_boundaries=bins)[0]

  if tf_utils.is_ragged(inputs):
    integer_buckets = tf.ragged.map_flat_values(_bucketize_fn, inputs)
    # Ragged map_flat_values doesn't touch the non-values tensors in the
    # ragged composite tensor. If this op is the only op in a Keras model,
    # this can cause errors in Graph mode, so wrap the tensor in an identity.
    return tf.identity(integer_buckets)
  elif isinstance(inputs, tf.SparseTensor):
    return tf.SparseTensor(
        indices=tf.identity(inputs.indices),
        values=_bucketize_fn(inputs.values),
        dense_shape=tf.identity(inputs.dense_shape))
  else:
    static_shape = inputs.get_shape()
    if any(dim is None for dim in static_shape.as_list()[1:]):
      raise NotImplementedError(
          "Discretization Layer requires known non-batch shape, "
          "found {}".format(static_shape))

    dynamic_shape = tf.shape(inputs)
    # BoostedTreesBucketize only handles rank 1 inputs. We need to flatten
    # our inputs after batch size and vectorized_map over each sample.
    reshaped = tf.reshape(inputs, [dynamic_shape[0], -1])
    return tf.reshape(
        tf.vectorized_map(_bucketize_fn, reshaped), dynamic_shape)
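# A hedged illustration of the rank-1 restriction the comment above works
# around, assuming eager TensorFlow; the bin boundaries and values below are
# illustrative only.
import tensorflow as tf

boundaries = tf.constant([0., 1., 2.])
row = tf.constant([-1.5, 0.5, 1.0, 3.4])  # one flattened sample
bucket_ids = tf.raw_ops.BoostedTreesBucketize(
    float_values=[row], bucket_boundaries=[boundaries])[0]
# bucket_ids[i] is the index of the bin that row[i] falls into; batched
# inputs must be flattened per sample and mapped, as the layer does above.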
def __call__(self, sentences):
  token_ids, token_values, token_dense_shape = self._tokenize(sentences)
  return tf.nn.safe_embedding_lookup_sparse(
      embedding_weights=self.embeddings,
      sparse_ids=tf.SparseTensor(token_ids, token_values, token_dense_shape),
      sparse_weights=None,
      combiner="sqrtn")
def call(self, inputs):
  inputs = self._preprocess_inputs(inputs)
  if isinstance(inputs, tf.SparseTensor):
    return tf.SparseTensor(
        indices=inputs.indices,
        values=self._hash_values_to_bins(inputs.values),
        dense_shape=inputs.dense_shape)
  return self._hash_values_to_bins(inputs)
def fn():
    layer = MyLayer()
    layer(
        tf.SparseTensor(indices=[[0, 0]], values=[1], dense_shape=[3, 5]),
        training=False,
    )
def test_hash_sparse_int_input_siphash(self):
  layer = hashing.Hashing(num_bins=3, salt=[133, 137])

  indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
  inp = tf.SparseTensor(
      indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2])
  output = layer(inp)
  self.assertAllClose(indices, output.indices)
  self.assertAllClose([1, 1, 2, 0, 1], output.values)
def call(self, inputs):
  if isinstance(inputs, (list, tuple, np.ndarray)):
    inputs = tf.convert_to_tensor(inputs)
  if isinstance(inputs, tf.SparseTensor):
    return tf.SparseTensor(
        indices=inputs.indices,
        values=self._hash_values_to_bins(inputs.values),
        dense_shape=inputs.dense_shape)
  return self._hash_values_to_bins(inputs)
def for_with_composite_tensor_shape_invariant(l):
  v = tf.SparseTensor(
      indices=[[0, 0], [1, 1]], values=[1, 2], dense_shape=[3, 3])
  for _ in l:
    tf.autograph.experimental.set_loop_options(
        shape_invariants=[(v, tf.TensorShape(None))])
    v = tf.sparse.expand_dims(v)
  return v
def test_with_1d_sparse_tensor(self):
  embedding_values = (
      (1., 2., 3., 4., 5.),  # id 0
      (6., 7., 8., 9., 10.),  # id 1
      (11., 12., 13., 14., 15.)  # id 2
  )

  def _initializer(shape, dtype, partition_info=None):
    del shape, dtype, partition_info
    return embedding_values

  # price has 1 dimension in dense_features
  price = tf.feature_column.numeric_column('price')

  # one_hot_body_style has 3 dims in dense_features.
  body_style = tf.feature_column.categorical_column_with_vocabulary_list(
      'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
  one_hot_body_style = tf.feature_column.indicator_column(body_style)

  # embedded_body_style has 5 dims in dense_features.
  country = tf.feature_column.categorical_column_with_vocabulary_list(
      'country', vocabulary_list=['US', 'JP', 'CA'])
  embedded_country = tf.feature_column.embedding_column(
      country, dimension=5, initializer=_initializer)

  with tf.Graph().as_default():
    # Provides 1-dim tensor and dense tensor.
    features = {
        'price':
            tf.constant([
                11.,
                12.,
            ]),
        'body-style':
            tf.SparseTensor(
                indices=((0,), (1,)),
                values=('sedan', 'hardtop'),
                dense_shape=(2,)),
        # This is dense tensor for the categorical_column.
        'country':
            tf.constant(['CA', 'US']),
    }
    self.assertEqual(1, features['price'].shape.ndims)
    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
    self.assertEqual(1, features['country'].shape.ndims)

    net = df.DenseFeatures([price, one_hot_body_style,
                            embedded_country])(features)
    self.assertEqual(1 + 3 + 5, net.shape[1])
    with _initialized_session() as sess:
      # Each row is formed by concatenating `embedded_body_style`,
      # `one_hot_body_style`, and `price` in order.
      self.assertAllEqual(
          [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
          sess.run(net))
def encode_categorical_inputs(
    inputs,
    output_mode,
    depth,
    dtype="float32",
    sparse=False,
    count_weights=None,
    idf_weights=None,
):
    """Encodes categorical inputs according to `output_mode`."""
    if output_mode == INT:
        return tf.identity(tf.cast(inputs, dtype))

    original_shape = inputs.shape
    # In all cases, we should uprank scalar input to a single sample.
    if inputs.shape.rank == 0:
        inputs = expand_dims(inputs, -1)
    # One hot will uprank only if the final output dimension is not already 1.
    if output_mode == ONE_HOT:
        if inputs.shape[-1] != 1:
            inputs = expand_dims(inputs, -1)

    # TODO(b/190445202): remove output rank restriction.
    if inputs.shape.rank > 2:
        raise ValueError(
            f"When output_mode is not `'int'`, maximum supported output rank "
            f"is 2. Received output_mode {output_mode} and input shape "
            f"{original_shape}, "
            f"which would result in output rank {inputs.shape.rank}.")

    binary_output = output_mode in (MULTI_HOT, ONE_HOT)
    if sparse:
        bincounts = sparse_bincount(inputs, depth, binary_output, dtype,
                                    count_weights)
    else:
        bincounts = dense_bincount(inputs, depth, binary_output, dtype,
                                   count_weights)

    if output_mode != TF_IDF:
        return bincounts

    if idf_weights is None:
        raise ValueError(
            f"When output mode is `'tf_idf'`, idf_weights must be provided. "
            f"Received: output_mode={output_mode} and "
            f"idf_weights={idf_weights}")

    if sparse:
        value_weights = tf.gather(idf_weights, bincounts.indices[:, -1])
        return tf.SparseTensor(
            bincounts.indices,
            value_weights * bincounts.values,
            bincounts.dense_shape,
        )
    else:
        return tf.multiply(bincounts, idf_weights)
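# A rough illustration of the multi-hot / count behavior described above,
# using tf.math.bincount directly as a stand-in for the dense_bincount helper
# defined elsewhere in this module; the depth and input values are made up.
import tensorflow as tf

inputs = tf.constant([[1, 2, 2], [0, 4, 4]])
depth = 6

multi_hot = tf.math.bincount(
    inputs, minlength=depth, maxlength=depth, axis=-1, binary_output=True)
# [[0, 1, 1, 0, 0, 0],
#  [1, 0, 0, 0, 1, 0]]

counts = tf.math.bincount(inputs, minlength=depth, maxlength=depth, axis=-1)
# [[0, 1, 2, 0, 0, 0],
#  [1, 0, 0, 0, 2, 0]]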