def test_dense_input_sparse_output(self):
    """COUNT encoding with sparse=True agrees with the dense encoding."""
    dense_tokens = constant_op.constant([[1, 2, 3], [3, 3, 0]])
    vocab_size = 6

    # Per-row token counts; X marks an entry that is absent from the sparse
    # result:
    #   [[X, 1, 1, 1, X, X]
    #    [1, X, X, 2, X, X]]
    want_indices = [[0, 1], [0, 2], [0, 3], [1, 0], [1, 3]]
    want_values = [1, 1, 1, 1, 2]

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32)
    sparse_encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size,
        output_mode=category_encoding.COUNT,
        sparse=True)
    sparse_model = keras.Model(inputs=inputs, outputs=sparse_encoder(inputs))
    sparse_result = sparse_model.predict(dense_tokens, steps=1)
    self.assertAllEqual(want_values, sparse_result.values)
    self.assertAllEqual(want_indices, sparse_result.indices)

    # Same configuration with sparse=False: densifying the sparse result
    # must reproduce it exactly.
    dense_encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size,
        output_mode=category_encoding.COUNT,
        sparse=False)
    dense_model = keras.Model(inputs=inputs, outputs=dense_encoder(inputs))
    dense_result = dense_model.predict(dense_tokens, steps=1)
    densified = sparse_ops.sparse_tensor_to_dense(
        sparse_result, default_value=0)
    self.assertAllEqual(densified, dense_result)
def test_saving_loading(self):
    """Saves an adapted encoder model, reloads it, and compares predictions.

    The model is written beneath ``self.get_temp_dir()`` rather than a fixed
    ``/tmp/model`` path, so concurrent or repeated runs do not share a
    directory and the test does not depend on ``/tmp`` existing.
    """
    import os  # Local import: only needed to build the save path.

    encoder = category_encoding.CategoryEncoding()
    encoder.adapt([1, 2, 3])
    model = keras.Sequential([encoder])

    save_path = os.path.join(self.get_temp_dir(), "model")
    model.save(save_path, save_format="tf")
    loaded_model = keras.models.load_model(save_path)

    # The reloaded model must produce the same output as the original.
    self.assertAllClose(model.predict([[1]]), loaded_model.predict([[1]]))
def run_dataset_implementation(self, output_mode, batch_size, sequence_length,
                               max_tokens):
    """Times CategoryEncoding on batched random int data and reports it.

    Args:
      output_mode: Output mode forwarded to the CategoryEncoding layer.
      batch_size: Number of rows per batch fed to the layer.
      sequence_length: Number of token ids per row.
      max_tokens: Forwarded to the layer as `max_tokens`; also bounds the
        random ids (`maxval=max_tokens - 1`).
    """
    symbolic_input = keras.Input(shape=(sequence_length,), dtype=dtypes.int32)
    layer = category_encoding.CategoryEncoding(
        max_tokens=max_tokens, output_mode=output_mode)
    # Call the layer once on a symbolic input before any timing starts.
    _ = layer(symbolic_input)

    num_repeats = 5
    num_batches = 5
    durations = []
    for _ in range(num_repeats):
        samples = random_ops.random_uniform(
            [batch_size * 10, sequence_length],
            minval=0,
            maxval=max_tokens - 1,
            dtype=dtypes.int32)
        ds = (
            dataset_ops.Dataset.from_tensor_slices(samples)
            .shuffle(batch_size * 100)
            .batch(batch_size)
            .take(num_batches)
            .prefetch(num_batches))

        # Only the loop that feeds batches through the layer is timed.
        began = time.time()
        for batch in ds:
            _ = layer(batch)
        durations.append(time.time() - began)

    # Mean wall time per repeat, divided by the number of batches per repeat.
    avg_time = np.mean(durations) / num_batches
    name = "category_encoding|batch_%s|seq_length_%s|%s_max_tokens" % (
        batch_size, sequence_length, max_tokens)
    self.report_benchmark(iters=num_repeats, wall_time=avg_time, name=name)
def test_multi_hot_rank_3_output_fails(self):
    """MULTI_HOT encoding rejects inputs that would give a rank-3 output.

    The layer is built with MULTI_HOT so the test exercises the mode named
    in its title; it previously used ONE_HOT, leaving the multi-hot rank
    check uncovered.
    """
    layer = category_encoding.CategoryEncoding(
        num_tokens=4, output_mode=category_encoding.MULTI_HOT)

    # Symbolic input: batch dimension plus shape (3, 4) is rank 3.
    with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
        _ = layer(keras.Input(shape=(3, 4), dtype=dtypes.int32))

    # Concrete rank-3 array of shape (1, 2, 4).
    with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
        _ = layer(np.array([[[3, 2, 0, 1], [3, 2, 0, 1]]]))
def test_sparse_input_sparse_output_with_weights(self):
    """Sparse COUNT output sums `count_weights` per (row, token)."""
    coords = [[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]]
    sparse_tokens = sparse_tensor.SparseTensor(
        indices=coords, values=[0, 2, 1, 1, 0], dense_shape=[4, 2])
    token_input = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
    sparse_weights = sparse_tensor.SparseTensor(
        indices=coords, values=[.1, .2, .4, .3, .2], dense_shape=[4, 2])
    weight_input = keras.Input(
        shape=(None,), dtype=dtypes.float32, sparse=True)

    # Summed weights per row; X marks an entry absent from the sparse
    # output. Row 2 holds token 1 twice, so its weights add up (.4 + .3):
    #   [[.1,  X,  X, X, X, X]
    #    [ X,  X, .2, X, X, X]
    #    [ X, .7,  X, X, X, X]
    #    [.2,  X,  X, X, X, X]]
    want_indices = [[0, 0], [1, 2], [2, 1], [3, 0]]
    want_values = [.1, .2, .7, .2]
    vocab_size = 6

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size,
        output_mode=category_encoding.COUNT,
        sparse=True)
    encoded = encoder(token_input, count_weights=weight_input)
    model = keras.Model(inputs=[token_input, weight_input], outputs=encoded)
    result = model.predict([sparse_tokens, sparse_weights], steps=1)

    self.assertAllClose(want_values, result.values)
    self.assertAllEqual(want_indices, result.indices)
def test_sparse_input_with_weights(self):
    """Dense COUNT output from sparse tokens sums the supplied weights."""
    tokens = np.array([[1, 2, 3, 4], [4, 3, 1, 4]], dtype=np.int64)
    weights = np.array([[.1, .2, .3, .4], [.2, .1, .4, .3]])
    sparse_tokens = sparse_ops.from_dense(tokens)
    sparse_weights = sparse_ops.from_dense(weights)

    vocab_size = 6
    # Token 4 appears twice in the second row, so its weights add (.2 + .3).
    # pyformat: disable
    want = [[0, .1, .2, .3, .4, 0],
            [0, .4, 0, .1, .5, 0]]
    # pyformat: enable

    token_input = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
    weight_input = keras.Input(
        shape=(None,), dtype=dtypes.float32, sparse=True)
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.COUNT)
    encoded = encoder(token_input, count_weights=weight_input)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=[token_input, weight_input], outputs=encoded)
    result = model.predict([sparse_tokens, sparse_weights], steps=1)
    self.assertAllClose(want, result)
def test_dense_negative(self):
    """Predicting on a negative token id raises InvalidArgumentError."""
    bad_tokens = constant_op.constant([[1, 2, 0], [2, 2, -1]])
    vocab_size = 3

    encoder = category_encoding.CategoryEncoding(vocab_size)
    inputs = keras.Input(shape=(3,), dtype=dtypes.int32)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    range_error = ".*must be in the range 0 <= values < num_tokens.*"
    with self.assertRaisesRegex(errors.InvalidArgumentError, range_error):
        _ = model.predict(bad_tokens, steps=1)
def test_sparse_output_and_dense_layer(self):
    """Smoke test: sparse COUNT output is fed into a Dense layer."""
    tokens = constant_op.constant([[1, 2, 3], [3, 3, 0]])
    vocab_size = 4

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32)
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size,
        output_mode=category_encoding.COUNT,
        sparse=True)
    encoded = encoder(inputs)
    projected = keras.layers.Dense(units=1)(encoded)

    model = keras.Model(inputs=inputs, outputs=projected)
    # No value is asserted; the check is that predict completes.
    _ = model.predict(tokens, steps=1)
def test_legacy_max_tokens_arg(self):
    """The `max_tokens` keyword is accepted and sets the output width."""
    tokens = np.array([[1, 2, 3, 1]])
    want = [[0, 1, 1, 1, 0, 0]]
    vocab_size = 6

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32)
    encoder = category_encoding.CategoryEncoding(
        max_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    self.assertAllEqual(want, model.predict(tokens))
def test_dense_oov_input(self):
    """A token id equal to num_tokens is rejected at predict time."""
    in_range = constant_op.constant([[0, 1, 2], [0, 1, 2]])
    out_of_range = constant_op.constant([[0, 1, 2], [2, 3, 1]])
    vocab_size = 3

    encoder = category_encoding.CategoryEncoding(vocab_size)
    inputs = keras.Input(shape=(3,), dtype=dtypes.int32)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    # Call predict once on valid input to compile a graph and test control
    # flow before feeding the invalid batch.
    _ = model.predict(in_range, steps=1)

    range_error = ".*must be in the range 0 <= values < num_tokens.*"
    with self.assertRaisesRegex(errors.InvalidArgumentError, range_error):
        _ = model.predict(out_of_range, steps=1)
def test_end_to_end_bagged_modeling(self, output_mode, num_tokens):
    """Smoke test: encoder output is cast to float and fed to a Dense layer.

    Args:
      output_mode: Output mode forwarded to the CategoryEncoding layer.
      num_tokens: Forwarded to the layer; when None the element count is set
        explicitly to 5 before the layer is called.
    """
    tokens = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32)
    encoder = category_encoding.CategoryEncoding(
        num_tokens=num_tokens, output_mode=output_mode)

    if num_tokens is None:
        encoder.set_num_elements(5)
    # An empty weight list is applied regardless of `num_tokens`.
    encoder.set_weights([])

    encoded = encoder(inputs)
    as_float = backend.cast(encoded, dtype="float32")
    projected = core.Dense(64)(as_float)

    model = keras.Model(inputs=inputs, outputs=projected)
    # No value is asserted; the check is that predict completes.
    _ = model.predict(tokens)
def test_multi_hot_output(self):
    """MULTI_HOT marks each present token once, ignoring repeats."""
    tokens = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
    vocab_size = 6
    want = [
        [0, 1, 1, 1, 0, 0],
        [1, 1, 0, 1, 0, 0],
    ]

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    symbolic_in = keras.Input(shape=(None,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    result = model.predict(tokens)

    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, result)
def test_count_output(self):
    """COUNT mode reports how many times each token occurs per row."""
    tokens = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
    vocab_size = 6
    # pyformat: disable
    want = [[0, 2, 1, 1, 0, 0],
            [2, 1, 0, 1, 0, 0]]
    # pyformat: enable

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32)
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.COUNT)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    self.assertAllEqual(want, model.predict(tokens))
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding.

    Args:
      batch_size: Batch size handed to both benchmark runners.
      max_length: Width the string data is padded to via `to_tensor`.

    Returns:
      A `(keras_avg_time, fc_avg_time)` tuple as returned by the
      `fc_bm.run_keras` and `fc_bm.run_fc` helpers.
    """
    # Data and constants.
    vocab_size = 32768
    vocab = fc_bm.create_vocabulary(vocab_size)
    data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15)

    # Keras implementation: string lookup followed by count encoding.
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.string))
    model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))
    model.add(
        category_encoding.CategoryEncoding(
            num_tokens=vocab_size + 1, output_mode="count"))

    # Feature-column implementation.
    vocab_column = fcv2.categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocab, num_oov_buckets=1)
    fc = fcv2.indicator_column(vocab_column)

    # Wrap the FC implementation in a tf.function for a fair comparison.
    # NOTE(review): the transformed feature is not returned; confirm the
    # traced function still performs the transformation being timed.
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs; each implementation gets its own padded dense copy.
    padded_shape = (batch_size, max_length)
    keras_data = {
        "data": data.to_tensor(default_value="", shape=padded_shape)
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data": data.to_tensor(default_value="", shape=padded_shape)
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def test_multi_hot_output_rank_zero_input(self):
    """A scalar token is encoded as a single MULTI_HOT vector."""
    scalar_token = np.array(3)
    want = [0, 0, 0, 1, 0, 0]
    vocab_size = 6

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)

    # Direct call on the layer.
    direct_result = encoder(scalar_token)
    self.assertAllEqual(want, direct_result)

    # Same layer wrapped in a functional model.
    symbolic_in = keras.Input(shape=(4,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    model_result = model(scalar_token)
    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, model_result)
def test_one_hot_output(self):
    """ONE_HOT turns each single-token row into a one-hot vector."""
    tokens = np.array([[3], [2], [0], [1]])
    vocab_size = 4
    want = [
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
    ]

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.ONE_HOT)
    symbolic_in = keras.Input(shape=(1,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    result = model(tokens)

    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, result)
def test_distribution(self, distribution):
    """BINARY encoding built under a distribution strategy scope.

    Args:
      distribution: Strategy object; the model is constructed inside its
        `scope()` and the dataset is batched via `batch_wrapper` with it.
    """
    tokens = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
    dataset = dataset_ops.DatasetV2.from_tensor_slices(tokens)
    dataset = batch_wrapper(dataset, 2, distribution)

    # pyformat: disable
    want = [[0, 1, 1, 1, 0, 0],
            [1, 1, 0, 1, 0, 0]]
    # pyformat: enable
    vocab_size = 6
    config.set_soft_device_placement(True)

    # Only model construction happens inside the strategy scope.
    with distribution.scope():
        inputs = keras.Input(shape=(4,), dtype=dtypes.int32)
        encoder = category_encoding.CategoryEncoding(
            max_tokens=vocab_size, output_mode=category_encoding.BINARY)
        encoded = encoder(inputs)
        model = keras.Model(inputs=inputs, outputs=encoded)

    result = model.predict(dataset)
    self.assertAllEqual(want, result)
def test_sparse_input(self):
    """MULTI_HOT encoding of a sparse tensor built from a dense array."""
    dense_tokens = np.array([[1, 2, 3, 0], [0, 3, 1, 0]], dtype=np.int64)
    sparse_tokens = sparse_ops.from_dense(dense_tokens)

    vocab_size = 6
    # Column 0 is expected to stay empty even though the dense array
    # contains zeros.
    # pyformat: disable
    want = [[0, 1, 1, 1, 0, 0],
            [0, 1, 0, 1, 0, 0]]
    # pyformat: enable

    inputs = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    result = model.predict(sparse_tokens, steps=1)
    self.assertAllEqual(want, result)
def test_ragged_input(self):
    """MULTI_HOT encoding of ragged rows with different lengths."""
    ragged_tokens = ragged_factory_ops.constant([[1, 2, 3], [3, 1]])

    vocab_size = 6
    # pyformat: disable
    want = [[0, 1, 1, 1, 0, 0],
            [0, 1, 0, 1, 0, 0]]
    # pyformat: enable

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True)
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    result = model.predict(ragged_tokens, steps=1)
    self.assertAllEqual(want, result)
def test_one_hot_output_rank_one_input(self):
    """ONE_HOT on a rank-1 input yields one one-hot row per element."""
    flat_tokens = np.array([3, 2, 0, 1])
    vocab_size = 4
    want = [
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
    ]

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.ONE_HOT)

    # Direct call on the layer.
    direct_result = encoder(flat_tokens)
    self.assertAllEqual(want, direct_result)

    # Same layer wrapped in a functional model.
    symbolic_in = keras.Input(shape=(1,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    model_result = model(flat_tokens)
    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, model_result)
def test_serialize(self):
    """Smoke test: a model holding an adapted encoder can be cloned."""
    adapted_layer = category_encoding.CategoryEncoding()
    adapted_layer.adapt([1, 2, 3])
    original = keras.Sequential([adapted_layer])
    # No value is asserted; the check is that cloning completes.
    _ = keras.models.clone_model(original)