def test_hash_ragged_string_input_siphash(self):
    """Check salted hashing of ragged strings, eagerly and through a model.

    Two different salts are exercised on the same ragged batch; for each
    one the eager result is compared with a literal expectation and with
    the prediction of a functional model wrapping the same layer.
    """
    first_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    ragged_words = tf.ragged.constant(
        [
            ["omar", "stringer", "marlo", "wire"],
            ["marlo", "skywalker", "wire"],
        ],
        dtype=tf.string,
    )

    eager_result = first_layer(ragged_words)
    # Same hashed output as test_hash_dense_input_siphash
    self.assertAllEqual([[0, 1, 0, 1], [0, 0, 1]], eager_result)

    symbolic_in = input_layer.Input(
        shape=(None,), ragged=True, dtype=tf.string)
    first_model = training.Model(
        inputs=symbolic_in, outputs=first_layer(symbolic_in))
    self.assertAllClose(eager_result, first_model.predict(ragged_words))

    # A different salt is expected to produce a different bin assignment.
    second_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
    eager_result = second_layer(ragged_words)
    self.assertAllEqual([[1, 0, 1, 0], [1, 1, 0]], eager_result)

    second_model = training.Model(
        inputs=symbolic_in, outputs=second_layer(symbolic_in))
    self.assertAllClose(eager_result, second_model.predict(ragged_words))
def test_legacy_dtype_compat(self):
    """Output dtype is int64 for `dtype="float32"` and for `dtype=None`."""
    string_input = keras.Input(batch_size=16, shape=(4,), dtype="string")

    float_dtype_layer = hashing.Hashing(num_bins=3, dtype="float32")
    float_dtype_out = float_dtype_layer(string_input)
    self.assertAllEqual(float_dtype_out.dtype, tf.int64)

    # In TF1 we sometimes face an explicit dtype=None in the config.
    none_dtype_layer = hashing.Hashing(num_bins=3, dtype=None)
    none_dtype_out = none_dtype_layer(string_input)
    self.assertAllEqual(none_dtype_out.dtype, tf.int64)
def test_invalid_inputs(self):
    """Constructing the layer with bad `num_bins` or `salt` raises ValueError."""
    bins_message = "cannot be `None`"
    salt_message = "can only be a tuple of size 2"

    # Each entry pairs the expected error regex with a deferred construction,
    # so the constructor arguments are evaluated inside the assertion context.
    failing_constructions = (
        (bins_message, lambda: hashing.Hashing(num_bins=None)),
        (bins_message, lambda: hashing.Hashing(num_bins=-1)),
        (salt_message, lambda: hashing.Hashing(num_bins=2, salt="string")),
        (salt_message, lambda: hashing.Hashing(num_bins=2, salt=[1])),
        (
            salt_message,
            lambda: hashing.Hashing(
                num_bins=1, salt=tf.constant([133, 137])),
        ),
    )
    for expected_regex, build_layer in failing_constructions:
        with self.assertRaisesRegex(ValueError, expected_regex):
            build_layer()
def test_hash_dense_input_mask_value_farmhash(self):
    """Check dense string hashing when a `mask_value` is configured."""
    layer_masking_empty = hashing.Hashing(num_bins=3, mask_value="")
    layer_masking_omar = hashing.Hashing(num_bins=3, mask_value="omar")
    words = np.asarray(
        [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])

    hashed_with_empty_mask = layer_masking_empty(words)
    hashed_with_omar_mask = layer_masking_omar(words)

    # Outputs should be one more than test_hash_dense_input_farmhash (the
    # zeroth bin is now reserved for masks).
    self.assertAllClose([[1], [1], [2], [1], [1]], hashed_with_empty_mask)
    # 'omar' should map to 0.
    self.assertAllClose([[0], [1], [2], [1], [1]], hashed_with_omar_mask)
def test_hash_dense_input_siphash(self):
    """Check salted hashing of a dense string column under two salts."""
    words = None  # placeholder so the layer is still constructed first
    salted_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    words = np.asarray(
        [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])

    hashed = salted_layer(words)
    # Assert equal for hashed output that should be true on all platforms.
    # Note the result is different from FarmHash.
    self.assertAllClose([[0], [1], [0], [1], [0]], hashed)

    resalted_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
    rehashed = resalted_layer(words)
    # Note the result is different from (133, 137).
    self.assertAllClose([[1], [0], [1], [0], [1]], rehashed)
def test_hash_dense_multi_inputs_siphash(self):
    """Check salted hashing of a list of two dense string inputs."""
    salted_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    names = np.asarray(
        [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])
    letters = np.asarray([["A"], ["B"], ["C"], ["D"], ["E"]])

    hashed = salted_layer([names, letters])
    # Assert equal for hashed output that should be true on all platforms.
    # Note the result is different from FarmHash.
    self.assertAllClose([[0], [1], [0], [0], [1]], hashed)

    resalted_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
    rehashed = resalted_layer([names, letters])
    # Note the result is different from (133, 137).
    self.assertAllClose([[1], [1], [1], [0], [1]], rehashed)
def test_hash_ragged_input_mask_value(self):
    """Check ragged string hashing when a `mask_value` is configured."""
    layer_masking_empty = hashing.Hashing(num_bins=3, mask_value="")
    layer_masking_omar = hashing.Hashing(num_bins=3, mask_value="omar")
    ragged_words = tf.ragged.constant(
        [
            ["omar", "stringer", "marlo", "wire"],
            ["marlo", "skywalker", "wire"],
        ],
        dtype=tf.string,
    )

    hashed_with_empty_mask = layer_masking_empty(ragged_words)
    hashed_with_omar_mask = layer_masking_omar(ragged_words)

    # Outputs should be one more than test_hash_ragged_string_input_farmhash
    # (the zeroth bin is now reserved for masks).
    self.assertAllClose([[1, 1, 2, 1], [2, 1, 1]], hashed_with_empty_mask)
    # 'omar' should map to 0.
    self.assertAllClose([[0, 1, 2, 1], [2, 1, 1]], hashed_with_omar_mask)
def test_hash_dense_multi_inputs_mask_value_farmhash(self):
    """A list of inputs combined with `mask_value` raises ValueError."""
    masking_layer = hashing.Hashing(num_bins=3, mask_value="omar")
    names = np.asarray(
        [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])
    letters = np.asarray([["A"], ["B"], ["C"], ["D"], ["E"]])

    with self.assertRaisesRegex(ValueError, "not supported yet"):
        masking_layer([names, letters])
def bm_layer_implementation(self, batch_size):
    """Time the Hashing layer on generated batches and report the result.

    The layer is called on every batch of a freshly built dataset, repeated
    several times. The average per-batch wall time is reported together with
    its difference from `run_dataset_implementation(batch_size)`.

    Args:
        batch_size: Number of elements per dataset batch; also used in the
            reported benchmark name.
    """
    word_input = keras.Input(shape=(None,), dtype=tf.string, name="word")
    layer = hashing.Hashing(num_bins=2)
    # Call the layer once on a symbolic input before timing anything.
    _ = layer(word_input)

    num_repeats = 5
    num_batches = 5
    elapsed_per_repeat = []
    for _ in range(num_repeats):
        ds = tf.data.Dataset.from_generator(
            word_gen, tf.string, tf.TensorShape([]))
        ds = ds.shuffle(batch_size * 100)
        ds = ds.batch(batch_size)
        ds = ds.take(num_batches)
        ds = ds.prefetch(num_batches)

        started_at = time.time()
        # Benchmarked code begins here.
        for batch in ds:
            _ = layer(batch)
        # Benchmarked code ends here.
        finished_at = time.time()
        elapsed_per_repeat.append(finished_at - started_at)

    avg_time = np.mean(np.array(elapsed_per_repeat)) / num_batches
    name = "hashing|batch_%s" % batch_size
    baseline = self.run_dataset_implementation(batch_size)
    delta = baseline - avg_time
    extras = {
        "dataset implementation baseline": baseline,
        "delta seconds": delta,
        "delta percent": (delta / baseline) * 100,
    }
    self.report_benchmark(
        iters=num_repeats, wall_time=avg_time, extras=extras, name=name)
def test_hash_sparse_input_siphash(self):
    """Check salted hashing of a sparse string tensor under two salts."""
    salted_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
    sparse_words = tf.SparseTensor(
        indices=indices,
        values=["omar", "stringer", "marlo", "wire", "skywalker"],
        dense_shape=[3, 2],
    )

    hashed = salted_layer(sparse_words)
    self.assertAllClose(hashed.indices, indices)
    # The result should be same with test_hash_dense_input_siphash.
    self.assertAllClose([0, 1, 0, 1, 0], hashed.values)

    resalted_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
    rehashed = resalted_layer(sparse_words)
    # The result should be same with test_hash_dense_input_siphash.
    self.assertAllClose([1, 0, 1, 0, 1], rehashed.values)
def test_tensor_like_inputs(self, data_fn):
    """Hash integers wrapped by `data_fn` and compare with fixed bins.

    Args:
        data_fn: Callable that converts a Python list into the input
            container under test.
    """
    wrapped_values = data_fn([0, 1, 2, 3, 4])
    layer = hashing.Hashing(num_bins=3)
    hashed = layer(wrapped_values)
    self.assertAllEqual(hashed, [1, 0, 1, 0, 2])
def test_hash_dense_input_farmhash(self):
    """Check unsalted hashing of a dense string column into two bins."""
    layer = hashing.Hashing(num_bins=2)
    words = np.asarray(
        [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])
    hashed = layer(words)
    # Assert equal for hashed output that should be true on all platforms.
    self.assertAllClose([[0], [0], [1], [0], [0]], hashed)
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding.

    Runs the Keras `Hashing` layer and the hash-bucket feature column over
    the same generated string data.

    Args:
        batch_size: Batch size handed to both benchmark runners.
        max_length: Maximum length passed to the string-data generator and
            used as the Keras input shape.

    Returns:
        A `(keras_avg_time, fc_avg_time)` tuple.
    """
    # Data and constants.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    num_rows = batch_size * NUM_REPEATS
    data = fc_bm.create_string_data(max_length, num_rows, vocab, pct_oov=0.0)

    # Keras implementation
    model = keras.Sequential()
    ragged_input = keras.Input(
        shape=(max_length,), name="data", ragged=True, dtype=tf.string)
    model.add(ragged_input)
    model.add(hashing.Hashing(num_buckets))

    # FC implementation
    fc = tf.feature_column.categorical_column_with_hash_bucket(
        "data", num_buckets)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        cache = tf.__internal__.feature_column.FeatureTransformationCache(
            tensors)
        fc.transform_feature(cache, None)

    # Benchmark runs
    k_avg_time = fc_bm.run_keras(
        {"data": data}, model, batch_size, NUM_REPEATS)
    fc_avg_time = fc_bm.run_fc(
        {"data": data.to_sparse()}, fc_fn, batch_size, NUM_REPEATS)
    return k_avg_time, fc_avg_time
def test_hash_compute_output_signature(self):
    """The output signature keeps the input shape and reports int64."""
    expected_shape = tf.TensorShape([2, 3])
    string_spec = tf.TensorSpec(expected_shape, tf.string)
    layer = hashing.Hashing(num_bins=2)

    signature = layer.compute_output_signature(string_spec)

    self.assertEqual(signature.shape.dims, expected_shape.dims)
    self.assertEqual(signature.dtype, tf.int64)
def test_hash_dense_multi_inputs_farmhash(self):
    """Check unsalted hashing of a list of two dense string inputs."""
    layer = hashing.Hashing(num_bins=2)
    names = np.asarray(
        [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])
    letters = np.asarray([["A"], ["B"], ["C"], ["D"], ["E"]])
    hashed = layer([names, letters])
    # Assert equal for hashed output that should be true on all platforms.
    self.assertAllClose([[0], [0], [1], [1], [0]], hashed)
def test_hash_dense_list_inputs_mixed_int_string_farmhash(self):
    """Check unsalted hashing of a string input paired with an int64 input."""
    layer = hashing.Hashing(num_bins=2)
    names = np.asarray(
        [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])
    raw_numbers = np.asarray([[1], [2], [3], [4], [5]])
    numbers = raw_numbers.astype(np.int64)
    hashed = layer([names, numbers])
    # Assert equal for hashed output that should be true on all platforms.
    self.assertAllClose([[0], [1], [1], [1], [0]], hashed)
def test_hash_sparse_int_input_siphash(self):
    """Check salted hashing of a sparse integer tensor into three bins."""
    salted_layer = hashing.Hashing(num_bins=3, salt=[133, 137])
    indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
    sparse_numbers = tf.SparseTensor(
        indices=indices,
        values=[0, 1, 2, 3, 4],
        dense_shape=[3, 2],
    )
    hashed = salted_layer(sparse_numbers)
    # The sparsity pattern must be carried through untouched.
    self.assertAllClose(indices, hashed.indices)
    self.assertAllClose([1, 1, 2, 0, 1], hashed.values)
def test_hash_sparse_input_mask_value_farmhash(self):
    """Check sparse string hashing when a `mask_value` is configured."""
    layer_masking_empty = hashing.Hashing(num_bins=3, mask_value="")
    layer_masking_omar = hashing.Hashing(num_bins=3, mask_value="omar")
    indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
    sparse_words = tf.SparseTensor(
        indices=indices,
        values=["omar", "stringer", "marlo", "wire", "skywalker"],
        dense_shape=[3, 2],
    )

    hashed_with_empty_mask = layer_masking_empty(sparse_words)
    hashed_with_omar_mask = layer_masking_omar(sparse_words)

    self.assertAllClose(indices, hashed_with_omar_mask.indices)
    self.assertAllClose(indices, hashed_with_empty_mask.indices)
    # Outputs should be one more than test_hash_sparse_input_farmhash (the
    # zeroth bin is now reserved for masks).
    self.assertAllClose([1, 1, 2, 1, 1], hashed_with_empty_mask.values)
    # 'omar' should map to 0.
    self.assertAllClose([0, 1, 2, 1, 1], hashed_with_omar_mask.values)
def test_hash_sparse_multi_inputs_siphash(self):
    """Check salted hashing of a list of two sparse string tensors."""
    salted_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    indices = [[0, 0], [1, 0], [2, 0]]
    sparse_names = tf.SparseTensor(
        indices=indices,
        values=["omar", "stringer", "marlo"],
        dense_shape=[3, 1],
    )
    sparse_letters = tf.SparseTensor(
        indices=indices,
        values=["A", "B", "C"],
        dense_shape=[3, 1],
    )

    hashed = salted_layer([sparse_names, sparse_letters])
    # The values should match the first three rows of
    # test_hash_dense_multi_inputs_siphash.
    self.assertAllClose(indices, hashed.indices)
    self.assertAllClose([0, 1, 0], hashed.values)

    resalted_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
    rehashed = resalted_layer([sparse_names, sparse_letters])
    # Again matching the first three rows of
    # test_hash_dense_multi_inputs_siphash for this salt.
    self.assertAllClose([1, 1, 1], rehashed.values)
def test_hash_ragged_string_multi_inputs_siphash(self):
    """A list of ragged inputs raises ValueError."""
    salted_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    first_ragged = tf.ragged.constant(
        [
            ["omar", "stringer", "marlo", "wire"],
            ["marlo", "skywalker", "wire"],
        ],
        dtype=tf.string,
    )
    second_ragged = tf.ragged.constant(
        [
            ["omar", "stringer", "marlo", "wire"],
            ["marlo", "skywalker", "wire"],
        ],
        dtype=tf.string,
    )
    with self.assertRaisesRegex(ValueError, "not supported yet"):
        salted_layer([first_ragged, second_ragged])
def test_hash_sparse_input_farmhash(self):
    """Check unsalted hashing of a sparse string tensor into two bins."""
    layer = hashing.Hashing(num_bins=2)
    indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
    sparse_words = tf.SparseTensor(
        indices=indices,
        values=["omar", "stringer", "marlo", "wire", "skywalker"],
        dense_shape=[3, 2],
    )
    hashed = layer(sparse_words)
    # The sparsity pattern must be carried through untouched.
    self.assertAllClose(indices, hashed.indices)
    self.assertAllClose([0, 0, 1, 0, 0], hashed.values)
def test_hash_dense_list_input_farmhash(self):
    """Check unsalted hashing of plain Python lists, nested and flat."""
    layer = hashing.Hashing(num_bins=2)

    nested_words = [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]]
    hashed_nested = layer(nested_words)
    # Assert equal for hashed output that should be true on all platforms.
    self.assertAllClose([[0], [0], [1], [0], [0]], hashed_nested)

    flat_words = ["omar", "stringer", "marlo", "wire", "skywalker"]
    hashed_flat = layer(flat_words)
    # Assert equal for hashed output that should be true on all platforms.
    self.assertAllClose([0, 0, 1, 0, 0], hashed_flat)
def test_hash_sparse_multi_inputs_farmhash(self):
    """Check unsalted hashing of a list of two sparse string tensors."""
    layer = hashing.Hashing(num_bins=2)
    indices = [[0, 0], [1, 0], [2, 0]]
    sparse_names = tf.SparseTensor(
        indices=indices,
        values=["omar", "stringer", "marlo"],
        dense_shape=[3, 1],
    )
    sparse_letters = tf.SparseTensor(
        indices=indices,
        values=["A", "B", "C"],
        dense_shape=[3, 1],
    )
    hashed = layer([sparse_names, sparse_letters])
    self.assertAllClose(indices, hashed.indices)
    self.assertAllClose([0, 0, 1], hashed.values)
def test_hash_ragged_int_input_siphash(self):
    """Check salted hashing of ragged int64 data, eagerly and via a model."""
    salted_layer = hashing.Hashing(num_bins=3, salt=[133, 137])
    ragged_numbers = tf.ragged.constant(
        [[0, 1, 3, 4], [2, 1, 0]], dtype=tf.int64)

    eager_result = salted_layer(ragged_numbers)
    # Each integer lands in the same bin as in
    # test_hash_sparse_int_input_siphash.
    self.assertAllEqual([[1, 1, 0, 1], [2, 1, 1]], eager_result)

    symbolic_in = input_layer.Input(
        shape=(None,), ragged=True, dtype=tf.int64)
    model = training.Model(
        inputs=symbolic_in, outputs=salted_layer(symbolic_in))
    self.assertAllClose(eager_result, model.predict(ragged_numbers))
def test_hash_ragged_string_input_farmhash(self):
    """Check unsalted hashing of ragged strings, eagerly and via a model."""
    layer = hashing.Hashing(num_bins=2)
    ragged_words = tf.ragged.constant(
        [
            ["omar", "stringer", "marlo", "wire"],
            ["marlo", "skywalker", "wire"],
        ],
        dtype=tf.string,
    )

    eager_result = layer(ragged_words)
    # Same hashed output as test_hash_sparse_input_farmhash
    self.assertAllEqual([[0, 0, 1, 0], [1, 0, 0]], eager_result)

    symbolic_in = input_layer.Input(
        shape=(None,), ragged=True, dtype=tf.string)
    model = training.Model(inputs=symbolic_in, outputs=layer(symbolic_in))
    self.assertAllClose(eager_result, model.predict(ragged_words))
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding.

    Runs a Keras `CategoryCrossing` + `Hashing` model and the crossed
    feature column over the same two generated string inputs.

    Args:
        batch_size: Batch size handed to both benchmark runners and used as
            the leading dimension of the densified inputs.
        max_length: Maximum length passed to the string-data generator and
            used as the trailing dimension of the densified inputs.

    Returns:
        A `(keras_avg_time, fc_avg_time)` tuple.
    """
    # Data and constants.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    num_rows = batch_size * NUM_REPEATS
    data_a = fc_bm.create_string_data(
        max_length, num_rows, vocab, pct_oov=0.0)
    data_b = fc_bm.create_string_data(
        max_length, num_rows, vocab, pct_oov=0.0)

    # Keras implementation
    input_1 = keras.Input(shape=(None,), name="data_a", dtype=tf.string)
    input_2 = keras.Input(shape=(None,), name="data_b", dtype=tf.string)
    crossing_layer = category_crossing.CategoryCrossing()
    crossed_data = crossing_layer([input_1, input_2])
    hashing_layer = hashing.Hashing(num_buckets)
    hashed_data = hashing_layer(crossed_data)
    model = keras.Model([input_1, input_2], hashed_data)

    # FC implementation
    fc = tf.feature_column.crossed_column(["data_a", "data_b"], num_buckets)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        cache = tf.__internal__.feature_column.FeatureTransformationCache(
            tensors)
        fc.transform_feature(cache, None)

    def densify(ragged):
        # Pad with empty strings to a fixed (batch_size, max_length) shape.
        return ragged.to_tensor(
            default_value="", shape=(batch_size, max_length))

    # Benchmark runs
    keras_data = {"data_a": densify(data_a), "data_b": densify(data_b)}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data_a": densify(data_a), "data_b": densify(data_b)}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def test_multi_hot_output(self):
    """Check the symbolic shape and values for `output_mode='multi_hot'`."""
    values = np.array([0, 1, 2, 3, 4])

    symbolic_in = keras.Input(shape=(3,), dtype="int32")
    layer = hashing.Hashing(num_bins=3, output_mode="multi_hot")
    symbolic_out = layer(symbolic_in)
    self.assertAllEqual([None, 3], symbolic_out.shape.as_list())

    model = keras.Model(symbolic_in, symbolic_out)
    encoded = model(values)
    self.assertAllEqual([1.0, 1.0, 1.0], encoded)
def test_count_output(self):
    """Check the symbolic shape and values for `output_mode="count"`."""
    values = np.array([0, 1, 2, 3, 4])

    symbolic_in = keras.Input(shape=(3,), dtype="int32")
    layer = hashing.Hashing(num_bins=3, output_mode="count")
    symbolic_out = layer(symbolic_in)
    self.assertAllEqual([None, 3], symbolic_out.shape.as_list())

    model = keras.Model(symbolic_in, symbolic_out)
    counted = model(values)
    self.assertAllEqual([2.0, 2.0, 1.0], counted)
def test_distribution(self, distribution):
    """Build a hashing model under `distribution.scope()` and check predict.

    Args:
        distribution: Object providing the `scope()` context manager the
            model is constructed in.
    """
    words = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]])
    batched_words = tf.data.Dataset.from_tensor_slices(words).batch(
        2, drop_remainder=True)
    expected_output = [[0], [0], [1], [0]]

    tf.config.set_soft_device_placement(True)

    with distribution.scope():
        symbolic_in = keras.Input(shape=(None,), dtype=tf.string)
        layer = hashing.Hashing(num_bins=2)
        symbolic_out = layer(symbolic_in)
        model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    predictions = model.predict(batched_words)
    self.assertAllEqual(expected_output, predictions)
def test_strategy(self, strategy):
    """Build a hashing model under `strategy.scope()` and check predict.

    The test is skipped for TPU strategies when the MLIR bridge is disabled.

    Args:
        strategy: Object providing the `scope()` context manager the model
            is constructed in.
    """
    running_on_tpu = backend.is_tpu_strategy(strategy)
    # The bridge check is only evaluated for TPU strategies.
    if running_on_tpu and not tf_test_utils.is_mlir_bridge_enabled():
        self.skipTest("TPU tests require MLIR bridge")

    words = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]])
    batched_words = tf.data.Dataset.from_tensor_slices(words).batch(
        2, drop_remainder=True)
    expected_output = [[0], [0], [1], [0]]

    tf.config.set_soft_device_placement(True)

    with strategy.scope():
        symbolic_in = keras.Input(shape=(None,), dtype=tf.string)
        layer = hashing.Hashing(num_bins=2)
        symbolic_out = layer(symbolic_in)
        model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    predictions = model.predict(batched_words)
    self.assertAllEqual(expected_output, predictions)