def test_legacy_dtype_compat(self):
    """Output stays int64 even with a legacy float or None compute dtype."""
    inputs = keras.Input(batch_size=16, shape=(4,), dtype="float32")
    layer = discretization.Discretization(
        bin_boundaries=[-0.5, 0.5, 1.5], dtype="float32")
    self.assertAllEqual(layer(inputs).dtype, tf.int64)
    # In TF1 we sometimes face an explicit dtype=None in the config; the
    # output dtype must still be int64.
    layer = discretization.Discretization(
        bin_boundaries=[-0.5, 0.5, 1.5], dtype=None)
    self.assertAllEqual(layer(inputs).dtype, tf.int64)
def bm_adapt_implementation(self, num_elements, batch_size):
    """Test the KPL adapt implementation."""
    input_t = keras.Input(shape=(1,), dtype=tf.float32)
    layer = discretization.Discretization()
    _ = layer(input_t)

    num_repeats = 5
    start_times = []
    end_times = []
    for _ in range(num_repeats):
        ds = tf.data.Dataset.range(num_elements)
        ds = ds.map(lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1))
        ds = ds.batch(batch_size)
        start_times.append(time.time())
        # Benchmarked code begins here.
        layer.adapt(ds)
        # Benchmarked code ends here.
        end_times.append(time.time())

    avg_time = np.mean(np.array(end_times) - np.array(start_times))
    name = "discretization_adapt|%s_elements|batch_%s" % (
        num_elements,
        batch_size,
    )
    # Compare against the raw tf.data reduction baseline.
    baseline = self.run_dataset_implementation(num_elements, batch_size)
    extras = {
        "tf.data implementation baseline": baseline,
        "delta seconds": (baseline - avg_time),
        "delta percent": ((baseline - avg_time) / baseline) * 100,
    }
    self.report_benchmark(
        iters=num_repeats, wall_time=avg_time, extras=extras, name=name)
def test_layer_computation(self, adapt_data, test_data, use_dataset,
                           expected, num_bins=5, epsilon=0.01):
    """Adapt the layer on `adapt_data`, then check outputs on `test_data`.

    Args:
      adapt_data: Array of data to adapt on (shuffled in place).
      test_data: Array fed through the adapted layer.
      use_dataset: If True, wrap both arrays in batched `tf.data.Dataset`s.
      expected: Expected bucketized output.
      num_bins: Number of bins for the layer.
      epsilon: Quantile-approximation error tolerance for the layer.
    """
    input_shape = tuple(list(test_data.shape)[1:])
    np.random.shuffle(adapt_data)
    if use_dataset:
        # Keras APIs expect batched datasets
        adapt_data = tf.data.Dataset.from_tensor_slices(adapt_data).batch(
            test_data.shape[0] // 2)
        test_data = tf.data.Dataset.from_tensor_slices(test_data).batch(
            test_data.shape[0] // 2)

    layer = discretization.Discretization(epsilon=epsilon, num_bins=num_bins)
    layer.adapt(adapt_data)

    input_data = keras.Input(shape=input_shape)
    output = layer(input_data)
    model = keras.Model(input_data, output)
    # Use the public `run_eagerly` property rather than poking the private
    # `_run_eagerly` attribute.
    model.run_eagerly = test_utils.should_run_eagerly()
    output_data = model.predict(test_data)
    self.assertAllClose(expected, output_data)
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    max_value = 25.0
    boundaries = np.arange(1.0, max_value)
    data = fc_bm.create_data(
        max_length, batch_size * NUM_REPEATS, 100000, dtype=float)

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.float32))
    model.add(discretization.Discretization(boundaries))

    # FC implementation
    fc = tf.feature_column.bucketized_column(
        tf.feature_column.numeric_column("data"),
        boundaries=list(boundaries))

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(
            tf.__internal__.feature_column.FeatureTransformationCache(
                tensors),
            None)

    # Benchmark runs
    keras_data = {"data": data.to_tensor(default_value=0.0)}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)
    fc_data = {"data": data.to_tensor(default_value=0.0)}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def run_dataset_implementation(self, num_elements, batch_size):
    """Baseline: compute bucket boundaries via a raw tf.data reduction.

    Returns:
      Average wall time (seconds) over the benchmark repeats.
    """
    input_t = keras.Input(shape=(1,))
    layer = discretization.Discretization()
    _ = layer(input_t)

    num_repeats = 5
    starts = []
    ends = []
    for _ in range(num_repeats):
        ds = tf.data.Dataset.range(num_elements)
        # Use TF2 `tf.expand_dims` (consistent with the adapt benchmark)
        # instead of the deprecated `tf.compat.v1.expand_dims`.
        ds = ds.map(lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1))
        ds = ds.batch(batch_size)
        starts.append(time.time())
        # Benchmarked code begins here.
        state = ds.reduce((np.zeros((1, 2)),), reduce_fn)
        bins = discretization.get_bucket_boundaries(state, 100)
        layer.set_weights([bins])
        # Benchmarked code ends here.
        ends.append(time.time())
    avg_time = np.mean(np.array(ends) - np.array(starts))
    return avg_time
def test_saved_model_tf(self):
    """Round-trips an adapted layer through tf.saved_model save/load."""
    input_data = [[1], [2], [3]]
    predict_data = [[0.5], [1.5], [2.5]]
    expected_output = [[0], [1], [2]]

    inputs = keras.Input(shape=(1,), dtype=tf.float32)
    layer = discretization.Discretization(num_bins=3)
    layer.adapt(input_data)
    outputs = layer(inputs)
    model = keras.Model(inputs=inputs, outputs=outputs)

    output_data = model.predict(predict_data)
    self.assertAllClose(output_data, expected_output)

    # Save the model to disk.
    output_path = os.path.join(self.get_temp_dir(), "tf_saved_model")
    tf.saved_model.save(model, output_path)
    loaded_model = tf.saved_model.load(output_path)
    serving_fn = loaded_model.signatures["serving_default"]

    # Ensure that the loaded model is unique (so that the save/load is real)
    self.assertIsNot(model, loaded_model)

    # Validate correctness of the new model.
    new_output_data = serving_fn(tf.constant(predict_data))["discretization"]
    self.assertAllClose(new_output_data, expected_output)
def test_one_hot_output_dtype(self, dtype):
    """one_hot output mode respects the dtype passed to the layer."""
    inputs = keras.Input(batch_size=16, shape=(1,), dtype="float32")
    layer = discretization.Discretization(
        bin_boundaries=[-0.5, 0.5, 1.5], output_mode="one_hot", dtype=dtype)
    outputs = layer(inputs)
    self.assertAllEqual(outputs.dtype, dtype)
def test_bucketize_with_explicit_buckets_sparse_float_input(self):
    """Sparse float values are bucketized; indices pass through unchanged."""
    indices = [[0, 1], [0, 2], [1, 1]]
    sparse_input = tf.SparseTensor(
        indices=indices, values=[-1.5, 1.0, 3.4], dense_shape=[2, 3])
    expected_output = [0, 2, 3]

    inputs = keras.Input(shape=(3,), dtype=tf.float32, sparse=True)
    layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5])
    outputs = layer(inputs)
    model = keras.Model(inputs=inputs, outputs=outputs)

    result = model.predict(sparse_input, steps=1)
    self.assertAllEqual(indices, result.indices)
    self.assertAllEqual(expected_output, result.values)
def test_bucketize_with_explicit_buckets_integer(self):
    """Float inputs with explicit boundaries yield integer bucket ids."""
    input_array = np.array([[-1.5, 1.0, 3.4, 0.5], [0.0, 3.0, 1.3, 0.0]])
    expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]]
    expected_output_shape = [None, 4]

    inputs = keras.Input(shape=(4,))
    layer = discretization.Discretization(bin_boundaries=[0.0, 1.0, 2.0])
    bucketized = layer(inputs)
    self.assertAllEqual(expected_output_shape, bucketized.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=bucketized)
    self.assertAllEqual(expected_output, model.predict(input_array))
def test_merge_state(self):
    """Merging partially-adapted layers matches adapting on the full data."""
    data = np.arange(300)
    chunks = [data[:100], data[100:200], data[200:]]
    partial_datasets = [
        tf.data.Dataset.from_tensor_slices(chunk) for chunk in chunks
    ]
    full_ds = partial_datasets[0].concatenate(
        partial_datasets[1]).concatenate(partial_datasets[2])

    # Use a higher epsilon to avoid any discrepancies from the quantile
    # approximation.
    full_layer = discretization.Discretization(num_bins=3, epsilon=0.001)
    full_layer.adapt(full_ds.batch(2))

    partial_layers = []
    for ds in partial_datasets:
        layer = discretization.Discretization(num_bins=3, epsilon=0.001)
        layer.adapt(ds.batch(2))
        partial_layers.append(layer)
    merged_layer = partial_layers[0]
    merged_layer.merge_state(partial_layers[1:])

    self.assertAllClose(full_layer(data), merged_layer(data))
def test_bucketize_with_explicit_buckets_int_input(self):
    """int64 inputs with explicit boundaries yield integer bucket ids."""
    input_array = np.array([[-1, 1, 3, 0], [0, 3, 1, 0]], dtype=np.int64)
    expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]]
    expected_output_shape = [None, 4]

    input_data = keras.Input(shape=(4,), dtype=tf.int64)
    # Use the canonical `bin_boundaries` argument rather than the deprecated
    # `bins` alias, consistent with the other tests in this file.
    layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5])
    bucket_data = layer(input_data)
    self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list())

    model = keras.Model(inputs=input_data, outputs=bucket_data)
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)
def test_bucketize_with_explicit_buckets_ragged_int_input(self):
    """Ragged int64 inputs keep their row partitions after bucketizing."""
    ragged_input = tf.ragged.constant([[-1, 1, 3, 0], [0, 3, 1]],
                                      dtype=tf.int64)
    expected_output = [[0, 2, 3, 1], [1, 3, 2]]
    expected_output_shape = [None, None]

    inputs = keras.Input(shape=(None,), ragged=True, dtype=tf.int64)
    layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5])
    bucketized = layer(inputs)
    self.assertAllEqual(expected_output_shape, bucketized.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=bucketized)
    self.assertAllEqual(expected_output, model.predict(ragged_input))
def test_bucketize_with_explicit_buckets_ragged_float_input(self):
    """Ragged float inputs keep their row partitions after bucketizing."""
    input_array = tf.ragged.constant([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3]])
    expected_output = [[0, 1, 3, 1], [0, 3, 2]]
    expected_output_shape = [None, None]

    input_data = keras.Input(shape=(None,), ragged=True)
    # Use the canonical `bin_boundaries` argument rather than the deprecated
    # `bins` alias, consistent with the other tests in this file.
    layer = discretization.Discretization(bin_boundaries=[0., 1., 2.])
    bucket_data = layer(input_data)
    self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list())

    model = keras.Model(inputs=input_data, outputs=bucket_data)
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)
def test_count_output(self):
    """output_mode='count' returns per-bin occurrence counts."""
    input_data = np.array([-1.5, 1.0, 3.4, 3.5])
    # Boundaries [0, 1, 2] give 4 bins; -1.5 lands in bin 0, 1.0 in bin 2,
    # and both 3.4 and 3.5 in bin 3.
    expected_output = [1.0, 0.0, 1.0, 2.0]
    expected_output_shape = [None, 4]

    inputs = keras.Input(shape=(4,))
    layer = discretization.Discretization(
        bin_boundaries=[0.0, 1.0, 2.0], output_mode="count")
    outputs = layer(inputs)
    self.assertAllEqual(expected_output_shape, outputs.shape.as_list())

    model = keras.Model(inputs, outputs)
    self.assertAllEqual(expected_output, model(input_data))
def test_distribution(self, distribution):
    """Layer computes correct buckets under a distribution strategy scope."""
    input_array = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
    expected_output = [[0, 1, 3, 1], [0, 3, 2, 0]]
    expected_output_shape = [None, 4]

    tf.config.set_soft_device_placement(True)

    with distribution.scope():
        input_data = keras.Input(shape=(4,))
        # Use the canonical `bin_boundaries` argument rather than the
        # deprecated `bins` alias, consistent with the other tests here.
        layer = discretization.Discretization(bin_boundaries=[0., 1., 2.])
        bucket_data = layer(input_data)
        self.assertAllEqual(expected_output_shape,
                            bucket_data.shape.as_list())
        model = keras.Model(inputs=input_data, outputs=bucket_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
def test_multiple_adapts(self):
    """Re-adapting updates the state used by subsequent predictions."""
    first_adapt = [[1], [2], [3]]
    second_adapt = [[4], [5], [6]]
    predict_input = [[2], [2]]
    expected_first_output = [[2], [2]]
    expected_second_output = [[0], [0]]

    inputs = keras.Input(shape=(1,), dtype=tf.int32)
    layer = discretization.Discretization(num_bins=3)
    layer.adapt(first_adapt)
    outputs = layer(inputs)
    model = keras.Model(inputs=inputs, outputs=outputs)

    actual_output = model.predict(predict_input)
    self.assertAllClose(actual_output, expected_first_output)

    # Re-adapt the layer on new inputs.
    layer.adapt(second_adapt)
    # Re-compile the model.
    model.compile()
    # `predict` should now use the new model state.
    actual_output = model.predict(predict_input)
    self.assertAllClose(actual_output, expected_second_output)
def test_output_dtype(self):
    """Bucket indices come out as int64 for the default output mode."""
    inputs = keras.Input(batch_size=16, shape=(4,), dtype=tf.int64)
    layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5])
    self.assertAllEqual(layer(inputs).dtype, tf.int64)
def test_merge_with_unadapted_layers_fails(self):
    """merge_state raises when a layer in the list was never adapted."""
    adapted = discretization.Discretization(num_bins=2, name="layer1")
    adapted.adapt([1, 2, 3])
    unadapted = discretization.Discretization(num_bins=2, name="layer2")
    with self.assertRaisesRegex(ValueError, "Cannot merge.*layer2"):
        adapted.merge_state([unadapted])
def test_output_shape(self):
    """Bucketizing preserves the static (batch, features) shape."""
    inputs = keras.Input(batch_size=16, shape=(4,), dtype=tf.int64)
    layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5])
    outputs = layer(inputs)
    self.assertAllEqual(outputs.shape.as_list(), [16, 4])
def test_num_bins_and_bins_set_fails(self):
    """Passing both `num_bins` and explicit boundaries is rejected."""
    with self.assertRaisesRegex(
        ValueError,
        r"`num_bins` and `bin_boundaries` should not be set.*5.*\[1, 2\]"):
        # Use the canonical `bin_boundaries` argument rather than the
        # deprecated `bins` alias; the expected error message already
        # refers to `bin_boundaries`.
        _ = discretization.Discretization(num_bins=5, bin_boundaries=[1, 2])
def test_num_bins_negative_fails(self):
    """A negative `num_bins` is rejected at construction time."""
    with self.assertRaisesRegex(ValueError,
                                "`num_bins` must be.*num_bins=-7"):
        _ = discretization.Discretization(num_bins=-7)