def test_forward_backward_explicit_vocab(self):
    """Round-trips ids through a forward lookup and an inverted lookup.

    Both layers share the same explicit vocabulary. The value 203 is not
    part of it, and the test expects it to come back as -1.
    """
    vocabulary = [42, 1138, 725, 1729]
    raw_ids = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
    round_trip = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]])

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    forward = integer_lookup.IntegerLookup(vocabulary=vocabulary)
    backward = integer_lookup.IntegerLookup(
        vocabulary=vocabulary, invert=True
    )

    # Chain the two layers: ids -> indices -> ids.
    outputs = backward(forward(inputs))
    model = keras.Model(inputs=inputs, outputs=outputs)

    self.assertAllEqual(round_trip, model.predict(raw_ids))
def test_non_unique_vocab_from_file_fails(self):
    """Constructing the layer from a vocab file with a repeated term fails.

    NOTE(review): another method with this exact name appears later in this
    file. If both live in the same class, only the later definition runs --
    confirm which one is intended.
    """
    # 42 is listed twice on purpose.
    terms_with_duplicate = [42, 1138, 725, 1729, 42]
    path = self._write_to_temp_file("repeat_vocab_file", terms_with_duplicate)

    message_pattern = ".*HashTable has different value for same key.*42.*"
    with self.assertRaisesRegex(
        tf.errors.FailedPreconditionError, message_pattern
    ):
        integer_lookup.IntegerLookup(vocabulary=path)
def test_layer_with_list_input(self):
    """Calls the layer directly on a nested Python list of ints."""
    layer = integer_lookup.IntegerLookup(vocabulary=[12, 36, 1138, 42])

    # 1000 is not in the vocabulary (an OOV token).
    nested_ids = [[12, 1138, 42], [42, 1000, 36]]
    result = layer(nested_ids)

    want = np.array([[1, 3, 4], [4, 0, 2]])
    self.assertEqual(result.numpy().tolist(), want.tolist())
def test_get_vocab_returns_int(self):
    """get_vocabulary() yields np.int64 entries, led by the -1 entry."""
    layer = integer_lookup.IntegerLookup(vocabulary=[42, 1138, 725, 1729])

    returned = layer.get_vocabulary()

    self.assertAllEqual([-1, 42, 1138, 725, 1729], returned)
    self.assertIsInstance(returned[0], np.int64)
def test_too_long_vocab_fails_in_single_setting(self):
    """set_vocabulary() rejects a vocabulary that exceeds max_tokens."""
    layer = integer_lookup.IntegerLookup(max_tokens=4, num_oov_indices=1)
    oversized_vocab = [42, 1138, 725, 1729]

    message_pattern = "vocabulary larger than the maximum vocab.*"
    with self.assertRaisesRegex(ValueError, message_pattern):
        layer.set_vocabulary(oversized_vocab)
def test_sparse_int_input_multi_bucket(self):
    """Looks up a sparse input with a mask token and two OOV indices.

    The test expects the sparse structure (indices and dense shape) to be
    unchanged and only the stored values to be mapped.
    """
    vocabulary = np.array([10, 11, 12, 13], dtype=np.int64)
    sparse_ids = tf.SparseTensor(
        indices=[[0, 0], [1, 2]],
        # 13 is in the vocabulary; 133 is not.
        values=np.array([13, 133], dtype=np.int64),
        dense_shape=[3, 4],
    )

    inputs = keras.Input(shape=(None,), dtype=tf.int64, sparse=True)
    layer = integer_lookup.IntegerLookup(
        max_tokens=None,
        dtype=tf.int64,
        num_oov_indices=2,
        mask_token=0,
        oov_token=-1,
    )
    layer.set_vocabulary(vocabulary)
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    result = model.predict(sparse_ids, steps=1)

    self.assertAllEqual([[0, 0], [1, 2]], result.indices)
    self.assertAllEqual([6, 2], result.values)
    self.assertAllEqual([3, 4], result.dense_shape)
def test_vocabulary_persistence_across_saving(self):
    """A saved and reloaded model produces the same lookup output."""
    vocabulary = [42, 1138, 725, 1729]
    raw_ids = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
    want = [[1, 2, 3, 4], [4, 3, 1, 0]]

    # Golden model: build it and check it before anything is saved.
    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=1)
    layer.set_vocabulary(vocabulary)
    model = keras.Model(inputs=inputs, outputs=layer(inputs))
    self.assertAllEqual(model.predict(raw_ids), want)

    # Write the model out in the "tf" save format.
    saved_dir = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
    model.save(saved_dir, save_format="tf")

    # Drop the session and graph so the reloaded model is rebuilt from
    # scratch.
    # TODO(b/149526183): Can't clear session when TF2 is disabled.
    if tf.__internal__.tf2.enabled():
        keras.backend.clear_session()

    restored = keras.models.load_model(
        saved_dir,
        custom_objects={"IntegerLookup": integer_lookup.IntegerLookup},
    )

    # A distinct object proves the save/load round trip really happened.
    self.assertIsNot(model, restored)

    # The restored model must reproduce the golden output.
    self.assertAllEqual(restored.predict(raw_ids), want)
def test_ragged_adapt(self):
    """adapt() accepts a dataset holding a single ragged tensor."""
    ragged_ids = tf.ragged.constant([[203], [1729, 203]])
    dataset = tf.data.Dataset.from_tensors(ragged_ids)

    layer = integer_lookup.IntegerLookup()
    layer.adapt(dataset)

    # 203 occurs twice and 1729 once in the adapt data.
    self.assertAllEqual([-1, 203, 1729], layer.get_vocabulary())
def test_sparse_output(self):
    """With sparse=True the symbolic multi-hot output is a sparse tensor.

    The layer is called on a symbolic `keras.Input`, and the class name of
    the result is compared against "SparseKerasTensor".
    """
    vocab_data = [2, 3, 4, 5]
    input_data = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(
        vocabulary=vocab_data, output_mode="multi_hot", sparse=True
    )
    res = layer(input_data)
    # assertTrue(value, msg) only checks that `value` is truthy and treats
    # the second argument as the failure message, so it could never fail
    # here. assertEqual performs the intended comparison.
    self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
def test_sparse_adapt(self):
    """adapt() accepts a dataset holding a single sparse tensor."""
    sparse_ids = tf.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 2]],
        values=[203, 1729, 203],
        dense_shape=[3, 4],
    )
    dataset = tf.data.Dataset.from_tensors(sparse_ids)

    layer = integer_lookup.IntegerLookup()
    layer.adapt(dataset)

    # 203 occurs twice and 1729 once among the stored values.
    self.assertAllEqual([-1, 203, 1729], layer.get_vocabulary())
def test_tensor_vocab(self):
    """A tensor vocabulary works eagerly but cannot be set in tf.function."""
    terms = [-1, 42, 1138, 725, 1729]
    terms_tensor = tf.constant(terms, tf.int64)

    layer = integer_lookup.IntegerLookup(vocabulary=terms_tensor)
    self.assertAllEqual(terms, layer.get_vocabulary())
    self.assertAllEqual(layer.vocabulary_size(), 5)

    # Setting the same tensor vocabulary from inside a traced function is
    # expected to raise.
    @tf.function
    def set_vocab_in_graph():
        return layer.set_vocabulary(terms_tensor)

    with self.assertRaisesRegex(
        RuntimeError, "Cannot set a tensor vocabulary"
    ):
        set_vocab_in_graph()
def test_int_output_explicit_vocab(self):
    """Integer output for a layer built with an explicit vocabulary list."""
    vocabulary = [42, 1138, 725, 1729]
    # 203 is not in the vocabulary; the test expects index 0 for it.
    raw_ids = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
    want = [[1, 2, 3, 4], [4, 3, 1, 0]]

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(vocabulary=vocabulary)
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    self.assertAllEqual(want, model.predict(raw_ids))
def test_int_output_with_mask(self):
    """Integer output when the layer is configured with mask_token=0."""
    vocabulary = [42, 1138, 725, 1729]
    # 203 is not in the vocabulary; the test expects index 1 for it.
    raw_ids = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
    want = [[2, 3, 4, 5], [5, 4, 2, 1]]

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(max_tokens=None, mask_token=0)
    layer.set_vocabulary(vocabulary)
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    self.assertAllEqual(want, model.predict(raw_ids))
def test_count_output(self):
    """output_mode="count" yields per-row occurrence counts per index."""
    vocabulary = [2, 3, 4, 5]
    # Second row: 0, 1 and 6 are not in the vocabulary, so the test expects
    # all three to be counted in column 0.
    raw_ids = np.array([[2, 2, 3, 4], [0, 1, 5, 6]])
    want = [[0, 2, 1, 1, 0], [3, 0, 0, 0, 1]]

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(
        vocabulary=vocabulary, output_mode="count"
    )
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    self.assertAllEqual(want, model.predict(raw_ids))
def test_inverse_output(self):
    """An invert=True layer maps indices back to vocabulary values."""
    # Position in this list is the index the inverted layer maps from.
    vocabulary = [0, -1, 42, 1138, 725, 1729]
    indices = np.array([[2, 3, 4, 5], [5, 4, 2, 1]])
    want = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]])

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(invert=True)
    layer.set_vocabulary(vocabulary)
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    self.assertAllEqual(want, model.predict(indices))
def test_int_output_explicit_vocab_from_file(self):
    """Integer output for a layer whose vocabulary is read from a file."""
    path = self._write_to_temp_file("vocab_file", [42, 1138, 725, 1729])

    # 203 is not written to the vocabulary file.
    raw_ids = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
    want = [[2, 3, 4, 5], [5, 4, 2, 1]]

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(vocabulary=path)
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    self.assertAllEqual(want, model.predict(raw_ids))
def test_ragged_int_input(self):
    """Integer lookup over a ragged input keeps the ragged row layout."""
    vocabulary = np.array([10, 11, 12, 13], dtype=np.int64)
    # 42 is not in the vocabulary; the test expects index 0 for it.
    ragged_ids = tf.ragged.constant(
        [[10, 11, 13], [13, 12, 10, 42]], dtype=np.int64
    )
    want = [[1, 2, 4], [4, 3, 1, 0]]

    inputs = keras.Input(shape=(None,), dtype=tf.int64, ragged=True)
    layer = integer_lookup.IntegerLookup(max_tokens=None)
    layer.set_vocabulary(vocabulary)
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    self.assertAllEqual(want, model.predict(ragged_ids))
def test_ragged_int_input_multi_bucket(self):
    """Ragged integer lookup with a mask token and two OOV indices.

    The expected output maps the first vocabulary term (10) to index 3,
    i.e. three leading indices are reserved. With `num_oov_indices=2` that
    only adds up when a mask slot is present, so the mask token is
    requested explicitly, matching the sparse multi-bucket test in this
    file.
    """
    vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
    # 133 is not in the vocabulary; the test expects index 2 for it.
    input_array = tf.ragged.constant(
        [[10, 11, 13], [13, 12, 10, 133]], dtype=np.int64
    )
    expected_output = [[3, 4, 6], [6, 5, 3, 2]]

    input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True)
    # `max_tokens` is the keyword every other test in this file uses; the
    # original `max_values` spelling appears nowhere else.
    # NOTE(review): `mask_token=0` is explicit so the expected indices do
    # not depend on the library default -- confirm against the installed
    # IntegerLookup signature.
    layer = integer_lookup.IntegerLookup(
        max_tokens=None, num_oov_indices=2, mask_token=0
    )
    layer.set_vocabulary(vocab_data)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)

    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)
def test_single_int_generator_dataset(self):
    """adapt() runs on a generator-backed dataset of scalar ints.

    The test makes no assertions; it passes if nothing raises.
    """

    def random_ints():
        # Endless stream of random integers in [0, 100].
        while True:
            yield random.randint(0, 100)

    stream = tf.data.Dataset.from_generator(
        random_ints, tf.int64, tf.TensorShape([])
    )
    # Only the first two elements are used for adapting.
    first_two = stream.take(2)

    scalar_input = keras.Input(shape=(), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(
        max_tokens=10,
        num_oov_indices=0,
        mask_token=None,
        oov_token=None,
    )
    # The layer is called on a symbolic input before adapt().
    layer(scalar_input)
    layer.adapt(first_two)
def test_int_output_no_oov(self):
    """With num_oov_indices=0, out-of-vocabulary inputs raise an error."""
    vocabulary = [42, 1138, 725, 1729]
    # Every value here is a vocabulary term or the mask token 0.
    in_vocab_ids = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 0]])
    # 203 is neither a vocabulary term nor the mask token.
    out_of_vocab_ids = np.array([[42, 1138, 725, 203], [1729, 725, 42, 203]])
    want = [[1, 2, 3, 4], [4, 3, 1, 0]]

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(
        vocabulary=vocabulary, mask_token=0, num_oov_indices=0
    )
    model = keras.Model(inputs=inputs, outputs=layer(inputs))

    self.assertAllEqual(want, model.predict(in_vocab_ids))

    with self.assertRaisesRegex(
        tf.errors.InvalidArgumentError, "found OOV values.*203"
    ):
        model.predict(out_of_vocab_ids)
def test_zero_max_tokens_fails(self):
    """Constructing the layer with max_tokens=0 raises ValueError."""
    message_pattern = ".*max_tokens.*"
    with self.assertRaisesRegex(ValueError, message_pattern):
        integer_lookup.IntegerLookup(max_tokens=0, num_oov_indices=1)
def test_no_vocab(self):
    """Using a layer that never received a vocabulary raises ValueError.

    NOTE(review): another method named `test_no_vocab` appears later in
    this file. If both live in the same class, only the later definition
    runs -- confirm which one is intended.
    """
    message_pattern = "You must set the layer's vocabulary"
    # Construction and the call both sit inside the context, so an error
    # raised by either one satisfies the assertion.
    with self.assertRaisesRegex(ValueError, message_pattern):
        unconfigured = integer_lookup.IntegerLookup()
        unconfigured([[1]])
def test_output_shape(self):
    """The symbolic output keeps the non-batch dimensions of its input."""
    inputs = keras.Input(shape=(4,), dtype=tf.int64)
    layer = integer_lookup.IntegerLookup(max_tokens=2, num_oov_indices=1)

    outputs = layer(inputs)

    # Compare everything after the leading (batch) dimension.
    self.assertAllEqual(outputs.shape[1:], inputs.shape[1:])
def test_non_unique_vocab_fails(self):
    """A vocabulary list containing a repeated term raises ValueError."""
    # 1729 is listed twice on purpose.
    terms_with_duplicate = [42, 1138, 725, 1729, 1729]
    message_pattern = ".*repeated term.*1729.*"
    with self.assertRaisesRegex(ValueError, message_pattern):
        integer_lookup.IntegerLookup(vocabulary=terms_with_duplicate)
def test_non_unique_vocab_from_file_fails(self):
    """A vocab file containing a repeated term raises ValueError.

    NOTE(review): an earlier method in this file has the same name but
    expects a different exception type. If both live in the same class,
    only this later definition runs -- confirm which one is intended.
    """
    # 42 is listed twice on purpose.
    terms_with_duplicate = [42, 1138, 725, 1729, 42]
    path = self._write_to_temp_file("repeat_vocab_file", terms_with_duplicate)

    message_pattern = ".*repeated term.*42.*"
    with self.assertRaisesRegex(ValueError, message_pattern):
        integer_lookup.IntegerLookup(vocabulary=path)
def test_no_vocab(self):
    """Using a "binary" layer with no vocabulary raises RuntimeError.

    NOTE(review): an earlier method in this file is also named
    `test_no_vocab`. If both live in the same class, only this later
    definition runs -- confirm which one is intended.
    """
    message_pattern = "you must set the layer's vocabulary"
    # Construction and the call both sit inside the context, so an error
    # raised by either one satisfies the assertion.
    with self.assertRaisesRegex(RuntimeError, message_pattern):
        unconfigured = integer_lookup.IntegerLookup(output_mode="binary")
        unconfigured([[1]])