def define_kpls_for_training(self, use_adapt):
    """Build the feature and label lookup models used for training.

    Args:
      use_adapt: when truthy, the lookup layers learn their vocabulary
        through `adapt()`; otherwise the vocabulary is handed to the
        layer constructor.

    Returns:
      A `(feature_model, label_model)` pair of Keras models, each wrapping
      a single `StringLookup` layer.
    """
    # Define KPLs under strategy's scope. Right now, if they have look up
    # tables, they will be created on the client. Their variables will be
    # created on PS. Ideally they should be cached on each worker since they
    # will not be changed in a training step.
    if not use_adapt:
        feature_lookup = string_lookup.StringLookup(
            vocabulary=FEATURE_VOCAB, num_oov_indices=1)
        label_lookup = string_lookup.StringLookup(
            vocabulary=LABEL_VOCAB, num_oov_indices=0, mask_token=None)
    else:
        feature_lookup = string_lookup.StringLookup(num_oov_indices=1)
        feature_lookup.adapt(FEATURE_VOCAB)
        label_lookup = string_lookup.StringLookup(
            num_oov_indices=0, mask_token=None)
        label_lookup.adapt(LABEL_VOCAB)

    feature_in = keras.layers.Input(
        shape=(3,), dtype=tf.string, name="feature", ragged=True)
    feature_ids = feature_lookup(feature_in)
    # Model creates variables as well.
    feature_model = keras.Model({"features": feature_in}, feature_ids)

    label_in = keras.layers.Input(shape=(1,), dtype=tf.string, name="label")
    label_ids = label_lookup(label_in)
    label_model = keras.Model({"label": label_in}, label_ids)

    return feature_model, label_model
def test_get_vocab_returns_str(self):
    """`get_vocabulary()` returns `str` tokens for forward and inverted layers."""
    tokens = ["earth", "wind", "and", "fire"]
    full_vocab = ["", "[UNK]", "earth", "wind", "and", "fire"]

    # Forward layer built from the bare token list.
    forward = string_lookup.StringLookup(vocabulary=tokens)
    forward_vocab = forward.get_vocabulary()
    self.assertAllEqual(full_vocab, forward_vocab)
    self.assertIsInstance(forward_vocab[0], str)

    # Inverted layer built from the forward layer's reported vocabulary.
    inverted = string_lookup.StringLookup(
        vocabulary=forward.get_vocabulary(), invert=True)
    inverted_vocab = inverted.get_vocabulary()
    self.assertAllEqual(full_vocab, inverted_vocab)
    self.assertIsInstance(inverted_vocab[0], str)
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding.

    Returns:
      A `(keras_avg_time, fc_avg_time)` tuple as reported by
      `fc_bm.run_keras` and `fc_bm.run_fc`.
    """
    # Data and constants.
    vocab = fc_bm.create_vocabulary(32768)
    num_examples = batch_size * NUM_REPEATS
    data = fc_bm.create_string_data(
        max_length, num_examples, vocab, pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    ragged_input = keras.Input(
        shape=(max_length,), name="data", ragged=True, dtype=tf.string)
    model.add(ragged_input)
    lookup = string_lookup.StringLookup(vocabulary=vocab, mask_token=None)
    model.add(lookup)

    # FC implementation
    fc = tf.feature_column.sequence_categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocab, num_oov_buckets=1)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        cache = tf.__internal__.feature_column.FeatureTransformationCache(
            tensors)
        fc.transform_feature(cache, None)

    # Benchmark runs
    k_avg_time = fc_bm.run_keras(
        {"data": data}, model, batch_size, NUM_REPEATS)
    fc_avg_time = fc_bm.run_fc(
        {"data": data.to_sparse()}, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def test_forward_backward_explicit_vocab(self):
    """Chaining a lookup with its inverse maps unknown tokens to "[UNK]"."""
    vocab = ["earth", "wind", "and", "fire"]
    raw_strings = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    round_tripped = np.array([["earth", "wind", "and", "fire"],
                              ["fire", "and", "earth", "[UNK]"]])

    inputs = keras.Input(shape=(None,), dtype=tf.string)
    forward = string_lookup.StringLookup(vocabulary=vocab)
    backward = string_lookup.StringLookup(vocabulary=vocab, invert=True)

    # string -> int -> string
    outputs = backward(forward(inputs))
    model = keras.Model(inputs=inputs, outputs=outputs)

    self.assertAllEqual(round_tripped, model.predict(raw_strings))
def test_non_unique_vocab_from_file_fails(self):
    """A vocabulary file with a repeated term is rejected at construction."""
    duplicated_terms = ["earth", "wind", "and", "fire", "earth"]
    path = self._write_to_temp_file("repeat_vocab_file", duplicated_terms)

    expected_error = tf.errors.FailedPreconditionError
    expected_regex = "HashTable has different value for same key.*earth"
    with self.assertRaisesRegex(expected_error, expected_regex):
        _ = string_lookup.StringLookup(vocabulary=path)
def test_sparse_output(self):
    """A `sparse=True` multi-hot lookup yields a `SparseKerasTensor`."""
    vocab_data = ["earth", "wind", "and", "fire"]

    input_data = keras.Input(shape=(None,), dtype=tf.string)
    layer = string_lookup.StringLookup(
        vocabulary=vocab_data, output_mode="multi_hot", sparse=True)
    res = layer(input_data)

    # `assertTrue(a, b)` treats `b` as the failure message and passes for any
    # non-empty string, so the class name was never actually compared.
    # `assertEqual` performs the intended check.
    self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
def define_reverse_lookup_layer(self):
    """Return an inverted `StringLookup` over `LABEL_VOCAB`.

    Only needed for serving.
    """
    return string_lookup.StringLookup(
        vocabulary=LABEL_VOCAB,
        invert=True,
        num_oov_indices=0,
        mask_token=None,
    )
def define_reverse_lookup_layer(self):
    """Create string reverse lookup layer for serving.

    Returns:
      An inverted `StringLookup` built from `self.LABEL_VOCAB`, with no OOV
      indices and no mask token.
    """
    return string_lookup.StringLookup(
        vocabulary=self.LABEL_VOCAB,
        invert=True,
        num_oov_indices=0,
        mask_token=None,
    )
def test_tensor_vocab(self):
    """A tensor vocabulary works eagerly but cannot be set inside `tf.function`."""
    tokens = ["[UNK]", "wind", "and", "fire"]
    token_tensor = tf.constant(tokens)

    layer = string_lookup.StringLookup(vocabulary=token_tensor)
    self.assertAllEqual(tokens, layer.get_vocabulary())
    self.assertAllEqual(layer.vocabulary_size(), 4)

    # Setting the same tensor vocabulary from traced code must fail.
    @tf.function
    def set_vocab_in_graph():
        return layer.set_vocabulary(token_tensor)

    with self.assertRaisesRegex(RuntimeError,
                                "Cannot set a tensor vocabulary"):
        set_vocab_in_graph()
def dataset_fn(input_context):
    """Return a shuffled, repeated, batched dataset with looked-up features.

    Args:
      input_context: unused; it is deleted immediately.
    """
    del input_context

    lookup_layer = string_lookup.StringLookup(
        vocabulary=filepath, num_oov_indices=1)

    features = np.array([["earth", "wind", "and", "fire"],
                         ["fire", "and", "earth", "michigan"]])
    labels = np.array([0, 1])

    def apply_lookup(x, y):
        # Only the features go through the lookup; labels pass through.
        return lookup_layer(x), y

    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(10).repeat().batch(2)
    return dataset.map(apply_lookup)
def define_kpls_for_training(self, use_adapt):
    """Function that defines KPL used for unit tests of tf.distribute.

    Args:
      use_adapt: if adapt will be called. False means there will be
        precomputed statistics.

    Returns:
      feature_mapper: a simple keras model with one keras StringLookup
        layer which maps feature to index.
      label_mapper: similar to feature_mapper, but maps label to index.
    """
    if not use_adapt:
        # Vocabulary is supplied up front through the constructor.
        feature_lookup = string_lookup.StringLookup(
            vocabulary=self.FEATURE_VOCAB, num_oov_indices=1)
        label_lookup = string_lookup.StringLookup(
            vocabulary=self.LABEL_VOCAB, num_oov_indices=0, mask_token=None)
    else:
        # Vocabulary is learned from the data via adapt().
        feature_lookup = string_lookup.StringLookup(num_oov_indices=1)
        feature_lookup.adapt(self.FEATURE_VOCAB)
        label_lookup = string_lookup.StringLookup(
            num_oov_indices=0, mask_token=None)
        label_lookup.adapt(self.LABEL_VOCAB)

    feature_in = keras.layers.Input(
        shape=(3,), dtype=tf.string, name="feature", ragged=True)
    feature_ids = feature_lookup(feature_in)
    feature_mapper = keras.Model({"features": feature_in}, feature_ids)

    label_in = keras.layers.Input(shape=(1,), dtype=tf.string, name="label")
    label_ids = label_lookup(label_in)
    label_mapper = keras.Model({"label": label_in}, label_ids)

    return feature_mapper, label_mapper
def test_inverse_layer(self):
    """An inverted lookup maps indices to vocabulary strings; index 0 gives ""."""
    vocab = ["earth", "wind", "and", "fire"]
    indices = np.array([[2, 3, 4, 5], [5, 4, 2, 0]])
    expected_strings = np.array([["earth", "wind", "and", "fire"],
                                 ["fire", "and", "earth", ""]])

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    inverse_lookup = string_lookup.StringLookup(vocabulary=vocab, invert=True)
    model = keras.Model(inputs=inputs, outputs=inverse_lookup(inputs))

    self.assertAllEqual(expected_strings, model.predict(indices))
def test_int_output_explicit_vocab_with_special_tokens(self):
    """A vocabulary that already lists "" and "[UNK]" keeps its indices."""
    vocab_with_specials = ["", "[UNK]", "earth", "wind", "and", "fire"]
    raw_strings = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_ids = [[2, 3, 4, 5], [5, 4, 2, 1]]

    inputs = keras.Input(shape=(None,), dtype=tf.string)
    lookup = string_lookup.StringLookup(vocabulary=vocab_with_specials)
    model = keras.Model(inputs=inputs, outputs=lookup(inputs))

    self.assertAllEqual(expected_ids, model.predict(raw_strings))
def test_inverse_layer_from_file(self):
    """An inverted lookup loaded from a vocabulary file maps index 0 to "[UNK]"."""
    vocab = ["earth", "wind", "and", "fire"]
    indices = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
    expected_strings = np.array([["earth", "wind", "and", "fire"],
                                 ["fire", "and", "earth", "[UNK]"]])
    path = self._write_to_temp_file("vocab_file", vocab)

    inputs = keras.Input(shape=(None,), dtype=tf.int64)
    inverse_lookup = string_lookup.StringLookup(vocabulary=path, invert=True)
    model = keras.Model(inputs=inputs, outputs=inverse_lookup(inputs))

    self.assertAllEqual(expected_strings, model.predict(indices))
def test_count_output(self):
    """`output_mode="count"` produces per-token occurrence counts."""
    vocab = ["earth", "wind", "and", "fire"]
    raw_strings = np.array([["earth", "earth", "fire", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_counts = [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]]

    inputs = keras.Input(shape=(None,), dtype=tf.string)
    counter = string_lookup.StringLookup(
        output_mode="count", vocabulary=vocab)
    model = keras.Model(inputs=inputs, outputs=counter(inputs))

    self.assertAllEqual(expected_counts, model.predict(raw_strings))
def test_ragged_string_input_multi_bucket(self):
    """Ragged string input is looked up with two OOV indices configured."""
    vocab = ["earth", "wind", "and", "fire"]
    ragged_strings = tf.ragged.constant([["earth", "wind", "fire"],
                                         ["fire", "and", "earth", "ohio"]])
    expected_ids = [[3, 4, 6], [6, 5, 3, 2]]

    inputs = keras.Input(shape=(None,), dtype=tf.string, ragged=True)
    # The vocabulary is attached after construction rather than passed in.
    lookup = string_lookup.StringLookup(num_oov_indices=2)
    lookup.set_vocabulary(vocab)
    model = keras.Model(inputs=inputs, outputs=lookup(inputs))

    self.assertAllEqual(expected_ids, model.predict(ragged_strings))
def test_int_output_explicit_vocab_from_file(self):
    """A lookup built from a vocabulary file yields the expected int ids."""
    terms = ["earth", "wind", "and", "fire"]
    path = self._write_to_temp_file("vocab_file", terms)

    raw_strings = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_ids = [[2, 3, 4, 5], [5, 4, 2, 1]]

    inputs = keras.Input(shape=(None,), dtype=tf.string)
    lookup = string_lookup.StringLookup(vocabulary=path)
    model = keras.Model(inputs=inputs, outputs=lookup(inputs))

    self.assertAllEqual(expected_ids, model.predict(raw_strings))
def test_int_output_no_oov(self):
    """With `num_oov_indices=0`, known tokens map to ids and OOV input raises."""
    vocab = ["earth", "wind", "and", "fire"]
    in_vocab_strings = np.array([["earth", "wind", "and", "fire"],
                                 ["fire", "and", "earth", ""]])
    oov_strings = np.array([["earth", "wind", "and", "michigan"],
                            ["fire", "and", "earth", "michigan"]])
    expected_ids = [[1, 2, 3, 4], [4, 3, 1, 0]]

    inputs = keras.Input(shape=(None,), dtype=tf.string)
    lookup = string_lookup.StringLookup(
        num_oov_indices=0, mask_token="", vocabulary=vocab)
    model = keras.Model(inputs=inputs, outputs=lookup(inputs))

    # Every token (including the mask token "") is resolvable.
    self.assertAllEqual(expected_ids, model.predict(in_vocab_strings))

    # "michigan" has nowhere to go without an OOV bucket.
    with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                "found OOV values.*michigan"):
        _ = model.predict(oov_strings)
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding.

    Returns:
      A `(keras_avg_time, fc_avg_time)` tuple as reported by
      `fc_bm.run_keras` and `fc_bm.run_fc`.
    """
    # Data and constants.
    vocab_size = 32768
    vocab = fc_bm.create_vocabulary(vocab_size)
    num_examples = batch_size * NUM_REPEATS
    data = fc_bm.create_string_data(
        max_length, num_examples, vocab, pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    dense_input = keras.Input(
        shape=(max_length,), name="data", dtype=tf.string)
    model.add(dense_input)
    lookup = string_lookup.StringLookup(vocabulary=vocab, mask_token=None)
    model.add(lookup)
    # One extra token on top of the vocabulary size.
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size + 1, output_mode="count")
    model.add(encoder)

    # FC implementation
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocab, num_oov_buckets=1)
    fc = tf.feature_column.indicator_column(categorical)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        cache = fcv2.FeatureTransformationCache(tensors)
        fc.transform_feature(cache, None)

    # Benchmark runs
    dense_shape = (batch_size, max_length)
    keras_data = {
        "data": data.to_tensor(default_value="", shape=dense_shape),
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data": data.to_tensor(default_value="", shape=dense_shape),
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def test_non_unique_vocab_from_file_fails(self):
    """A vocabulary file with a repeated term raises `ValueError`."""
    duplicated_terms = ["earth", "wind", "and", "fire", "earth"]
    path = self._write_to_temp_file("repeat_vocab_file", duplicated_terms)

    expected_regex = ".*repeated term.*earth.*"
    with self.assertRaisesRegex(ValueError, expected_regex):
        _ = string_lookup.StringLookup(vocabulary=path)
def test_no_vocab(self):
    """Building and calling a binary-mode layer without a vocabulary raises."""
    expected_regex = "you must set the layer's vocabulary"
    with self.assertRaisesRegex(RuntimeError, expected_regex):
        # Construction and the call both stay inside the context manager.
        vocabless_layer = string_lookup.StringLookup(output_mode="binary")
        vocabless_layer([["a"]])
def __init__(self,
             max_tokens=None,
             standardize="lower_and_strip_punctuation",
             split="whitespace",
             ngrams=None,
             output_mode="int",
             output_sequence_length=None,
             pad_to_max_tokens=False,
             vocabulary=None,
             idf_weights=None,
             sparse=False,
             ragged=False,
             **kwargs):
    """Validate the configuration and create the internal `StringLookup`.

    Args:
      max_tokens: forwarded to the internal `StringLookup`.
      standardize: `None`, a callable, or one of
        `LOWER_AND_STRIP_PUNCTUATION`, `LOWER`, `STRIP_PUNCTUATION`.
      split: `None`, a callable, or one of `WHITESPACE`, `CHARACTER`.
      ngrams: `None`, an int, or a tuple of ints. An int `n` is expanded
        to `(1, ..., n)`.
      output_mode: `None` or one of `INT`, `COUNT`, `MULTI_HOT`, `TF_IDF`.
        The deprecated names `"binary"` and `"tf-idf"` are remapped.
      output_sequence_length: `None` or an int; may only be set when
        `output_mode` is `INT` and `ragged` is false.
      pad_to_max_tokens: forwarded to the internal `StringLookup`.
      vocabulary: forwarded to the internal `StringLookup`.
      idf_weights: forwarded to the internal `StringLookup`.
      sparse: forwarded to the internal `StringLookup`.
      ragged: may only be true when `output_mode` is `INT`.
      **kwargs: forwarded to the base class. `dtype`, if given, must be
        `tf.string`; `has_input_vocabulary` and the deprecated
        `vocabulary_size` are consumed here.

    Raises:
      ValueError: if any argument fails the validation described above.
    """
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != tf.string:
        raise ValueError(
            "`TextVectorization` may only have a dtype of string. "
            f"Received dtype: {kwargs['dtype']}.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = tf.string

    # 'standardize' must be one of
    # (None, LOWER_AND_STRIP_PUNCTUATION, LOWER, STRIP_PUNCTUATION, callable)
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION, LOWER,
                           STRIP_PUNCTUATION),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, WHITESPACE, CHARACTER, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(WHITESPACE, CHARACTER),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT
    if output_mode == "tf-idf":
        output_mode = TF_IDF
    # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(
            "`ngrams` must be None, an integer, or a tuple of "
            f"integers. Received: ngrams={ngrams}")

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT and
            not (isinstance(output_sequence_length, int) or
                 (output_sequence_length is None))):
        raise ValueError(
            "`output_sequence_length` must be either None or an "
            "integer when `output_mode` is 'int'. Received: "
            f"output_sequence_length={output_sequence_length}")

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError(
            "`output_sequence_length` must not be set if `output_mode` is not "
            f"'int'. Received output_sequence_length={output_sequence_length}."
        )

    # The message previously read "if `output_mode` is `'int'`", which is the
    # opposite of the condition being checked here.
    if ragged and output_mode != INT:
        raise ValueError(
            "`ragged` must not be true if `output_mode` is not "
            f"`'int'`. Received: ragged={ragged} and "
            f"output_mode={output_mode}")

    if ragged and output_sequence_length is not None:
        raise ValueError(
            "`output_sequence_length` must not be set if ragged "
            f"is True. Received: ragged={ragged} and "
            f"output_sequence_length={output_sequence_length}")

    self._max_tokens = max_tokens
    self._standardize = standardize
    self._split = split
    # Keep the raw argument alongside the expanded tuple form.
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams
    self._ragged = ragged

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length

    # VocabularySavedModelSaver will clear the config vocabulary to restore
    # the lookup table ops directly. We persist this hidden option to persist
    # the fact that we have a non-adaptable layer with a manually set vocab.
    self._has_input_vocabulary = kwargs.pop("has_input_vocabulary",
                                            (vocabulary is not None))

    # Drop deprecated config options.
    kwargs.pop("vocabulary_size", None)

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell(
        "TextVectorization").set(True)

    # A `None` output mode falls back to INT for the lookup layer.
    self._lookup_layer = string_lookup.StringLookup(
        max_tokens=max_tokens,
        vocabulary=vocabulary,
        idf_weights=idf_weights,
        pad_to_max_tokens=pad_to_max_tokens,
        mask_token="",
        output_mode=output_mode if output_mode is not None else INT,
        sparse=sparse,
        has_input_vocabulary=self._has_input_vocabulary)
def __init__(self,
             max_tokens=None,
             standardize="lower_and_strip_punctuation",
             split="whitespace",
             ngrams=None,
             output_mode="int",
             output_sequence_length=None,
             pad_to_max_tokens=False,
             vocabulary=None,
             **kwargs):
    """Validate the configuration and create the internal `StringLookup`.

    Args:
      max_tokens: forwarded to the internal `StringLookup`.
      standardize: `None`, a callable, or `LOWER_AND_STRIP_PUNCTUATION`.
      split: `None`, a callable, or `SPLIT_ON_WHITESPACE`.
      ngrams: `None`, an int, or a tuple of ints. An int `n` is expanded
        to `(1, ..., n)`.
      output_mode: `None` or one of `INT`, `COUNT`, `MULTI_HOT`, `TF_IDF`.
        The deprecated names `"binary"` and `"tf-idf"` are remapped.
      output_sequence_length: `None` or an int; may only be set when
        `output_mode` is `INT`.
      pad_to_max_tokens: forwarded to the internal `StringLookup`.
      vocabulary: forwarded to the internal `StringLookup`.
      **kwargs: forwarded to the base class. `dtype`, if given, must be
        `tf.string`; the hidden `vocabulary_size` entry is consumed here
        and forwarded to the internal `StringLookup`.

    Raises:
      ValueError: if any argument fails the validation described above.
    """
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != tf.string:
        raise ValueError(
            "TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = tf.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION,
    # callable). The trailing comma matters: without it the parentheses are
    # just grouping and `allowable_strings` is a bare string, so a membership
    # test against it would accept any substring of the constant.
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(SPLIT_ON_WHITESPACE,),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT
    if output_mode == "tf-idf":
        output_mode = TF_IDF
    # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(
            ("`ngrams` must be None, an integer, or a tuple of "
             "integers. Got %s") % (ngrams,))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT and
            not (isinstance(output_sequence_length, int) or
                 (output_sequence_length is None))):
        # Wrap the value in a 1-tuple so that a tuple argument is formatted
        # instead of being unpacked by `%` and raising TypeError.
        raise ValueError(
            "`output_sequence_length` must be either None or an "
            "integer when `output_mode` is 'int'. "
            "Got %s" % (output_sequence_length,))

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError("`output_sequence_length` must not be set if "
                         "`output_mode` is not 'int'.")

    self._max_tokens = max_tokens
    self._standardize = standardize
    self._split = split
    # Keep the raw argument alongside the expanded tuple form.
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length

    # IndexLookup needs to keep track the current vocab size outside of its
    # layer weights. We persist it as a hidden part of the config during
    # serialization.
    vocabulary_size = kwargs.pop("vocabulary_size", 0)

    super(TextVectorization, self).__init__(combiner=None, **kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell(
        "TextVectorization").set(True)

    # A `None` output mode falls back to INT for the lookup layer.
    self._index_lookup_layer = string_lookup.StringLookup(
        max_tokens=max_tokens,
        vocabulary=vocabulary,
        pad_to_max_tokens=pad_to_max_tokens,
        mask_token="",
        output_mode=output_mode if output_mode is not None else INT,
        vocabulary_size=vocabulary_size)
def test_no_vocab(self):
    """Building and calling a layer without a vocabulary raises `ValueError`."""
    expected_regex = "You must set the layer's vocabulary"
    with self.assertRaisesRegex(ValueError, expected_regex):
        # Construction and the call both stay inside the context manager.
        vocabless_layer = string_lookup.StringLookup()
        vocabless_layer([["a"]])
def test_non_unique_vocab_fails(self):
    """An in-memory vocabulary with a repeated term raises `ValueError`."""
    duplicated_terms = ["earth", "wind", "and", "fire", "fire"]
    expected_regex = ".*repeated term.*fire.*"
    with self.assertRaisesRegex(ValueError, expected_regex):
        _ = string_lookup.StringLookup(vocabulary=duplicated_terms)