def define_kpls_for_training(self, use_adapt):
    """Build the feature and label preprocessing models used for training.

    Args:
        use_adapt: When true, each lookup layer is created without a
            vocabulary and then `adapt()`-ed on it; otherwise the vocabulary
            is handed to the layer constructor.

    Returns:
        A `(feature_ps, label_ps)` tuple of Keras models, each wrapping a
        single `StringLookup` layer.
    """

    # Define KPLs under strategy's scope. Right now, if they have look up
    # tables, they will be created on the client. Their variables will be
    # created on PS. Ideally they should be cached on each worker since they
    # will not be changed in a training step.
    def build_lookup(vocab, **layer_kwargs):
        # One layer at a time, so creation/adapt order matches call order.
        if use_adapt:
            lookup = string_lookup.StringLookup(**layer_kwargs)
            lookup.adapt(vocab)
            return lookup
        return string_lookup.StringLookup(vocabulary=vocab, **layer_kwargs)

    feature_lookup = build_lookup(FEATURE_VOCAB, num_oov_indices=1)
    label_lookup = build_lookup(
        LABEL_VOCAB, num_oov_indices=0, mask_token=None)

    feature_in = keras.layers.Input(
        shape=(3,), dtype=dtypes.string, name="feature", ragged=True)
    feature_ids = feature_lookup(feature_in)
    # Model creates variables as well.
    feature_ps = keras.Model({"features": feature_in}, feature_ids)

    label_in = keras.layers.Input(shape=(), dtype=dtypes.string, name="label")
    label_ids = label_lookup(label_in)
    label_ps = keras.Model({"label": label_in}, label_ids)

    return feature_ps, label_ps
def test_get_vocab_returns_str(self):
    """`get_vocabulary()` returns `str` tokens for forward and inverse layers."""
    tokens = ["earth", "wind", "and", "fire"]
    want = ["[UNK]", "earth", "wind", "and", "fire"]

    # Forward (string -> index) layer.
    forward = string_lookup.StringLookup(vocabulary=tokens)
    forward_vocab = forward.get_vocabulary()
    self.assertAllEqual(want, forward_vocab)
    self.assertIsInstance(forward_vocab[0], str)

    # Inverse (index -> string) layer seeded from the forward vocabulary.
    inverse = string_lookup.StringLookup(
        vocabulary=forward.get_vocabulary(), invert=True)
    inverse_vocab = inverse.get_vocabulary()
    self.assertAllEqual(want, inverse_vocab)
    self.assertIsInstance(inverse_vocab[0], str)
def test_non_unique_vocab_from_file_fails(self):
    """A vocabulary file with a repeated term is rejected at construction."""
    repeated_terms = ["earth", "wind", "and", "fire", "earth"]
    path = self._write_to_temp_file("repeat_vocab_file", repeated_terms)
    expected_message = "HashTable has different value for same key.*earth"

    with self.assertRaisesRegex(errors_impl.FailedPreconditionError,
                                expected_message):
        string_lookup.StringLookup(vocabulary=path)
def test_forward_backward_explicit_vocab(self):
    """Round-trip strings through a lookup layer and its inverse.

    The out-of-vocabulary input "michigan" is expected back as "[UNK]".
    """
    vocabulary = ["earth", "wind", "and", "fire"]
    raw_strings = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    round_tripped = np.array([["earth", "wind", "and", "fire"],
                              ["fire", "and", "earth", "[UNK]"]])

    string_input = keras.Input(shape=(None,), dtype=dtypes.string)
    to_ids = string_lookup.StringLookup(vocabulary=vocabulary)
    to_strings = string_lookup.StringLookup(vocabulary=vocabulary, invert=True)
    ids = to_ids(string_input)
    decoded = to_strings(ids)

    model = keras.Model(inputs=string_input, outputs=decoded)
    self.assertAllEqual(round_tripped, model.predict(raw_strings))
def embedding_varlen(self, batch_size, max_length):
    """Benchmark a variable-length embedding.

    Args:
        batch_size: Batch size handed to both benchmark runners.
        max_length: Maximum sequence length of the generated string data.

    Returns:
        A `(keras_avg_time, fc_avg_time)` tuple as reported by
        `fc_bm.run_keras` and `fc_bm.run_fc`.
    """
    # Data and constants.
    vocabulary = fc_bm.create_vocabulary(32768)
    vocab_file = self._write_to_temp_file("tmp", vocabulary)
    string_data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocabulary, pct_oov=0.15)

    # Keras implementation
    keras_model = keras.Sequential()
    keras_model.add(
        keras.Input(
            shape=(max_length,), name="data", ragged=True, dtype=dt.string))
    keras_model.add(
        string_lookup.StringLookup(vocabulary=vocab_file, mask_token=None))

    # FC implementation
    column = sfc.sequence_categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocabulary, num_oov_buckets=1)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        column.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    k_avg_time = fc_bm.run_keras(
        {"data": string_data}, keras_model, batch_size, NUM_REPEATS)
    fc_avg_time = fc_bm.run_fc(
        {"data": string_data.to_sparse()}, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def test_sparse_output(self):
    """A multi-hot layer built with `sparse=True` yields a sparse Keras tensor."""
    vocab_data = ["earth", "wind", "and", "fire"]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(
        vocabulary=vocab_data, output_mode="multi_hot", sparse=True)
    res = layer(input_data)

    # `assertTrue(value, msg)` only checks that `value` is truthy and uses the
    # second argument as the failure message, so it can never fail for a
    # non-empty class name. Compare the name explicitly instead.
    self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
def define_reverse_lookup_layer(self):
    """Return an inverted `StringLookup` over `LABEL_VOCAB`.

    Only needed for serving.
    """
    return string_lookup.StringLookup(
        vocabulary=LABEL_VOCAB,
        invert=True,
        num_oov_indices=1,
        mask_token=None,
    )
def define_reverse_lookup_layer(self):
    """Create string reverse lookup layer for serving.

    Returns:
        An inverted `StringLookup` layer built from `self.LABEL_VOCAB`.
    """
    reverse_lookup_kwargs = {
        "num_oov_indices": 1,
        "mask_token": None,
        "vocabulary": self.LABEL_VOCAB,
        "invert": True,
    }
    return string_lookup.StringLookup(**reverse_lookup_kwargs)
def dataset_fn(input_context):
    """Build a shuffled, repeated, batched dataset with looked-up features.

    Args:
        input_context: Ignored.

    Returns:
        A dataset of `(lookup_layer(features), labels)` batches of size 2.
    """
    del input_context  # Unused.

    lookup = string_lookup.StringLookup(num_oov_indices=1, vocabulary=filepath)
    features = np.array([["earth", "wind", "and", "fire"],
                         ["fire", "and", "earth", "michigan"]])
    labels = np.array([0, 1])

    def apply_lookup(x, y):
        # Only the features go through the lookup layer.
        return lookup(x), y

    dataset = dataset_ops.DatasetV2.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(10).repeat().batch(2)
    return dataset.map(apply_lookup)
def define_kpls_for_training(self, use_adapt):
    """Function that defines KPL used for unit tests of tf.distribute.

    Args:
        use_adapt: if adapt will be called. False means there will be
            precomputed statistics.

    Returns:
        feature_mapper: a simple keras model with one keras StringLookup
            layer which maps feature to index.
        label_mapper: similar to feature_mapper, but maps label to index.
    """
    feature_kwargs = {"num_oov_indices": 1}
    label_kwargs = {"num_oov_indices": 0, "mask_token": None}
    if not use_adapt:
        # Precomputed vocabulary: hand it straight to the constructors.
        feature_kwargs["vocabulary"] = self.FEATURE_VOCAB
        label_kwargs["vocabulary"] = self.LABEL_VOCAB

    feature_lookup = string_lookup.StringLookup(**feature_kwargs)
    if use_adapt:
        feature_lookup.adapt(self.FEATURE_VOCAB)
    label_lookup = string_lookup.StringLookup(**label_kwargs)
    if use_adapt:
        label_lookup.adapt(self.LABEL_VOCAB)

    feature_in = keras.layers.Input(
        shape=(3,), dtype=dtypes.string, name="feature", ragged=True)
    feature_ids = feature_lookup(feature_in)
    feature_mapper = keras.Model({"features": feature_in}, feature_ids)

    label_in = keras.layers.Input(
        shape=(1,), dtype=dtypes.string, name="label")
    label_ids = label_lookup(label_in)
    label_mapper = keras.Model({"label": label_in}, label_ids)

    return feature_mapper, label_mapper
def test_tensor_vocab(self):
    """A tensor vocabulary works eagerly but cannot be set inside a function."""
    tokens = ["[UNK]", "wind", "and", "fire"]
    token_tensor = constant_op.constant(tokens)

    layer = string_lookup.StringLookup(vocabulary=token_tensor)
    self.assertAllEqual(tokens, layer.get_vocabulary())
    self.assertAllEqual(layer.vocabulary_size(), 4)

    @def_function.function
    def set_vocab_in_function():
        return layer.set_vocabulary(token_tensor)

    with self.assertRaisesRegex(RuntimeError, "Cannot set a tensor vocabulary"):
        set_vocab_in_function()
def test_inverse_layer(self):
    """An inverted layer maps indices back to the vocabulary strings."""
    vocabulary = ["earth", "wind", "and", "fire"]
    indices = np.array([[2, 3, 4, 5], [5, 4, 2, 0]])
    want = np.array([["earth", "wind", "and", "fire"],
                     ["fire", "and", "earth", ""]])

    index_input = keras.Input(shape=(None,), dtype=dtypes.int64)
    inverse = string_lookup.StringLookup(vocabulary=vocabulary, invert=True)
    decoded = inverse(index_input)

    model = keras.Model(inputs=index_input, outputs=decoded)
    self.assertAllEqual(want, model.predict(indices))
def test_int_output_explicit_vocab_with_special_tokens(self):
    """A vocabulary that already lists the mask and OOV tokens is honoured."""
    vocabulary = ["", "[UNK]", "earth", "wind", "and", "fire"]
    strings = np.array([["earth", "wind", "and", "fire"],
                        ["fire", "and", "earth", "michigan"]])
    want = [[2, 3, 4, 5], [5, 4, 2, 1]]

    string_input = keras.Input(shape=(None,), dtype=dtypes.string)
    lookup = string_lookup.StringLookup(vocabulary=vocabulary, mask_token="")
    ids = lookup(string_input)

    model = keras.Model(inputs=string_input, outputs=ids)
    self.assertAllEqual(want, model.predict(strings))
def test_int_output_explicit_vocab(self):
    """Integer output with an explicit list vocabulary; OOV maps to index 0."""
    vocabulary = ["earth", "wind", "and", "fire"]
    strings = np.array([["earth", "wind", "and", "fire"],
                        ["fire", "and", "earth", "michigan"]])
    want = [[1, 2, 3, 4], [4, 3, 1, 0]]

    string_input = keras.Input(shape=(None,), dtype=dtypes.string)
    lookup = string_lookup.StringLookup(vocabulary=vocabulary)
    ids = lookup(string_input)

    model = keras.Model(inputs=string_input, outputs=ids)
    self.assertAllEqual(want, model.predict(strings))
def test_count_output(self):
    """`output_mode="count"` produces per-token occurrence counts."""
    vocabulary = ["earth", "wind", "and", "fire"]
    strings = np.array([["earth", "earth", "fire", "fire"],
                        ["fire", "and", "earth", "michigan"]])
    want = [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]]

    string_input = keras.Input(shape=(None,), dtype=dtypes.string)
    lookup = string_lookup.StringLookup(
        vocabulary=vocabulary, output_mode="count")
    counts = lookup(string_input)

    model = keras.Model(inputs=string_input, outputs=counts)
    self.assertAllEqual(want, model.predict(strings))
def test_inverse_layer_from_file(self):
    """An inverted layer can take its vocabulary from a file path."""
    vocabulary = ["earth", "wind", "and", "fire"]
    indices = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
    want = np.array([["earth", "wind", "and", "fire"],
                     ["fire", "and", "earth", "[UNK]"]])
    vocab_file = self._write_to_temp_file("vocab_file", vocabulary)

    index_input = keras.Input(shape=(None,), dtype=dtypes.int64)
    inverse = string_lookup.StringLookup(vocabulary=vocab_file, invert=True)
    decoded = inverse(index_input)

    model = keras.Model(inputs=index_input, outputs=decoded)
    self.assertAllEqual(want, model.predict(indices))
def test_int_output_explicit_vocab_from_file(self):
    """Integer output when the vocabulary is read from a file path."""
    tokens = ["earth", "wind", "and", "fire"]
    vocab_file = self._write_to_temp_file("vocab_file", tokens)
    strings = np.array([["earth", "wind", "and", "fire"],
                        ["fire", "and", "earth", "michigan"]])
    want = [[2, 3, 4, 5], [5, 4, 2, 1]]

    string_input = keras.Input(shape=(None,), dtype=dtypes.string)
    lookup = string_lookup.StringLookup(vocabulary=vocab_file)
    ids = lookup(string_input)

    model = keras.Model(inputs=string_input, outputs=ids)
    self.assertAllEqual(want, model.predict(strings))
def test_ragged_string_input_multi_bucket(self):
    """Ragged string input with two OOV indices and a vocabulary set later."""
    vocabulary = ["earth", "wind", "and", "fire"]
    ragged_strings = ragged_factory_ops.constant(
        [["earth", "wind", "fire"], ["fire", "and", "earth", "ohio"]])
    want = [[2, 3, 5], [5, 4, 2, 1]]

    string_input = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
    lookup = string_lookup.StringLookup(num_oov_indices=2)
    lookup.set_vocabulary(vocabulary)
    ids = lookup(string_input)

    model = keras.Model(inputs=string_input, outputs=ids)
    self.assertAllEqual(want, model.predict(ragged_strings))
def test_int_output_no_oov(self):
    """With `num_oov_indices=0`, unknown tokens raise instead of mapping."""
    vocabulary = ["earth", "wind", "and", "fire"]
    in_vocab_strings = np.array([["earth", "wind", "and", "fire"],
                                 ["fire", "and", "earth", ""]])
    oov_strings = np.array([["earth", "wind", "and", "michigan"],
                            ["fire", "and", "earth", "michigan"]])
    want = [[1, 2, 3, 4], [4, 3, 1, 0]]

    string_input = keras.Input(shape=(None,), dtype=dtypes.string)
    lookup = string_lookup.StringLookup(
        vocabulary=vocabulary, mask_token="", num_oov_indices=0)
    ids = lookup(string_input)
    model = keras.Model(inputs=string_input, outputs=ids)

    # Every token is either in the vocabulary or the mask token.
    self.assertAllEqual(want, model.predict(in_vocab_strings))

    # "michigan" is not in the vocabulary and there is no OOV index.
    with self.assertRaisesRegex(errors.InvalidArgumentError,
                                "found OOV values.*michigan"):
        model.predict(oov_strings)
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding.

    Args:
        batch_size: Batch size handed to both benchmark runners.
        max_length: Maximum sequence length of the generated string data.

    Returns:
        A `(keras_avg_time, fc_avg_time)` tuple as reported by
        `fc_bm.run_keras` and `fc_bm.run_fc`.
    """
    # Data and constants.
    vocab_size = 32768
    vocabulary = fc_bm.create_vocabulary(vocab_size)
    string_data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocabulary, pct_oov=0.15)

    def dense_batch():
        # Densify the generated data, padding with empty strings.
        return string_data.to_tensor(
            default_value="", shape=(batch_size, max_length))

    # Keras implementation
    keras_model = keras.Sequential()
    keras_model.add(
        keras.Input(shape=(max_length,), name="data", dtype=dt.string))
    keras_model.add(
        string_lookup.StringLookup(vocabulary=vocabulary, mask_token=None))
    keras_model.add(
        category_encoding.CategoryEncoding(
            num_tokens=vocab_size + 1, output_mode="count"))

    # FC implementation
    categorical = fcv2.categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocabulary, num_oov_buckets=1)
    column = fcv2.indicator_column(categorical)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        column.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    k_avg_time = fc_bm.run_keras(
        {"data": dense_batch()}, keras_model, batch_size, NUM_REPEATS)
    fc_avg_time = fc_bm.run_fc(
        {"data": dense_batch()}, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def testTrainAndServe(self):
    """Train a small model through `self.client`, export it, and serve it.

    The test builds two `StringLookup`-based preprocessing models, feeds a
    generated dataset through them, schedules ten training calls on
    `self.client`, then saves the model with a custom serving signature and
    checks that the reloaded signature answers "yes" or "no".
    """
    # These vocabularies usually come from TFT or a Beam pipeline.
    feature_vocab = [
        "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong",
        "wonder_woman"
    ]
    label_vocab = ["yes", "no"]

    # NOTE(review): the extent of this scope was reconstructed from a
    # flattened source; the model, optimizer and metric further down are
    # placed outside it here -- confirm against the original layout.
    with self.client.strategy.scope():
        # Define KPLs under strategy's scope. Right now, if they have look up
        # tables, they will be created on the client. Their variables will be
        # created on PS. Ideally they should be cached on each worker since they
        # will not be changed in a training step.
        feature_lookup_layer = string_lookup.StringLookup()
        raw_feature_input = keras.layers.Input(shape=(3, ),
                                               dtype=dtypes.string,
                                               name="feature",
                                               ragged=True)
        feature_id_input = feature_lookup_layer(raw_feature_input)

        # Model creates variables as well.
        feature_ps = keras.Model({"features": raw_feature_input},
                                 feature_id_input)

        # TODO(yuefengz): adapt may be expensive for large vocab?
        feature_lookup_layer.adapt(feature_vocab)

        label_lookup_layer = string_lookup.StringLookup(num_oov_indices=0,
                                                        mask_token=None)
        raw_label_input = keras.layers.Input(shape=(),
                                             dtype=dtypes.string,
                                             name="label")
        label_id_input = label_lookup_layer(raw_label_input)
        label_ps = keras.Model({"label": raw_label_input}, label_id_input)

        label_lookup_layer.adapt(label_vocab)

        # Only needed for serving.
        label_inverse_lookup_layer = string_lookup.StringLookup(
            num_oov_indices=1,
            mask_token=None,
            vocabulary=label_lookup_layer.get_vocabulary(),
            invert=True)

    def dataset_fn():
        # Endless generator: three distinct random features per example; the
        # label is "yes" exactly when "avenger" is among them.
        def feature_and_label_gen():
            while True:
                features = random.sample(feature_vocab, 3)
                label = "yes" if "avenger" in features else "no"
                yield {"features": features, "label": label}

        # The dataset will be created on the client?
        raw_dataset = dataset_ops.Dataset.from_generator(
            feature_and_label_gen,
            output_types={
                "features": dtypes.string,
                "label": dtypes.string
            }).shuffle(200).batch(32)
        # Run both raw string columns through the lookup models.
        preproc_dataset = raw_dataset.map(
            lambda x: {  # pylint: disable=g-long-lambda
                "features": feature_ps(x["features"]),
                "label": label_ps(x["label"])
            })
        # Reshape each element into an (inputs-dict, [label]) pair.
        train_dataset = preproc_dataset.map(lambda x: (  # pylint: disable=g-long-lambda
            {
                "features": x["features"]
            }, [x["label"]]))
        return train_dataset

    distributed_dataset = self.client.create_per_worker_dataset(
        dataset_fn)

    # Model: embedding over the feature ids, mean over axis 1, then a single
    # sigmoid unit.
    model_input = keras.layers.Input(shape=(3, ),
                                     dtype=dtypes.int64,
                                     name="model_input")
    emb_output = keras.layers.Embedding(input_dim=len(
        feature_lookup_layer.get_vocabulary()),
                                        output_dim=20)(model_input)
    emb_output = math_ops.reduce_mean(emb_output, axis=1)
    dense_output = keras.layers.Dense(units=1,
                                      activation="sigmoid")(emb_output)
    model = keras.Model({"features": model_input}, dense_output)

    optimizer = rmsprop.RMSprop(learning_rate=0.01)
    accuracy = keras.metrics.Accuracy()

    @def_function.function
    def worker_fn(iterator):
        def train_step(iterator):
            batch_data, labels = next(iterator)
            with backprop.GradientTape() as tape:
                pred = model(batch_data, training=True)
                # Per-example loss (reduction NONE) averaged by
                # `compute_average_loss`.
                loss = nn.compute_average_loss(
                    keras.losses.BinaryCrossentropy(
                        reduction=loss_reduction.ReductionV2.NONE)(
                            labels, pred))
                gradients = tape.gradient(loss, model.trainable_variables)

            optimizer.apply_gradients(
                zip(gradients, model.trainable_variables))

            # Threshold the sigmoid output at 0.5 to get a 0/1 prediction.
            actual_pred = math_ops.cast(math_ops.greater(pred, 0.5),
                                        dtypes.int64)
            accuracy.update_state(labels, actual_pred)

        self.client._strategy.run(train_step, args=(iterator, ))

    distributed_iterator = iter(distributed_dataset)
    # Schedule ten calls of `worker_fn`, then wait for all of them.
    for _ in range(10):
        self.client.schedule(worker_fn, args=(distributed_iterator, ))
    self.client.join()
    self.assertGreater(accuracy.result().numpy(), 0.0)

    # Create a saved model.
    # Attach the preprocessing pieces to the model so the serving function
    # can reach them through `model`.
    model.feature_ps = feature_ps
    model.label_ps = label_ps
    model.label_inverse_lookup_layer = label_inverse_lookup_layer

    def create_serving_signature(model):
        @def_function.function
        def serve_fn(raw_features):
            # Add a leading batch dimension for the model, remove it again
            # before returning.
            raw_features = array_ops.expand_dims(raw_features, axis=0)
            transformed_features = model.feature_ps(raw_features)
            outputs = model(transformed_features)
            outputs = array_ops.squeeze(outputs, axis=0)
            outputs = math_ops.cast(math_ops.greater(outputs, 0.5),
                                    dtypes.int64)
            decoded_outputs = model.label_inverse_lookup_layer(outputs)
            return array_ops.squeeze(decoded_outputs, axis=0)

        # serving does NOT have batch dimension
        return serve_fn.get_concrete_function(
            tensor_spec.TensorSpec(shape=(3),
                                   dtype=dtypes.string,
                                   name="example"))

    serving_fn = create_serving_signature(model)

    saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    model.save(saved_model_dir, signatures={"serving_default": serving_fn})

    # Test the saved_model.
    loaded_serving_fn = keras.saving.save.load_model(
        saved_model_dir).signatures["serving_default"]

    # check the result w/ and w/o avenger.
    prediction0 = loaded_serving_fn(
        constant_op.constant(["avenger", "ironman", "avenger"]))["output_0"]
    self.assertIn(prediction0, ("yes", "no"))

    prediction1 = loaded_serving_fn(
        constant_op.constant(["ironman", "ironman", "unkonwn"]))["output_0"]
    self.assertIn(prediction1, ("yes", "no"))
def test_non_unique_vocab_from_file_fails(self):
    """A vocabulary file with a repeated term raises a ValueError."""
    repeated_terms = ["earth", "wind", "and", "fire", "earth"]
    path = self._write_to_temp_file("repeat_vocab_file", repeated_terms)
    expected_message = ".*repeated term.*earth.*"

    with self.assertRaisesRegex(ValueError, expected_message):
        string_lookup.StringLookup(vocabulary=path)
def test_no_vocab(self):
    """Using a layer that has no vocabulary raises a ValueError."""
    expected_message = "You must set the layer's vocabulary"
    with self.assertRaisesRegex(ValueError, expected_message):
        empty_layer = string_lookup.StringLookup()
        empty_layer([["a"]])
def __init__(self,
             max_tokens=None,
             standardize=LOWER_AND_STRIP_PUNCTUATION,
             split=SPLIT_ON_WHITESPACE,
             ngrams=None,
             output_mode=INT,
             output_sequence_length=None,
             pad_to_max_tokens=False,
             vocabulary=None,
             **kwargs):
    """Validate the configuration and build the internal lookup layer.

    Args:
        max_tokens: Stored on the layer and forwarded to the internal
            `StringLookup` layer.
        standardize: `None`, `LOWER_AND_STRIP_PUNCTUATION`, or a callable.
        split: `None`, `SPLIT_ON_WHITESPACE`, or a callable.
        ngrams: `None`, an integer, or a tuple of integers. An integer `n`
            is expanded to `(1, ..., n)`.
        output_mode: `None` or one of `INT`, `COUNT`, `MULTI_HOT`, `TF_IDF`.
            The deprecated names "binary" and "tf-idf" are mapped to
            `MULTI_HOT` and `TF_IDF`. `None` makes the internal lookup
            layer use `INT`.
        output_sequence_length: `None` or an integer; may only be set when
            `output_mode` is `INT`.
        pad_to_max_tokens: Forwarded to the internal `StringLookup` layer.
        vocabulary: Forwarded to the internal `StringLookup` layer.
        **kwargs: Passed to the base class constructor. `dtype`, if given,
            must be string. `vocabulary_size` is consumed here and
            forwarded to the internal lookup layer.

    Raises:
        ValueError: If `dtype` is not string, if `ngrams` has an
            unsupported type, or if `output_sequence_length` is
            inconsistent with `output_mode`. The string-argument checks
            done through `layer_utils.validate_string_arg` are expected to
            raise for unsupported values as well.
    """
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
        raise ValueError(
            "TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
    # The trailing comma matters: without it the parentheses wrap a plain
    # string rather than a one-element tuple, so a membership test against
    # it would match substrings of the constant.
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(SPLIT_ON_WHITESPACE,),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT
    if output_mode == "tf-idf":
        output_mode = TF_IDF
    # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or isinstance(ngrams, int)
            or isinstance(ngrams, tuple)
            and all(isinstance(item, int) for item in ngrams)):
        raise ValueError(
            ("`ngrams` must be None, an integer, or a tuple of "
             "integers. Got %s") % (ngrams, ))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT
            and not (isinstance(output_sequence_length, int) or
                     (output_sequence_length is None))):
        raise ValueError(
            "`output_sequence_length` must be either None or an "
            "integer when `output_mode` is 'int'. "
            "Got %s" % output_sequence_length)

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError("`output_sequence_length` must not be set if "
                         "`output_mode` is not 'int'.")

    self._max_tokens = max_tokens
    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    # An integer n means "all n-gram sizes from 1 through n".
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length

    # IndexLookup needs to keep track the current vocab size outside of its
    # layer weights. We persist it as a hidden part of the config during
    # serialization. It is removed from kwargs before they reach the base
    # class constructor.
    vocabulary_size = kwargs.pop("vocabulary_size", 0)

    super(TextVectorization, self).__init__(combiner=None, **kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell(
        "TextVectorization").set(True)

    self._index_lookup_layer = string_lookup.StringLookup(
        max_tokens=max_tokens,
        vocabulary=vocabulary,
        pad_to_max_tokens=pad_to_max_tokens,
        output_mode=output_mode if output_mode is not None else INT,
        vocabulary_size=vocabulary_size)
def test_non_unique_vocab_fails(self):
    """A list vocabulary with a repeated term raises a ValueError."""
    repeated_terms = ["earth", "wind", "and", "fire", "fire"]
    expected_message = ".*repeated term.*fire.*"

    with self.assertRaisesRegex(ValueError, expected_message):
        string_lookup.StringLookup(vocabulary=repeated_terms)