def define_kpls_for_training(self, use_adapt):
        # Define KPLs under the strategy's scope. Currently, if they have
        # lookup tables, the tables are created on the client while their
        # variables are created on the PS. Ideally they should be cached on
        # each worker, since they do not change during a training step.
        if use_adapt:
            feature_lookup_layer = string_lookup.StringLookup(
                num_oov_indices=1)
            feature_lookup_layer.adapt(FEATURE_VOCAB)
            label_lookup_layer = string_lookup.StringLookup(num_oov_indices=0,
                                                            mask_token=None)
            label_lookup_layer.adapt(LABEL_VOCAB)
        else:
            feature_lookup_layer = string_lookup.StringLookup(
                vocabulary=FEATURE_VOCAB, num_oov_indices=1)
            label_lookup_layer = string_lookup.StringLookup(
                vocabulary=LABEL_VOCAB, num_oov_indices=0, mask_token=None)

        raw_feature_input = keras.layers.Input(shape=(3, ),
                                               dtype=tf.string,
                                               name="feature",
                                               ragged=True)
        feature_id_input = feature_lookup_layer(raw_feature_input)

        # Model creates variables as well.
        feature_ps = keras.Model({"features": raw_feature_input},
                                 feature_id_input)

        raw_label_input = keras.layers.Input(shape=(1, ),
                                             dtype=tf.string,
                                             name="label")
        label_id_input = label_lookup_layer(raw_label_input)
        label_ps = keras.Model({"label": raw_label_input}, label_id_input)

        return feature_ps, label_ps
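A minimal usage sketch for the mappers returned above. FEATURE_VOCAB and LABEL_VOCAB are assumed to be module-level string lists, and the sample values below are hypothetical:

# Inside the test class; OOV features map to the "[UNK]" index under the
# defaults used here (num_oov_indices=1).
feature_ps, label_ps = self.define_kpls_for_training(use_adapt=False)
features = tf.ragged.constant([["earth", "wind", "unseen"]])
print(feature_ps(features))              # ragged int ids; "unseen" -> OOV id
print(label_ps(tf.constant([["yes"]])))  # label id; no OOV index is reserved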
Example #2
    def test_get_vocab_returns_str(self):
        vocab_data = ["earth", "wind", "and", "fire"]
        expected_vocab = ["", "[UNK]", "earth", "wind", "and", "fire"]
        layer = string_lookup.StringLookup(vocabulary=vocab_data)
        layer_vocab = layer.get_vocabulary()
        self.assertAllEqual(expected_vocab, layer_vocab)
        self.assertIsInstance(layer_vocab[0], str)

        inverse_layer = string_lookup.StringLookup(
            vocabulary=layer.get_vocabulary(), invert=True)
        layer_vocab = inverse_layer.get_vocabulary()
        self.assertAllEqual(expected_vocab, layer_vocab)
        self.assertIsInstance(layer_vocab[0], str)
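The leading special tokens come from the layer defaults in this Keras version (mask_token="" and num_oov_indices=1). A sketch of the resulting index layout:

layer = string_lookup.StringLookup(vocabulary=["earth", "wind"])
# index 0 -> "" (mask token), index 1 -> "[UNK]" (OOV token),
# index 2.. -> the supplied terms
print(layer.get_vocabulary())  # ['', '[UNK]', 'earth', 'wind']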
Example #3
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  vocab = fc_bm.create_vocabulary(32768)
  data = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15)

  # Keras implementation
  model = keras.Sequential()
  model.add(
      keras.Input(
          shape=(max_length,), name="data", ragged=True, dtype=tf.string))
  model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))

  # FC implementation
  fc = tf.feature_column.sequence_categorical_column_with_vocabulary_list(
      key="data", vocabulary_list=vocab, num_oov_buckets=1)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    cache = tf.__internal__.feature_column.FeatureTransformationCache(tensors)
    fc.transform_feature(cache, None)

  # Benchmark runs
  keras_data = {"data": data}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_sparse()}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
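A hypothetical driver for the benchmark above (NUM_REPEATS and fc_bm are module-level helpers from the surrounding benchmark harness):

k_time, fc_time = embedding_varlen(batch_size=32, max_length=64)
print(f"keras: {k_time:.4f}s  feature_column: {fc_time:.4f}s")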
Example #4
    def test_forward_backward_explicit_vocab(self):
        vocab_data = ["earth", "wind", "and", "fire"]
        input_array = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "michigan"]])
        expected_output = np.array([["earth", "wind", "and", "fire"],
                                    ["fire", "and", "earth", "[UNK]"]])

        input_data = keras.Input(shape=(None, ), dtype=tf.string)
        layer = string_lookup.StringLookup(vocabulary=vocab_data)
        invert_layer = string_lookup.StringLookup(vocabulary=vocab_data,
                                                  invert=True)
        int_data = layer(input_data)
        out_data = invert_layer(int_data)
        model = keras.Model(inputs=input_data, outputs=out_data)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
Example #5
 def test_non_unique_vocab_from_file_fails(self):
     vocab_list = ["earth", "wind", "and", "fire", "earth"]
     vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
     with self.assertRaisesRegex(
             tf.errors.FailedPreconditionError,
             "HashTable has different value for same key.*earth"):
         _ = string_lookup.StringLookup(vocabulary=vocab_path)
Example #6
  def test_sparse_output(self):
    vocab_data = ["earth", "wind", "and", "fire"]

    input_data = keras.Input(shape=(None,), dtype=tf.string)
    layer = string_lookup.StringLookup(
        vocabulary=vocab_data, output_mode="multi_hot", sparse=True)
    res = layer(input_data)
    self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
Example #7
 def define_reverse_lookup_layer(self):
     # Only needed for serving.
     label_inverse_lookup_layer = string_lookup.StringLookup(
         num_oov_indices=0,
         mask_token=None,
         vocabulary=LABEL_VOCAB,
         invert=True)
     return label_inverse_lookup_layer
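A serving-time sketch for the inverse lookup: with num_oov_indices=0 and mask_token=None, index i maps straight back to LABEL_VOCAB[i]. The two-class vocabulary below is hypothetical:

inverse_layer = string_lookup.StringLookup(
    num_oov_indices=0, mask_token=None, vocabulary=["yes", "no"], invert=True)
print(inverse_layer(tf.constant([[0], [1]])))  # [[b'yes'], [b'no']]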
Example #8
    def define_reverse_lookup_layer(self):
        """Create string reverse lookup layer for serving."""

        label_inverse_lookup_layer = string_lookup.StringLookup(
            num_oov_indices=0,
            mask_token=None,
            vocabulary=self.LABEL_VOCAB,
            invert=True)
        return label_inverse_lookup_layer
Example #9
 def test_tensor_vocab(self):
   vocab_data = ["[UNK]", "wind", "and", "fire"]
   vocab_tensor = tf.constant(vocab_data)
   layer = string_lookup.StringLookup(vocabulary=vocab_tensor)
   returned_vocab = layer.get_vocabulary()
   self.assertAllEqual(vocab_data, returned_vocab)
   self.assertAllEqual(layer.vocabulary_size(), 4)
   fn = tf.function(lambda: layer.set_vocabulary(vocab_tensor))
   with self.assertRaisesRegex(RuntimeError, "Cannot set a tensor vocabulary"):
     fn()
Example #10
 def dataset_fn(input_context):
   del input_context
   lookup_layer = string_lookup.StringLookup(
       num_oov_indices=1, vocabulary=filepath)
   x = np.array([["earth", "wind", "and", "fire"],
                 ["fire", "and", "earth", "michigan"]])
   y = np.array([0, 1])
   map_fn = lambda x, y: (lookup_layer(x), y)
   return tf.data.Dataset.from_tensor_slices(
       (x, y)).shuffle(10).repeat().batch(2).map(map_fn)
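A hedged sketch of how such a dataset_fn is consumed by tf.distribute in newer TF versions (filepath is assumed to point at a vocabulary file created elsewhere in the test):

strategy = tf.distribute.MirroredStrategy()
dist_dataset = strategy.distribute_datasets_from_function(dataset_fn)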
Example #11
    def define_kpls_for_training(self, use_adapt):
        """Function that defines KPL used for unit tests of tf.distribute.

    Args:
      use_adapt: if adapt will be called. False means there will be precomputed
        statistics.

    Returns:
      feature_mapper: a simple keras model with one keras StringLookup layer
      which maps feature to index.
      label_mapper: similar to feature_mapper, but maps label to index.

    """
        if use_adapt:
            feature_lookup_layer = (string_lookup.StringLookup(
                num_oov_indices=1))
            feature_lookup_layer.adapt(self.FEATURE_VOCAB)
            label_lookup_layer = (string_lookup.StringLookup(num_oov_indices=0,
                                                             mask_token=None))
            label_lookup_layer.adapt(self.LABEL_VOCAB)
        else:
            feature_lookup_layer = (string_lookup.StringLookup(
                vocabulary=self.FEATURE_VOCAB, num_oov_indices=1))
            label_lookup_layer = (string_lookup.StringLookup(
                vocabulary=self.LABEL_VOCAB,
                num_oov_indices=0,
                mask_token=None))

        raw_feature_input = keras.layers.Input(shape=(3, ),
                                               dtype=tf.string,
                                               name="feature",
                                               ragged=True)
        feature_id_input = feature_lookup_layer(raw_feature_input)
        feature_mapper = keras.Model({"features": raw_feature_input},
                                     feature_id_input)

        raw_label_input = keras.layers.Input(shape=(1, ),
                                             dtype=tf.string,
                                             name="label")
        label_id_input = label_lookup_layer(raw_label_input)
        label_mapper = keras.Model({"label": raw_label_input}, label_id_input)

        return feature_mapper, label_mapper
Example #12
    def test_inverse_layer(self):
        vocab_data = ["earth", "wind", "and", "fire"]
        input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]])
        expected_output = np.array([["earth", "wind", "and", "fire"],
                                    ["fire", "and", "earth", ""]])

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = string_lookup.StringLookup(vocabulary=vocab_data, invert=True)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
Example #13
    def test_int_output_explicit_vocab_with_special_tokens(self):
        vocab_data = ["", "[UNK]", "earth", "wind", "and", "fire"]
        input_array = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "michigan"]])
        expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

        input_data = keras.Input(shape=(None, ), dtype=tf.string)
        layer = string_lookup.StringLookup(vocabulary=vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
Example #14
    def test_inverse_layer_from_file(self):
        vocab_data = ["earth", "wind", "and", "fire"]
        input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
        expected_output = np.array([["earth", "wind", "and", "fire"],
                                    ["fire", "and", "earth", "[UNK]"]])
        vocab_path = self._write_to_temp_file("vocab_file", vocab_data)

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = string_lookup.StringLookup(vocabulary=vocab_path, invert=True)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
Example #15
    def test_count_output(self):
        vocab_data = ["earth", "wind", "and", "fire"]
        input_array = np.array([["earth", "earth", "fire", "fire"],
                                ["fire", "and", "earth", "michigan"]])
        expected_output = [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]]

        input_data = keras.Input(shape=(None, ), dtype=tf.string)
        layer = string_lookup.StringLookup(vocabulary=vocab_data,
                                           output_mode="count")
        res = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=res)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
Example #16
    def test_ragged_string_input_multi_bucket(self):
        vocab_data = ["earth", "wind", "and", "fire"]
        input_array = tf.ragged.constant([["earth", "wind", "fire"],
                                          ["fire", "and", "earth", "ohio"]])
        expected_output = [[3, 4, 6], [6, 5, 3, 2]]

        input_data = keras.Input(shape=(None, ), dtype=tf.string, ragged=True)
        layer = string_lookup.StringLookup(num_oov_indices=2)
        layer.set_vocabulary(vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
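A sketch of the index layout with two OOV buckets under the older Keras defaults (mask_token=""): index 0 is the mask, indices 1-2 are OOV buckets chosen by hashing, and vocabulary terms start at index 3:

layer = string_lookup.StringLookup(num_oov_indices=2,
                                   vocabulary=["earth", "wind", "and", "fire"])
# e.g. [[6, 2]]: "fire" is the last vocab term, "ohio" hashes into a bucket.
print(layer(tf.constant([["fire", "ohio"]])))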
Example #17
    def test_int_output_explicit_vocab_from_file(self):
        vocab_list = ["earth", "wind", "and", "fire"]
        vocab_path = self._write_to_temp_file("vocab_file", vocab_list)

        input_array = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "michigan"]])
        expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

        input_data = keras.Input(shape=(None, ), dtype=tf.string)
        layer = string_lookup.StringLookup(vocabulary=vocab_path)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
Example #18
  def test_int_output_no_oov(self):
    vocab_data = ["earth", "wind", "and", "fire"]
    valid_input = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", ""]])
    invalid_input = np.array([["earth", "wind", "and", "michigan"],
                              ["fire", "and", "earth", "michigan"]])
    expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]

    input_data = keras.Input(shape=(None,), dtype=tf.string)
    layer = string_lookup.StringLookup(
        vocabulary=vocab_data, mask_token="", num_oov_indices=0)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(valid_input)
    self.assertAllEqual(expected_output, output_data)
    with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                "found OOV values.*michigan"):
      _ = model.predict(invalid_input)
Example #19
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    vocab_size = 32768
    vocab = fc_bm.create_vocabulary(vocab_size)
    data = fc_bm.create_string_data(max_length,
                                    batch_size * NUM_REPEATS,
                                    vocab,
                                    pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length, ), name="data", dtype=tf.string))
    model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))
    model.add(
        category_encoding.CategoryEncoding(num_tokens=vocab_size + 1,
                                           output_mode="count"))

    # FC implementation
    fc = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="data", vocabulary_list=vocab, num_oov_buckets=1))

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
Example #20
 def test_non_unique_vocab_from_file_fails(self):
     vocab_list = ["earth", "wind", "and", "fire", "earth"]
     vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
     with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
         _ = string_lookup.StringLookup(vocabulary=vocab_path)
Example #21
 def test_no_vocab(self):
     with self.assertRaisesRegex(RuntimeError,
                                 "you must set the layer's vocabulary"):
         layer = string_lookup.StringLookup(output_mode="binary")
         layer([["a"]])
Example #22
    def __init__(self,
                 max_tokens=None,
                 standardize="lower_and_strip_punctuation",
                 split="whitespace",
                 ngrams=None,
                 output_mode="int",
                 output_sequence_length=None,
                 pad_to_max_tokens=False,
                 vocabulary=None,
                 idf_weights=None,
                 sparse=False,
                 ragged=False,
                 **kwargs):

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != tf.string:
            raise ValueError(
                f"`TextVectorization` may only have a dtype of string. "
                f"Received dtype: {kwargs['dtype']}.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = tf.string

        # 'standardize' must be one of
        # (None, LOWER_AND_STRIP_PUNCTUATION, LOWER, STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION, LOWER,
                               STRIP_PUNCTUATION),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, WHITESPACE, CHARACTER, callable)
        layer_utils.validate_string_arg(split,
                                        allowable_strings=(WHITESPACE,
                                                           CHARACTER),
                                        layer_name="TextVectorization",
                                        arg_name="split",
                                        allow_none=True,
                                        allow_callables=True)

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT,
                                                           MULTI_HOT, TF_IDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int)
                or isinstance(ngrams, tuple)
                and all(isinstance(item, int) for item in ngrams)):
            raise ValueError(
                f"`ngrams` must be None, an integer, or a tuple of "
                f"integers. Received: ngrams={ngrams}")

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                f"`output_sequence_length` must be either None or an "
                f"integer when `output_mode` is 'int'. Received: "
                f"output_sequence_length={output_sequence_length}")

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError(
                f"`output_sequence_length` must not be set if `output_mode` is not "
                f"'int'. Received output_sequence_length={output_sequence_length}."
            )

        if ragged and output_mode != INT:
            raise ValueError(f"`ragged` may only be true if `output_mode` is "
                             f"`'int'`. Received: ragged={ragged} and "
                             f"output_mode={output_mode}")

        if ragged and output_sequence_length is not None:
            raise ValueError(
                f"`output_sequence_length` must not be set if ragged "
                f"is True. Received: ragged={ragged} and "
                f"output_sequence_length={output_sequence_length}")

        self._max_tokens = max_tokens
        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams
        self._ragged = ragged

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length

        # VocabularySavedModelSaver will clear the config vocabulary to
        # restore the lookup table ops directly. We persist this hidden
        # option to record the fact that we have a non-adaptable layer with
        # a manually set vocabulary.
        self._has_input_vocabulary = kwargs.pop("has_input_vocabulary",
                                                (vocabulary is not None))

        # Drop deprecated config options.
        kwargs.pop("vocabulary_size", None)

        super().__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        self._lookup_layer = string_lookup.StringLookup(
            max_tokens=max_tokens,
            vocabulary=vocabulary,
            idf_weights=idf_weights,
            pad_to_max_tokens=pad_to_max_tokens,
            mask_token="",
            output_mode=output_mode if output_mode is not None else INT,
            sparse=sparse,
            has_input_vocabulary=self._has_input_vocabulary)
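A minimal usage sketch for the layer defined above. The sample texts are hypothetical, and the exact output ids depend on the adapted vocabulary order:

vectorizer = TextVectorization(output_mode="int", output_sequence_length=4)
vectorizer.adapt(np.array(["the quick brown fox", "the slow dog"]))
print(vectorizer(tf.constant(["the quick dog"])))  # e.g. [[2, 3, 7, 0]], padded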
Example #23
    def __init__(self,
                 max_tokens=None,
                 standardize="lower_and_strip_punctuation",
                 split="whitespace",
                 ngrams=None,
                 output_mode="int",
                 output_sequence_length=None,
                 pad_to_max_tokens=False,
                 vocabulary=None,
                 **kwargs):
        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != tf.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = tf.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=(SPLIT_ON_WHITESPACE,),
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT,
                                                           MULTI_HOT, TF_IDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int)
                or isinstance(ngrams, tuple)
                and all(isinstance(item, int) for item in ngrams)):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % output_sequence_length)

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        self._max_tokens = max_tokens
        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        vocabulary_size = 0
        # IndexLookup needs to keep track of the current vocab size outside
        # of its layer weights. We persist it as a hidden part of the config
        # during serialization.
        if "vocabulary_size" in kwargs:
            vocabulary_size = kwargs["vocabulary_size"]
            del kwargs["vocabulary_size"]

        super(TextVectorization, self).__init__(combiner=None, **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        self._index_lookup_layer = string_lookup.StringLookup(
            max_tokens=max_tokens,
            vocabulary=vocabulary,
            pad_to_max_tokens=pad_to_max_tokens,
            mask_token="",
            output_mode=output_mode if output_mode is not None else INT,
            vocabulary_size=vocabulary_size)
Example #24
 def test_no_vocab(self):
     with self.assertRaisesRegex(ValueError,
                                 "You must set the layer's vocabulary"):
         layer = string_lookup.StringLookup()
         layer([["a"]])
Example #25
 def test_non_unique_vocab_fails(self):
     vocab_data = ["earth", "wind", "and", "fire", "fire"]
     with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
         _ = string_lookup.StringLookup(vocabulary=vocab_data)