Ejemplo n.º 1
0
    def __init__(self, num_bins, output_mode="int", sparse=False, **kwargs):
        """Set up the layer, choosing a default dtype and checking arguments.

        Args:
            num_bins: Stored unchanged on `self.num_bins`.
            output_mode: Checked against `(INT, ONE_HOT)` through
                `layer_utils.validate_string_arg`.
            sparse: Stored unchanged on `self.sparse`.
            **kwargs: Forwarded to the base layer constructor. A missing or
                `None` `dtype` entry is filled in before the call.

        Raises:
            ValueError: If `output_mode` equals `INT` and the compute dtype
                resolved by the base layer is not an integer type.
        """
        # No usable dtype supplied: int64 for integer output, the backend
        # float type for every other mode.
        if kwargs.get("dtype") is None:
            if output_mode == INT:
                kwargs["dtype"] = tf.int64
            else:
                kwargs["dtype"] = backend.floatx()

        super().__init__(**kwargs)
        usage_gauge = base_preprocessing_layer.keras_kpl_gauge
        usage_gauge.get_cell("HashedCrossing").set(True)

        # The dtype is inspected only after the base layer has parsed it,
        # because that parsing is complex.
        if output_mode == INT:
            resolved_dtype = tf.as_dtype(self.compute_dtype)
            if not resolved_dtype.is_integer:
                input_dtype = kwargs["dtype"]
                raise ValueError(
                    "When `output_mode='int'`, `dtype` should be an integer "
                    f"type. Received: dtype={input_dtype}")

        # Only INT and ONE_HOT are acceptable values for "output_mode".
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(INT, ONE_HOT),
            layer_name=self.__class__.__name__,
            arg_name="output_mode",
        )

        self.num_bins = num_bins
        self.output_mode = output_mode
        self.sparse = sparse
Ejemplo n.º 2
0
    def __init__(self,
                 num_tokens=None,
                 output_mode=BINARY,
                 sparse=False,
                 **kwargs):
        """Configure the encoding layer.

        Args:
            num_tokens: Required; must be at least 1.
            output_mode: Checked against `(COUNT, BINARY)` through
                `layer_utils.validate_string_arg`.
            sparse: Stored unchanged on `self.sparse`.
            **kwargs: Forwarded to the base layer constructor. A legacy
                `max_tokens` entry is removed first and overrides
                `num_tokens`.

        Raises:
            ValueError: If `num_tokens` is `None` or smaller than 1.
        """
        # `max_tokens` is the former name of `num_tokens`; it is still
        # honoured because existing code passes it.
        if "max_tokens" in kwargs:
            logging.warning(
                "max_tokens is deprecated, please use num_tokens instead.")
            num_tokens = kwargs.pop("max_tokens")

        super(CategoryEncoding, self).__init__(**kwargs)

        # Only COUNT and BINARY are acceptable values for 'output_mode'.
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(COUNT, BINARY),
            layer_name="CategoryEncoding",
            arg_name="output_mode",
        )

        if num_tokens is None:
            raise ValueError(
                "num_tokens must be set to use this layer. If the "
                "number of tokens is not known beforehand, use the "
                "IntegerLookup layer instead.")
        elif num_tokens < 1:
            raise ValueError("num_tokens must be >= 1.")

        self.num_tokens = num_tokens
        self.output_mode = output_mode
        self.sparse = sparse
Ejemplo n.º 3
0
  def __init__(self,
               num_bins,
               mask_value=None,
               salt=None,
               output_mode='int',
               sparse=False,
               **kwargs):
    """Validate the hashing configuration and store it on the layer.

    Args:
      num_bins: Must be a positive number; `None` is rejected.
      mask_value: Stored unchanged on `self.mask_value`.
      salt: `None`, a single int, or a tuple/list of length 2. A single int
        is stored as `[salt, salt]`.
      output_mode: Checked against `(INT, ONE_HOT, MULTI_HOT, COUNT)` through
        `layer_utils.validate_string_arg`.
      sparse: Not allowed together with `output_mode == INT`.
      **kwargs: Forwarded to the base layer constructor after `dtype` has
        been defaulted or coerced.

    Raises:
      ValueError: If `num_bins` is `None` or not positive, if the resolved
        compute dtype is not an integer type in INT mode, if `sparse` is
        combined with INT mode, or if `salt` has an unsupported form.
    """
    bins_missing = num_bins is None
    if bins_missing or num_bins <= 0:
      raise ValueError(
          f'The `num_bins` for `Hashing` cannot be `None` or non-positive '
          f'values. Received: num_bins={num_bins}.')

    requested_dtype = kwargs.get('dtype')
    if requested_dtype is None:
      # Default: int64 for integer output, backend float type otherwise.
      if output_mode == INT:
        kwargs['dtype'] = tf.int64
      else:
        kwargs['dtype'] = backend.floatx()
    elif output_mode == 'int' and not tf.as_dtype(requested_dtype).is_integer:
      # Compatibility path: dtype used to be always floating and was ignored
      # by the layer, so a non-integer request is replaced with int64.
      kwargs['dtype'] = tf.int64

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell('Hashing').set(True)

    # The dtype is inspected only after the base layer has parsed it,
    # because that parsing is complex.
    if output_mode == INT:
      if not tf.as_dtype(self.compute_dtype).is_integer:
        input_dtype = kwargs['dtype']
        raise ValueError(
            'When `output_mode="int"`, `dtype` should be an integer '
            f'type. Received: dtype={input_dtype}')

    # Acceptable values for 'output_mode': INT, ONE_HOT, MULTI_HOT, COUNT.
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT),
        layer_name=self.__class__.__name__,
        arg_name='output_mode')

    if sparse and output_mode == INT:
      raise ValueError(f'`sparse` may only be true if `output_mode` is '
                       f'`"one_hot"`, `"multi_hot"`, or `"count"`. '
                       f'Received: sparse={sparse} and '
                       f'output_mode={output_mode}')

    self.num_bins = num_bins
    self.mask_value = mask_value
    # `strong_hash` is set exactly when a salt was supplied.
    self.strong_hash = salt is not None
    self.output_mode = output_mode
    self.sparse = sparse
    self.salt = None
    if salt is not None:
      is_pair = isinstance(salt, (tuple, list)) and len(salt) == 2
      if is_pair:
        self.salt = salt
      elif isinstance(salt, int):
        # A lone integer is duplicated into a two-element salt.
        self.salt = [salt, salt]
      else:
        raise ValueError(
            f'The `salt` argument for `Hashing` can only be a tuple of size 2 '
            f'integers, or a single integer. Received: salt={salt}.')
Ejemplo n.º 4
0
    def __init__(self,
                 max_tokens=None,
                 output_mode=BINARY,
                 sparse=False,
                 **kwargs):
        """Validate arguments, create the combiner and set up layer state.

        Args:
            max_tokens: Optional; when given it must be at least 1. It is
                passed to `_CategoryEncodingCombiner` and used as the length
                of the TF-IDF weight variable.
            output_mode: Checked against `(COUNT, BINARY, TFIDF)` through
                `layer_utils.validate_string_arg`.
            sparse: Stored unchanged on `self.sparse`.
            **kwargs: Forwarded to the base layer constructor.

        Raises:
            ValueError: If `max_tokens` is set and is smaller than 1.
        """
        # 'output_mode' must be one of (COUNT, BINARY, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(COUNT, BINARY,
                                                           TFIDF),
                                        layer_name="CategoryEncoding",
                                        arg_name="output_mode")

        # If max_tokens is set, the value must be at least 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense. The message
        # states the bound the condition actually enforces.
        if max_tokens is not None and max_tokens < 1:
            raise ValueError("max_tokens must be >= 1.")

        # We need to call super() before we call _add_state_variable().
        combiner = _CategoryEncodingCombiner(max_tokens=max_tokens,
                                             compute_idf=output_mode == TFIDF)
        super(CategoryEncoding, self).__init__(combiner=combiner, **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "CategoryEncoding").set(True)

        self.max_tokens = max_tokens
        self.output_mode = output_mode
        self.sparse = sparse
        self._called = False

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) tensorshape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do) since the weight is not known. Hence the lambda shape, dtype: [0].
            if max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = tf.compat.v1.zeros_initializer

            # We are adding these here instead of in build() since they do not depend
            # on the input shape at all.
            self.tf_idf_weights = self._add_state_variable(
                name=_IDF_NAME,
                shape=tf.TensorShape((max_tokens, )),
                dtype=K.floatx(),
                initializer=initializer)

            # Note: the input spec is only set in TF-IDF mode.
            self.input_spec = InputSpec(ndim=2)
Ejemplo n.º 5
0
    def __init__(self,
                 num_tokens=None,
                 output_mode="multi_hot",
                 sparse=False,
                 **kwargs):
        """Configure the encoding layer.

        Args:
            num_tokens: Required; must be at least 1.
            output_mode: One of `COUNT`, `ONE_HOT` or `MULTI_HOT`. The
                deprecated value `"binary"` is mapped to `MULTI_HOT`.
            sparse: Stored unchanged on `self.sparse`.
            **kwargs: Forwarded to the base layer constructor. A legacy
                `max_tokens` entry is removed first and overrides
                `num_tokens`; a missing `dtype` is set to the backend float
                type.

        Raises:
            ValueError: If `num_tokens` is `None` or smaller than 1.
        """
        # `max_tokens` is the former name of `num_tokens`; it is still
        # honoured because existing code passes it.
        if "max_tokens" in kwargs:
            logging.warning(
                "max_tokens is deprecated, please use num_tokens instead.")
            num_tokens = kwargs.pop("max_tokens")

        # Output floats unless told otherwise. TF2 already behaves this way,
        # but TF1 infers dtype from the inputs and would pick int.
        dtype_given = "dtype" in kwargs
        if not dtype_given:
            kwargs["dtype"] = backend.floatx()

        super().__init__(**kwargs)
        usage_gauge = base_preprocessing_layer.keras_kpl_gauge
        usage_gauge.get_cell("CategoryEncoding").set(True)

        # Translate the deprecated output-mode name.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        # Acceptable values for 'output_mode': COUNT, ONE_HOT, MULTI_HOT.
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(COUNT, ONE_HOT,
                                                           MULTI_HOT),
                                        layer_name="CategoryEncoding",
                                        arg_name="output_mode")

        if num_tokens is None:
            raise ValueError(
                "num_tokens must be set to use this layer. If the "
                "number of tokens is not known beforehand, use the "
                "IntegerLookup layer instead.")
        elif num_tokens < 1:
            raise ValueError(
                f"`num_tokens` must be >= 1. Received: num_tokens={num_tokens}."
            )

        self.num_tokens = num_tokens
        self.output_mode = output_mode
        self.sparse = sparse
Ejemplo n.º 6
0
    def __init__(self,
                 num_tokens=None,
                 output_mode=MULTI_HOT,
                 sparse=False,
                 **kwargs):
        """Configure the encoding layer.

        Args:
            num_tokens: Required; must be at least 1.
            output_mode: One of `COUNT`, `ONE_HOT` or `MULTI_HOT`. The
                deprecated value `"binary"` is mapped to `MULTI_HOT`.
            sparse: Stored unchanged on `self.sparse`.
            **kwargs: Forwarded to the base layer constructor. A legacy
                `max_tokens` entry is removed first and overrides
                `num_tokens`.

        Raises:
            ValueError: If `num_tokens` is `None` or smaller than 1.
        """
        # Accept the old `max_tokens` spelling, which is still in use, and
        # strip it so the base constructor never sees it.
        legacy_name = "max_tokens"
        if legacy_name in kwargs:
            logging.warning(
                "max_tokens is deprecated, please use num_tokens instead.")
            num_tokens = kwargs.pop(legacy_name)

        super(CategoryEncoding, self).__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "CategoryEncoding").set(True)

        # Translate the deprecated output-mode name.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        # Acceptable values for 'output_mode': COUNT, ONE_HOT, MULTI_HOT.
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
            layer_name="CategoryEncoding",
            arg_name="output_mode",
        )

        if num_tokens is None:
            raise ValueError(
                "num_tokens must be set to use this layer. If the "
                "number of tokens is not known beforehand, use the "
                "IntegerLookup layer instead.")
        elif num_tokens < 1:
            raise ValueError("num_tokens must be >= 1.")

        self.num_tokens = num_tokens
        self.output_mode = output_mode
        self.sparse = sparse
Ejemplo n.º 7
0
    def __init__(
        self,
        bin_boundaries=None,
        num_bins=None,
        epsilon=0.01,
        output_mode="int",
        sparse=False,
        **kwargs,
    ):
        """Validate the discretization configuration and store it.

        Args:
            bin_boundaries: Optional boundaries; mutually exclusive with
                `num_bins`. Passed through `utils.listify_tensors`.
            num_bins: Optional bin count; must not be negative.
            epsilon: Stored unchanged on `self.epsilon`.
            output_mode: Checked against `(INT, ONE_HOT, MULTI_HOT, COUNT)`
                through `layer_utils.validate_string_arg`.
            sparse: Not allowed together with `output_mode == INT`.
            **kwargs: Forwarded to the base layer constructor. A deprecated
                `bins` entry is removed first and used for `num_bins` or
                `bin_boundaries`; `dtype` is defaulted or coerced.

        Raises:
            ValueError: If the resolved compute dtype is not an integer type
                in INT mode, if `sparse` is combined with INT mode, if
                `num_bins` is negative, or if both `num_bins` and
                `bin_boundaries` are given.
        """
        # `bins` is a deprecated way of passing either bin_boundaries or
        # num_bins that still has some usage.
        if "bins" in kwargs:
            logging.warning(
                "bins is deprecated, please use bin_boundaries or num_bins instead."
            )
            legacy_bins = kwargs.pop("bins")
            if isinstance(legacy_bins, int) and num_bins is None:
                num_bins = legacy_bins
            elif bin_boundaries is None:
                bin_boundaries = legacy_bins

        requested_dtype = kwargs.get("dtype")
        if requested_dtype is None:
            # Default: int64 for integer output, backend float otherwise.
            if output_mode == INT:
                kwargs["dtype"] = tf.int64
            else:
                kwargs["dtype"] = backend.floatx()
        elif (
            output_mode == "int"
            and not tf.as_dtype(requested_dtype).is_integer
        ):
            # Compatibility path: dtype used to be always floating and was
            # ignored by the layer, so it is replaced with int64.
            kwargs["dtype"] = tf.int64

        super().__init__(**kwargs)
        usage_gauge = base_preprocessing_layer.keras_kpl_gauge
        usage_gauge.get_cell("Discretization").set(True)

        # The dtype is inspected only after the base layer has parsed it,
        # because that parsing is complex.
        if output_mode == INT:
            if not tf.as_dtype(self.compute_dtype).is_integer:
                input_dtype = kwargs["dtype"]
                raise ValueError(
                    "When `output_mode='int'`, `dtype` should be an integer "
                    f"type. Received: dtype={input_dtype}"
                )

        # Acceptable values for 'output_mode': INT, ONE_HOT, MULTI_HOT, COUNT.
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT),
            layer_name=self.__class__.__name__,
            arg_name="output_mode",
        )

        if sparse and output_mode == INT:
            raise ValueError(
                f"`sparse` may only be true if `output_mode` is "
                f"`'one_hot'`, `'multi_hot'`, or `'count'`. "
                f"Received: sparse={sparse} and "
                f"output_mode={output_mode}"
            )

        bins_given = num_bins is not None
        if bins_given and num_bins < 0:
            raise ValueError(
                "`num_bins` must be greater than or equal to 0. "
                "You passed `num_bins={}`".format(num_bins)
            )
        if bins_given and bin_boundaries is not None:
            raise ValueError(
                "Both `num_bins` and `bin_boundaries` should not be "
                "set. You passed `num_bins={}` and "
                "`bin_boundaries={}`".format(num_bins, bin_boundaries)
            )

        bin_boundaries = utils.listify_tensors(bin_boundaries)
        # Keep the value exactly as supplied next to the working copy, which
        # falls back to an empty list when no boundaries were given.
        self.input_bin_boundaries = bin_boundaries
        if bin_boundaries is not None:
            self.bin_boundaries = bin_boundaries
        else:
            self.bin_boundaries = []
        self.num_bins = num_bins
        self.epsilon = epsilon
        self.output_mode = output_mode
        self.sparse = sparse
Ejemplo n.º 8
0
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        """Validate arguments, build the lookup table and register layer state.

        Args:
            max_tokens: `None`, or a value greater than 1.
            num_oov_indices: Number of OOV indices; must not be negative.
            mask_token: When not `None`, one extra special token is counted
                in `self._num_special_tokens`.
            oov_token: Passed to the combiner as `oov_value`; also used as
                the table default value when `invert` is true.
            vocabulary: Optional. A `tf.lookup.TextFileInitializer` produces
                a static table; any other non-`None` value is handed to
                `self.set_vocabulary`.
            invert: When true, keys are `tf.int64` and values use the layer
                dtype; otherwise the roles are swapped.
            output_mode: Checked against `(INT, BINARY, COUNT, TFIDF)`
                through `layer_utils.validate_string_arg`.
            sparse: Stored unchanged on `self.sparse`.
            pad_to_max_tokens: Stored on the layer; in TFIDF mode it selects
                the shape and initializer of the idf weight variable.
            **kwargs: Forwarded to the base layer constructor after the
                hidden `vocab_size` entry, if present, has been removed.

        Raises:
            ValueError: If `max_tokens` is set and is not greater than 1, or
                if `num_oov_indices` is negative.
        """

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed {}".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT,
                                                           TFIDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.mask_token = mask_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self._called = False
        # Special tokens are the OOV indices plus, if present, the mask token.
        self._num_special_tokens = self.num_oov_indices
        if self.mask_token is not None:
            self._num_special_tokens += 1
        self._vocab_size = 0
        # We need to keep track of our current vocab size outside of our layer
        # weights to support a static output shape when `output_mode != INT`. The
        # bincount ops do not set shape on their outputs, which means we have to
        # set it ourselves. We persist the current vocab size as a hidden part of
        # the config when serializing our model.
        if "vocab_size" in kwargs:
            self._vocab_size = kwargs["vocab_size"]
            del kwargs["vocab_size"]

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need
        # to do a further hashing step; to make this easier, we set the OOV value
        # to -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_indices == 1:
            self._oov_value = 0 if mask_token is None else 1
        else:
            self._oov_value = -1

        # The combiner's vocabulary budget excludes the special tokens.
        if max_tokens is not None:
            available_vocab_size = max_tokens - self._num_special_tokens
        else:
            available_vocab_size = None

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
                                          **kwargs)

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = tf.int64
            self._value_dtype = self.dtype
            oov_value = self.oov_token
            oov_indices = None
        else:
            self._key_dtype = self.dtype
            self._value_dtype = tf.int64
            oov_value = self._oov_value
            if self.num_oov_indices <= 1:
                oov_indices = None
            else:
                # OOV indices start right after the mask slot (if any).
                oov_start = 1 if mask_token is not None else 0
                oov_end = oov_start + num_oov_indices
                oov_indices = list(range(oov_start, oov_end))

        if vocabulary is not None and isinstance(
                vocabulary, tf.lookup.TextFileInitializer):
            # File-backed vocabulary: build a static table and derive
            # max_tokens from the table size plus the special tokens.
            self._table = self._static_table_class()(vocabulary,
                                                     default_value=oov_value)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                mask_token=mask_token,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            self.max_tokens = (self._table_handler.table_size() +
                               self.num_oov_indices +
                               (0 if mask_token is None else 1))
        else:
            # Otherwise use a mutable table, filled through set_vocabulary()
            # when a vocabulary was supplied.
            self._table = lookup_ops.MutableHashTable(
                key_dtype=self._key_dtype,
                value_dtype=self._value_dtype,
                default_value=oov_value,
                name=(self._name + "_index_table"))
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            if vocabulary is not None:
                self.set_vocabulary(vocabulary)

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) tensorshape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do) since the weight is not known. Hence the lambda shape, dtype: [0].
            if not self.pad_to_max_tokens or max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = tf.compat.v1.zeros_initializer

            # We are adding these here instead of in build() since they do not depend
            # on the input shape at all.
            idf_shape = (max_tokens, ) if self.pad_to_max_tokens else (None, )
            self.tf_idf_weights = self._add_state_variable(
                name="idf",
                shape=tf.TensorShape(idf_shape),
                dtype=K.floatx(),
                initializer=initializer)

        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tf.TensorShape((0, ))
Ejemplo n.º 9
0
    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=True,
                 vocabulary=None,
                 **kwargs):
        """Validate the configuration and create the inner index-lookup layer.

        Args:
            max_tokens: Optional; rejected when smaller than 1. Passed on to
                the index-lookup layer.
            standardize: `None`, `LOWER_AND_STRIP_PUNCTUATION`, or a callable.
            split: `None`, `SPLIT_ON_WHITESPACE`, or a callable.
            ngrams: `None`, an int, or a tuple of ints. An int `n` is
                expanded to `(1, ..., n)`.
            output_mode: `None` or one of `INT`, `COUNT`, `BINARY`, `TFIDF`.
                `None` is passed to the index-lookup layer as `INT`.
            output_sequence_length: `None` or an int; may only be set when
                `output_mode` is `INT`.
            pad_to_max_tokens: Passed on to the index-lookup layer.
            vocabulary: Passed on to the index-lookup layer.
            **kwargs: Forwarded to the base layer constructor. `dtype`, if
                present, must be `tf.string`; it defaults to `tf.string`.

        Raises:
            ValueError: If `dtype` is not `tf.string`, if `ngrams` or
                `output_sequence_length` has an unsupported type, if
                `output_sequence_length` is set outside INT mode, or if
                `max_tokens` is smaller than 1.
        """

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != tf.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = tf.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        # The trailing comma matters: without it the parentheses yield a plain
        # string instead of a one-element tuple like the other call sites pass.
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION, ),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=(SPLIT_ON_WHITESPACE, ),
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT, BINARY,
                                                           TFIDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int)). The tuple test is
        # parenthesised explicitly; `and` already bound tighter than `or`.
        if not (ngrams is None or isinstance(ngrams, int)
                or (isinstance(ngrams, tuple)
                    and all(isinstance(item, int) for item in ngrams))):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % output_sequence_length)

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        # NOTE(review): the condition only rejects values below 1, while the
        # comment and message say "> 1" - confirm which bound is intended.
        if max_tokens is not None and max_tokens < 1:
            raise ValueError("max_tokens must be > 1.")

        self._max_tokens = max_tokens

        # In INT mode, the zero value is reserved for padding (per Keras standard
        # padding approaches). In non-INT modes, there is no padding so we can set
        # the OOV value to zero instead of one.
        self._oov_value = 1 if output_mode == INT else 0

        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        # An integer n requests every n-gram size from 1 through n.
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        self._pad_to_max = pad_to_max_tokens
        self._vocab_size = 0

        super(TextVectorization, self).__init__(combiner=None, **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        # An empty-string mask token is used for INT (or unset) output; the
        # other modes get no mask token.
        mask_token = "" if output_mode in [None, INT] else None
        self._index_lookup_layer = self._get_index_lookup_class()(
            max_tokens=max_tokens,
            mask_token=mask_token,
            vocabulary=vocabulary,
            pad_to_max_tokens=pad_to_max_tokens,
            output_mode=output_mode if output_mode is not None else INT)
Ejemplo n.º 10
0
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary_dtype,
                 vocabulary=None,
                 idf_weights=None,
                 invert=False,
                 output_mode="int",
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        """Validate arguments and set up lookup, mask, idf and adapt state.

        Args:
            max_tokens: `None`, or a value greater than 1. Required when
                `pad_to_max_tokens` is true.
            num_oov_indices: Number of OOV indices; must not be negative.
            mask_token: When not `None`, mask key/value tensors are created.
            oov_token: Used as the default value when `invert` is true.
            vocabulary_dtype: Dtype of the vocabulary terms; used as the key
                dtype (or the value dtype when `invert` is true).
            vocabulary: Optional; when given it is passed to
                `self.set_vocabulary` together with `idf_weights`.
            idf_weights: Only allowed when `output_mode` is `TF_IDF`.
            invert: Requires `output_mode == INT`.
            output_mode: One of `INT`, `ONE_HOT`, `MULTI_HOT`, `COUNT`,
                `TF_IDF`. The deprecated names `"binary"` and `"tf-idf"` are
                mapped to `MULTI_HOT` and `TF_IDF`.
            sparse: Not allowed together with `output_mode == INT`.
            pad_to_max_tokens: Stored unchanged on the layer.
            **kwargs: Forwarded to the base layer constructor after the
                hidden/deprecated entries `has_input_vocabulary`,
                `vocabulary_size` and `has_static_table` have been removed
                and a missing `dtype` has been defaulted.

        Raises:
            ValueError: If any of the argument constraints above is
                violated, or if the resolved compute dtype is not an
                integer type in INT mode.
        """
        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError(f"If set, `max_tokens` must be greater than 1. "
                             f"Received: max_tokens={max_tokens}")

        if pad_to_max_tokens and max_tokens is None:
            raise ValueError(
                f"If pad_to_max_tokens is True, must set `max_tokens`. "
                f"Received: max_tokens={max_tokens}")

        if num_oov_indices < 0:
            raise ValueError(
                f"`num_oov_indices` must be greater than or equal to 0. "
                f"Received: num_oov_indices={num_oov_indices}")

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, ONE_HOT,
                                                           MULTI_HOT, COUNT,
                                                           TF_IDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        if invert and output_mode != INT:
            raise ValueError(
                f"`output_mode` must be `'int'` when `invert` is true. "
                f"Received: output_mode={output_mode}")

        if sparse and output_mode == INT:
            raise ValueError(
                f"`sparse` may only be true if `output_mode` is "
                f"`'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. "
                f"Received: sparse={sparse} and "
                f"output_mode={output_mode}")

        if idf_weights is not None and output_mode != TF_IDF:
            raise ValueError(
                f"`idf_weights` should only be set if `output_mode` is "
                f"`'tf_idf'`. Received: idf_weights={idf_weights} and "
                f"output_mode={output_mode}")

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.mask_token = mask_token
        self.oov_token = oov_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self.vocabulary_dtype = vocabulary_dtype
        self._frozen_vocab_size = None

        # Keep the constructor inputs exactly as they were supplied.
        self.input_vocabulary = vocabulary
        self.input_idf_weights = idf_weights
        # VocabularySavedModelSaver will clear the config vocabulary to restore the
        # lookup table ops directly. We persist this hidden option to record the
        # fact that we have a non-adaptable layer with a manually set vocab.
        self._has_input_vocabulary = kwargs.pop("has_input_vocabulary",
                                                (vocabulary is not None))

        # Drop deprecated config options.
        kwargs.pop("vocabulary_size", None)
        kwargs.pop("has_static_table", None)

        # By default, output int64 when output_mode='int' and floats otherwise.
        if "dtype" not in kwargs:
            kwargs[
                "dtype"] = tf.int64 if output_mode == INT else backend.floatx(
                )

        super().__init__(**kwargs)

        # Check dtype only after base layer parses it; dtype parsing is complex.
        if output_mode == INT and not tf.as_dtype(
                self.compute_dtype).is_integer:
            input_dtype = kwargs["dtype"]
            raise ValueError(
                "When `output_mode='int'`, `dtype` should be an integer "
                f"type. Received: dtype={input_dtype}")

        if invert:
            # Inverse lookup: integer indices are the keys, vocabulary terms
            # are the values.
            self._key_dtype = self.dtype if output_mode == INT else tf.int64
            self._value_dtype = tf.as_dtype(self.vocabulary_dtype)
            mask_key = 0
            mask_value = mask_token
            self._default_value = self.oov_token
        else:
            self._key_dtype = tf.as_dtype(self.vocabulary_dtype)
            self._value_dtype = self.dtype if output_mode == INT else tf.int64
            mask_key = mask_token
            # Masks should map to 0 for int output and be dropped otherwise. Max ints
            # will be dropped from the bincount op.
            mask_value = 0 if self.output_mode == INT else self._value_dtype.max
            if self.num_oov_indices == 0:
                # If there are no OOV indices, we map OOV tokens to -1 and error out
                # during call if we find a negative index.
                self._default_value = -1
            elif self.num_oov_indices == 1:
                # If there is only one OOV index, we can set that index as the default
                # value of the index_lookup table.
                self._default_value = self._oov_start_index()
            else:
                # If we have multiple OOV values, we need to do a further hashing
                # step; to make this easier, we set the OOV value to -1. (This lets
                # us do a vectorized add and cast to boolean to determine locations
                # where we need to do extra hashing.)
                self._default_value = -1
        # Mask tensors are only created when a mask token is configured.
        if self.mask_token is not None:
            self._mask_key = tf.convert_to_tensor(mask_key, self._key_dtype)
            self._mask_value = tf.convert_to_tensor(mask_value,
                                                    self._value_dtype)

        if self.output_mode == TF_IDF:
            # The idf weights variable has an unspecified length (shape (None,))
            # and starts with one zero per index before `_token_start_index()`.
            self.idf_weights = tf.Variable([0] * self._token_start_index(),
                                           shape=(None, ),
                                           dtype=self.compute_dtype,
                                           trainable=False)
            self.idf_weights_const = self.idf_weights.value()

        if vocabulary is not None:
            self.set_vocabulary(vocabulary, idf_weights)
        else:
            # When restoring from a keras SavedModel, the loading code will expect to
            # find and restore a lookup_table attribute on the layer. This table needs
            # to be uninitialized as a StaticHashTable cannot be initialized twice.
            self.lookup_table = self._uninitialized_lookup_table()

        # Only set up adapt state if we did not receive a vocab on construction.
        if not self._has_input_vocabulary:
            # Add a custom weight handler to return the layer's vocab as its weight.
            self._add_trackable(VocabWeightHandler(self), False)
            # Set adapt state.
            self.token_counts = tf.lookup.experimental.MutableHashTable(
                key_dtype=vocabulary_dtype,
                value_dtype=tf.int64,
                default_value=0)
            if self.output_mode == TF_IDF:
                self.token_document_counts = tf.lookup.experimental.MutableHashTable(
                    key_dtype=vocabulary_dtype,
                    value_dtype=tf.int64,
                    default_value=0)
                self.num_documents = tf.Variable(0,
                                                 dtype=tf.int64,
                                                 trainable=False)
Ejemplo n.º 11
0
    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=False,
                 vocabulary=None,
                 **kwargs):
        """Validates the configuration and creates the index lookup sub-layer.

        Args:
            max_tokens: Stored on the layer and forwarded to the index lookup
                layer.
            standardize: None, `LOWER_AND_STRIP_PUNCTUATION`, or a callable.
            split: None, `SPLIT_ON_WHITESPACE`, or a callable.
            ngrams: None, an int, or a tuple of ints. An int `n` is expanded
                to the tuple `(1, ..., n)`.
            output_mode: None or one of `INT`, `COUNT`, `BINARY`, `TFIDF`.
                None is forwarded to the index lookup layer as `INT`.
            output_sequence_length: None or an int. May only be set when
                `output_mode` is `INT`.
            pad_to_max_tokens: Forwarded to the index lookup layer.
            vocabulary: Forwarded to the index lookup layer.
            **kwargs: Passed to the base layer constructor. `dtype`, when
                given, must be `tf.string` (it defaults to `tf.string`). The
                hidden `vocabulary_size` entry is removed and forwarded to
                the index lookup layer.

        Raises:
            ValueError: If `dtype` is not `tf.string`, if `ngrams` or
                `output_sequence_length` has an unsupported type, or if
                `output_sequence_length` is set while `output_mode` is not
                `INT`. Validation of `standardize`, `split` and `output_mode`
                is delegated to `layer_utils.validate_string_arg`.
        """

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != tf.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = tf.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        # The trailing comma matters: without it the parentheses hold a bare
        # string rather than a one-element tuple of allowed values.
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION, ),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=(SPLIT_ON_WHITESPACE, ),
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT, BINARY,
                                                           TFIDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int) or
                (isinstance(ngrams, tuple)
                 and all(isinstance(item, int) for item in ngrams))):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            # Wrap the operand in a tuple so that a tuple-valued argument is
            # reported in the ValueError instead of breaking the % formatting.
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % (output_sequence_length, ))

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        self._max_tokens = max_tokens
        self._standardize = standardize
        self._split = split
        # Keep the raw argument alongside the expanded tuple form.
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        # IndexLookup needs to keep track the current vocab size outside of its
        # layer weights. We persist it as a hidden part of the config during
        # serialization, so it must be removed from kwargs before they reach
        # the base layer constructor.
        vocabulary_size = kwargs.pop("vocabulary_size", 0)

        super(TextVectorization, self).__init__(combiner=None, **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        self._index_lookup_layer = self._get_index_lookup_class()(
            max_tokens=max_tokens,
            vocabulary=vocabulary,
            pad_to_max_tokens=pad_to_max_tokens,
            output_mode=output_mode if output_mode is not None else INT,
            vocabulary_size=vocabulary_size)
Ejemplo n.º 12
0
    def __init__(self,
                 max_tokens=None,
                 standardize="lower_and_strip_punctuation",
                 split="whitespace",
                 ngrams=None,
                 output_mode="int",
                 output_sequence_length=None,
                 pad_to_max_tokens=False,
                 vocabulary=None,
                 idf_weights=None,
                 sparse=False,
                 ragged=False,
                 **kwargs):
        """Validates the configuration and creates the string lookup sub-layer.

        Args:
            max_tokens: Stored on the layer and forwarded to `StringLookup`.
            standardize: None, `LOWER_AND_STRIP_PUNCTUATION`, `LOWER`,
                `STRIP_PUNCTUATION`, or a callable.
            split: None, `WHITESPACE`, `CHARACTER`, or a callable.
            ngrams: None, an int, or a tuple of ints. An int `n` is expanded
                to the tuple `(1, ..., n)`.
            output_mode: None or one of `INT`, `COUNT`, `MULTI_HOT`, `TF_IDF`.
                The deprecated spellings `"binary"` and `"tf-idf"` are mapped
                to `MULTI_HOT` and `TF_IDF`. None is forwarded to
                `StringLookup` as `INT`.
            output_sequence_length: None or an int. May only be set when
                `output_mode` is `INT` and `ragged` is false.
            pad_to_max_tokens: Forwarded to `StringLookup`.
            vocabulary: Forwarded to `StringLookup`.
            idf_weights: Forwarded to `StringLookup`.
            sparse: Forwarded to `StringLookup`.
            ragged: Stored on the layer. Only allowed when `output_mode` is
                `INT`.
            **kwargs: Passed to the base layer constructor. `dtype`, when
                given, must be `tf.string` (it defaults to `tf.string`). The
                hidden `has_input_vocabulary` option is consumed here and the
                deprecated `vocabulary_size` option is dropped.

        Raises:
            ValueError: If `dtype` is not `tf.string`, if `ngrams` or
                `output_sequence_length` has an unsupported type, if
                `output_sequence_length` is combined with a non-`INT`
                `output_mode` or with `ragged`, or if `ragged` is combined
                with a non-`INT` `output_mode`. Validation of `standardize`,
                `split` and `output_mode` is delegated to
                `layer_utils.validate_string_arg`.
        """

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != tf.string:
            raise ValueError(
                f"`TextVectorization` may only have a dtype of string. "
                f"Received dtype: {kwargs['dtype']}.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = tf.string

        # 'standardize' must be one of
        # (None, LOWER_AND_STRIP_PUNCTUATION, LOWER, STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION, LOWER,
                               STRIP_PUNCTUATION),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, WHITESPACE, CHARACTER, callable)
        layer_utils.validate_string_arg(split,
                                        allowable_strings=(WHITESPACE,
                                                           CHARACTER),
                                        layer_name="TextVectorization",
                                        arg_name="split",
                                        allow_none=True,
                                        allow_callables=True)

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT,
                                                           MULTI_HOT, TF_IDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int) or
                (isinstance(ngrams, tuple)
                 and all(isinstance(item, int) for item in ngrams))):
            raise ValueError(
                f"`ngrams` must be None, an integer, or a tuple of "
                f"integers. Received: ngrams={ngrams}")

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                f"`output_sequence_length` must be either None or an "
                f"integer when `output_mode` is 'int'. Received: "
                f"output_sequence_length={output_sequence_length}")

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError(
                f"`output_sequence_length` must not be set if `output_mode` is not "
                f"'int'. Received output_sequence_length={output_sequence_length}."
            )

        # Ragged output is rejected for every mode other than INT; the message
        # states that condition (it previously claimed the opposite).
        if ragged and output_mode != INT:
            raise ValueError(f"`ragged` must not be true if `output_mode` is "
                             f"not `'int'`. Received: ragged={ragged} and "
                             f"output_mode={output_mode}")

        if ragged and output_sequence_length is not None:
            raise ValueError(
                f"`output_sequence_length` must not be set if ragged "
                f"is True. Received: ragged={ragged} and "
                f"output_sequence_length={output_sequence_length}")

        self._max_tokens = max_tokens
        self._standardize = standardize
        self._split = split
        # Keep the raw argument alongside the expanded tuple form.
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams
        self._ragged = ragged

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length

        # VocabularySavedModelSaver will clear the config vocabulary to restore the
        # lookup table ops directly. We persist this hidden option to record the
        # fact that we have a non-adaptable layer with a manually set vocab.
        self._has_input_vocabulary = kwargs.pop("has_input_vocabulary",
                                                (vocabulary is not None))

        # Drop deprecated config options.
        kwargs.pop("vocabulary_size", None)

        super().__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        self._lookup_layer = string_lookup.StringLookup(
            max_tokens=max_tokens,
            vocabulary=vocabulary,
            idf_weights=idf_weights,
            pad_to_max_tokens=pad_to_max_tokens,
            mask_token="",
            output_mode=output_mode if output_mode is not None else INT,
            sparse=sparse,
            has_input_vocabulary=self._has_input_vocabulary)
Ejemplo n.º 13
0
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        """Validates arguments and builds the lookup table for this layer.

        Args:
            max_tokens: None, or an int greater than 1.
            num_oov_indices: Non-negative number of OOV indices.
            mask_token: Token treated as the mask, or None. When the `dtype`
                kwarg is `tf.int32` / `tf.int64` it is converted to the
                matching NumPy integer type.
            oov_token: Token stored as the OOV token; it is the table default
                when `invert` is true.
            vocabulary: None, a string (treated as the path of a vocabulary
                file, which must exist), or any other value, which is handed
                to `set_vocabulary`.
            invert: If true, the table maps int64 keys to values of the layer
                dtype; otherwise it maps layer-dtype keys to int64 values.
            output_mode: One of `INT`, `BINARY`, `COUNT`, `TFIDF`. Must be
                `INT` when `invert` is true.
            sparse: Stored on the layer.
            pad_to_max_tokens: Stored on the layer; also selects the shape of
                the TF-IDF weight variable.
            **kwargs: Passed to the base constructor after the hidden
                `vocabulary_size` and `has_static_table` options are removed.

        Raises:
            ValueError: If `max_tokens` is set and not greater than 1, if
                `num_oov_indices` is negative, if `invert` is combined with a
                non-`INT` `output_mode`, or if `vocabulary` is a path that
                does not exist.
        """
        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed {}".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT,
                                                           TFIDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        if invert and output_mode != INT:
            raise ValueError(
                "`output_mode` must be {} when `invert` is true. You "
                "passed {}".format(INT, output_mode))

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self._called = False

        # A note on vocab_size: we need to always keep a non-Tensor representation
        # of vocab_size around to use in graph building. Because we might be
        # in a tf.function, we can't rely on evaluating the actual tables to
        # find the value either.
        self._vocab_size = None
        # We need to keep track of our current vocab size outside of our layer
        # weights to support a static output shape when `output_mode != INT`.
        # The bincount ops do not set shape on their outputs, which means we
        # have to set it ourselves. We persist the current vocab size as a
        # hidden part of the config when serializing our model.
        if "vocabulary_size" in kwargs:
            self._vocab_size = kwargs["vocabulary_size"]
            del kwargs["vocabulary_size"]

        # Hidden config option; when true the static-table branch below is
        # taken even without a vocabulary path.
        restore_from_static_table = kwargs.pop("has_static_table", False)

        # Make sure the mask token is truly of the dtype we want. We can ignore
        # strings here, because they have only one dtype.
        # NOTE(review): `kwargs["dtype"]` raises KeyError when a mask token is
        # given without an explicit `dtype` kwarg - confirm all callers pass it.
        if mask_token is not None:
            dtype = kwargs["dtype"]
            if dtype == tf.int32:
                mask_token = np.int32(mask_token)
            elif dtype == tf.int64:
                mask_token = np.int64(mask_token)
        self.mask_token = mask_token

        # Slots before the token start index are not available to the
        # combiner's vocabulary.
        if max_tokens is not None:
            available_vocab_size = max_tokens - self._token_start_index()
        else:
            available_vocab_size = None

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
                                          **kwargs)

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = tf.int64
            self._value_dtype = self.dtype
            self._mask_key = 0
            self._mask_value = mask_token
            key_index = tf.lookup.TextFileIndex.LINE_NUMBER
            value_index = tf.lookup.TextFileIndex.WHOLE_LINE
            default_value = self.oov_token
            oov_indices = None
        else:
            self._key_dtype = self.dtype
            self._value_dtype = tf.int64
            self._mask_key = mask_token
            key_index = tf.lookup.TextFileIndex.WHOLE_LINE
            value_index = tf.lookup.TextFileIndex.LINE_NUMBER
            # Masks should map to 0 for int output and be dropped otherwise. Max ints
            # will be dropped from the bincount op.
            self._mask_value = 0 if self.output_mode == INT else tf.int64.max
            oov_start = self._oov_start_index()
            token_start = self._token_start_index()
            if self.num_oov_indices == 0:
                # If there are no OOV indices, we map OOV tokens to -1 for int output
                # and drop them from bagged output. Max ints will be dropped from the
                # bincount op.
                default_value = -1 if self.output_mode == INT else tf.int64.max
                oov_indices = None
            elif self.num_oov_indices == 1:
                # If there is only one OOV index, we can set that index as the default
                # value of the index_lookup table.
                default_value = oov_start
                oov_indices = None
            else:
                # If we have multiple OOV values, we need to do a further hashing step;
                # to make this easier, we set the OOV value to -1. (This lets us do a
                # vectorized add and cast to boolean to determine locations where we
                # need to do extra hashing.)
                default_value = -1
                oov_indices = list(range(oov_start, token_start))

        self._static_vocabulary_path = None
        # A string vocabulary is interpreted as a file path.
        has_vocab_path = (vocabulary is not None
                          and isinstance(vocabulary, str))
        if has_vocab_path or restore_from_static_table:
            self._has_static_table = True
            if vocabulary is None:
                # If we're restoring a layer that was saved with a static table
                # initializer, we create a fake initializer object to let the code
                # progress. The savedmodel restoration code will handle restoring
                # the actual data.
                initializer = _NullInitializer(self._key_dtype,
                                               self._value_dtype)
            else:
                if not os.path.exists(vocabulary):
                    raise ValueError("Vocabulary file %s does not exist." %
                                     (vocabulary, ))
                self._static_vocabulary_path = vocabulary
                num_tokens = table_utils.num_tokens_in_file(vocabulary)
                self._vocab_size = self._token_start_index() + num_tokens

                initializer = tf.lookup.TextFileInitializer(
                    filename=vocabulary,
                    key_dtype=self._key_dtype,
                    key_index=key_index,
                    value_dtype=self._value_dtype,
                    value_index=value_index,
                    value_index_offset=self._token_start_index())

            self._table = self._static_table_class()(
                initializer, default_value=default_value)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                mask_token=self._mask_key,
                mask_value=self._mask_value,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())

            tracked_table = self._add_trackable(self._table, trainable=False)

        else:
            self._has_static_table = False
            self._table = lookup_ops.MutableHashTable(
                key_dtype=self._key_dtype,
                value_dtype=self._value_dtype,
                default_value=default_value,
                name=(self._name + "_index_table"))
            # Unlike the static-table branch, no mask token/value is handed to
            # the handler here.
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            if vocabulary is not None:
                self.set_vocabulary(vocabulary)
            tracked_table = self._add_trackable(self._table, trainable=False)

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) tensorshape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do) since the weight is not known. Hence the lambda shape, dtype: [0].
            if not self.pad_to_max_tokens or max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = tf.compat.v1.zeros_initializer

            # We are adding these here instead of in build() since they do not depend
            # on the input shape at all.
            idf_shape = (max_tokens, ) if self.pad_to_max_tokens else (None, )
            self.tf_idf_weights = self._add_state_variable(
                name="idf",
                shape=tf.TensorShape(idf_shape),
                dtype=backend.floatx(),
                initializer=initializer)

        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tf.TensorShape((0, ))
Ejemplo n.º 14
0
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode="int",
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        """Checks the arguments and prepares lookup and adapt state.

        Args:
            max_tokens: None, or an int greater than 1. Required when
                `pad_to_max_tokens` is true.
            num_oov_indices: Non-negative number of OOV indices.
            mask_token: Token treated as the mask, or None for no mask.
            oov_token: Token stored as the OOV token; it is the table default
                when `invert` is true.
            vocabulary: Optional vocabulary handed to `set_vocabulary`.
            invert: If true, keys are int64 and values use the layer dtype;
                otherwise keys use the layer dtype and values are int64.
            output_mode: One of `INT`, `ONE_HOT`, `MULTI_HOT`, `COUNT`,
                `TF_IDF`, or the deprecated spellings `"binary"` / `"tf-idf"`.
                Must be `INT` when `invert` is true.
            sparse: Stored on the layer.
            pad_to_max_tokens: Stored on the layer.
            **kwargs: Passed to the base constructor once the hidden
                `has_input_vocabulary` option and the deprecated
                `vocabulary_size` / `has_static_table` options are removed.

        Raises:
            ValueError: For an out-of-range `max_tokens`, padding without
                `max_tokens`, a negative `num_oov_indices`, or `invert`
                combined with a non-`INT` `output_mode`.
        """
        # A vocabulary capped at one token or fewer would hold no real tokens.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed `max_tokens={}`".format(max_tokens))

        # Padding needs a known target size.
        if pad_to_max_tokens and max_tokens is None:
            raise ValueError(
                "If pad_to_max_tokens is True, must set `max_tokens`. "
                "You passed `max_tokens={}`".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # Translate the deprecated output mode spellings to the current ones.
        output_mode = MULTI_HOT if output_mode == "binary" else output_mode
        output_mode = TF_IDF if output_mode == "tf-idf" else output_mode
        # Accepted modes: INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF.
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF),
            layer_name=self.__class__.__name__,
            arg_name="output_mode")

        if invert and output_mode != INT:
            raise ValueError(
                "`output_mode` must be {} when `invert` is true. You "
                "passed {}".format(INT, output_mode))

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.mask_token = mask_token
        self.oov_token = oov_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self.input_vocabulary = None
        # Hidden config flag recording that the layer was built with a manually
        # set vocabulary (the saved-model saver clears the vocabulary from the
        # config and restores the lookup table ops directly).
        self._has_input_vocabulary = kwargs.pop("has_input_vocabulary", False)
        self._frozen_vocab_size = None

        # Deprecated config options are discarded before the base constructor
        # sees them.
        for stale_option in ("vocabulary_size", "has_static_table"):
            kwargs.pop(stale_option, None)

        super().__init__(**kwargs)

        layer_dtype = tf.as_dtype(self.dtype)
        if not invert:
            self._key_dtype, self._value_dtype = layer_dtype, tf.int64
            mask_source = mask_token
            # Masks map to 0 for int output; otherwise they map to the max
            # int64, which the bincount op drops.
            mask_target = 0 if self.output_mode == INT else tf.int64.max
            # Exactly one OOV index: use it directly as the table default.
            # Zero OOV indices: -1 marks an error to be raised during call.
            # Several OOV indices: -1 marks positions needing a further
            # hashing step.
            single_oov_slot = self.num_oov_indices == 1
            self._default_value = (self._oov_start_index()
                                   if single_oov_slot else -1)
        else:
            self._key_dtype, self._value_dtype = tf.int64, layer_dtype
            mask_source, mask_target = 0, mask_token
            self._default_value = self.oov_token

        # Mask tensors only exist when a mask token was configured.
        if self.mask_token is not None:
            self._mask_key = tf.convert_to_tensor(mask_source,
                                                  self._key_dtype)
            self._mask_value = tf.convert_to_tensor(mask_target,
                                                    self._value_dtype)

        if self.output_mode == TF_IDF:
            initial_idf = [0] * self._token_start_index()
            self.idf_weights = tf.Variable(initial_idf,
                                           shape=(None, ),
                                           dtype=backend.floatx(),
                                           trainable=False)
            self.idf_weights_const = self.idf_weights.value()

        if vocabulary is not None:
            self.set_vocabulary(vocabulary)
            return

        # When restoring from a keras SavedModel, the loading code expects to
        # find and restore a lookup_table attribute on the layer. It has to be
        # uninitialized because a StaticHashTable cannot be initialized twice.
        self.lookup_table = self._uninitialized_lookup_table()
        if self._has_input_vocabulary:
            return

        def new_count_table():
            # Mutable table from layer-dtype tokens to int64 counts.
            return tf.lookup.experimental.MutableHashTable(
                key_dtype=self.dtype, value_dtype=tf.int64, default_value=0)

        # Expose the layer's vocabulary as its weight via a custom handler.
        self._add_trackable(VocabWeightHandler(self), False)
        # State accumulated while adapting.
        self.token_counts = new_count_table()
        if self.output_mode == TF_IDF:
            self.token_document_counts = new_count_table()
            self.num_documents = tf.Variable(0,
                                             dtype=tf.int64,
                                             trainable=False)
Ejemplo n.º 15
0
  def __init__(self,
               max_tokens,
               num_oov_indices,
               mask_token,
               oov_token,
               vocabulary=None,
               invert=False,
               output_mode=INT,
               sparse=False,
               **kwargs):
    """Validates arguments and creates the mutable index table.

    Args:
      max_tokens: None, or an int greater than 1.
      num_oov_indices: Non-negative number of OOV indices; must be exactly 1
        when `invert` is true.
      mask_token: Token treated as the mask, or None. When set, one slot is
        reserved for it ahead of the OOV indices.
      oov_token: Token stored as the OOV token; it is the table default when
        `invert` is true.
      vocabulary: Optional vocabulary handed to `set_vocabulary`.
      invert: If true, the table maps int64 keys to values of the layer dtype;
        otherwise it maps layer-dtype keys to int64 values.
      output_mode: One of `INT`, `BINARY`, `COUNT`.
      sparse: Stored on the layer.
      **kwargs: Passed to the base constructor.

    Raises:
      ValueError: If `max_tokens` is set and not greater than 1, if
        `num_oov_indices` is negative, or if `invert` is true while
        `num_oov_indices` is not 1.
    """

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
      raise ValueError("If set, max_tokens must be greater than 1. "
                       "You passed %s" % (max_tokens,))

    # Zero is a valid value, so the message says "greater than or equal to".
    if num_oov_indices < 0:
      raise ValueError("`num_oov_indices` must be greater than or equal to "
                       "0. You passed %s" % (num_oov_indices,))

    # The message names the actual argument, `num_oov_indices`.
    if invert and num_oov_indices != 1:
      raise ValueError("`num_oov_indices` must be 1 when `invert` is True.")

    # 'output_mode' must be one of (INT, BINARY, COUNT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse

    # If there is only one OOV bucket, we can determine the OOV value (either 0
    # or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to determine
    # locations where we need to do extra hashing.)
    if self.num_oov_indices == 1:
      self._oov_value = 0 if mask_token is None else 1
    else:
      self._oov_value = -1

    # The mask slot and the OOV slots count against max_tokens.
    if max_tokens is not None:
      num_mask_tokens = (0 if mask_token is None else 1)
      vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
    else:
      vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(vocab_size, self.mask_token), **kwargs)

    self._output_dtype = tf.int64

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
      self._key_dtype = self._output_dtype
      value_dtype = self.dtype
      oov_value = self.oov_token
    else:
      self._key_dtype = self.dtype
      value_dtype = self._output_dtype
      oov_value = self._oov_value

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self._key_dtype,
        value_dtype=value_dtype,
        default_value=oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tf.TensorShape((0,))

    # Explicit OOV indices are only handed to the handler when there are
    # several of them; they start right after the optional mask slot.
    if self.num_oov_indices <= 1:
      oov_indices = None
    else:
      oov_start = 1 if mask_token is not None else 0
      oov_end = oov_start + num_oov_indices
      oov_indices = list(range(oov_start, oov_end))

    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_indices,
        use_v1_apis=self._use_v1_apis())

    if vocabulary is not None:
      self.set_vocabulary(vocabulary)
Ejemplo n.º 16
0
    def __init__(self,
                 max_tokens=None,
                 standardize="lower_and_strip_punctuation",
                 split="whitespace",
                 ngrams=None,
                 output_mode="int",
                 output_sequence_length=None,
                 pad_to_max_tokens=False,
                 vocabulary=None,
                 **kwargs):
        """Validates the configuration and creates the inner lookup layer.

        Args:
          max_tokens: Stored on the layer and forwarded to the inner
            `StringLookup` layer.
          standardize: `None`, `LOWER_AND_STRIP_PUNCTUATION`, or a callable.
          split: `None`, `SPLIT_ON_WHITESPACE`, or a callable.
          ngrams: `None`, an integer, or a tuple of integers. An integer `n`
            is expanded to the tuple `(1, ..., n)`.
          output_mode: `None` or one of `INT`, `COUNT`, `MULTI_HOT`, `TF_IDF`.
            The deprecated spellings `"binary"` and `"tf-idf"` are mapped to
            `MULTI_HOT` and `TF_IDF`. When `None`, the inner lookup layer is
            created with `INT`.
          output_sequence_length: `None` or an integer; may only be set when
            `output_mode` is `INT`.
          pad_to_max_tokens: Forwarded to the inner `StringLookup` layer.
          vocabulary: Forwarded to the inner `StringLookup` layer.
          **kwargs: Forwarded to the base layer constructor. `dtype`, if
            given, must be `tf.string`; the deprecated `vocabulary_size` key
            is dropped.

        Raises:
          ValueError: If `dtype` is given and is not `tf.string`, if `ngrams`
            has an unsupported type, or if `output_sequence_length` is
            inconsistent with `output_mode`. String arguments are checked by
            `layer_utils.validate_string_arg`.
        """

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != tf.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = tf.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        # NOTE: the trailing comma matters. Without it the parentheses are
        # just grouping and `allowable_strings` is a plain string, so a
        # membership test would become a substring test (e.g. "lower" or ""
        # would be accepted).
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        # Same one-element-tuple requirement as for `standardize` above.
        layer_utils.validate_string_arg(
            split,
            allowable_strings=(SPLIT_ON_WHITESPACE,),
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT,
                                                           MULTI_HOT, TF_IDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        ngrams_is_int_tuple = (isinstance(ngrams, tuple) and
                               all(isinstance(item, int) for item in ngrams))
        if not (ngrams is None or isinstance(ngrams, int)
                or ngrams_is_int_tuple):
            raise ValueError(
                "`ngrams` must be None, an integer, or a tuple of "
                f"integers. Got {ngrams}")

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            # An f-string is used instead of `"%s" % value` so that a tuple
            # value is reported verbatim instead of being unpacked as the
            # format arguments (which raised TypeError for len != 1).
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                f"Got {output_sequence_length}")

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        self._max_tokens = max_tokens
        self._standardize = standardize
        self._split = split
        # Keep the user-supplied value alongside the expanded tuple below.
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            # An integer n means "all n-gram sizes from 1 up to n".
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        # Drop deprecated config options.
        kwargs.pop("vocabulary_size", None)

        super().__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        # The lookup layer always needs a concrete output mode; fall back to
        # INT when this layer was configured with output_mode=None.
        self._index_lookup_layer = string_lookup.StringLookup(
            max_tokens=max_tokens,
            vocabulary=vocabulary,
            pad_to_max_tokens=pad_to_max_tokens,
            mask_token="",
            output_mode=output_mode if output_mode is not None else INT)