    def __init__(self,
                 num_tokens=None,
                 output_mode=BINARY,
                 sparse=False,
                 **kwargs):
        # `max_tokens` is an old name for the `num_tokens` arg, which we
        # continue to support because of existing usage.
        if "max_tokens" in kwargs:
            logging.warning(
                "max_tokens is deprecated, please use num_tokens instead.")
            num_tokens = kwargs["max_tokens"]
            del kwargs["max_tokens"]

        super(CategoryEncoding, self).__init__(**kwargs)

        # 'output_mode' must be one of (COUNT, BINARY)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(COUNT, BINARY),
                                        layer_name="CategoryEncoding",
                                        arg_name="output_mode")

        if num_tokens is None:
            raise ValueError(
                "num_tokens must be set to use this layer. If the "
                "number of tokens is not known beforehand, use the "
                "IntegerLookup layer instead.")
        if num_tokens < 1:
            raise ValueError("num_tokens must be >= 1.")

        self.num_tokens = num_tokens
        self.output_mode = output_mode
        self.sparse = sparse
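
For reference, a minimal usage sketch of the constructor above, assuming the
public tf.keras.layers.CategoryEncoding export of a recent TF release (the
BINARY constant here corresponds to the later "multi_hot" string mode):

import tensorflow as tf

# Multi-hot encode integer category ids into a fixed-width vector with
# num_tokens slots; each row gets a 1 at every index seen in that row.
layer = tf.keras.layers.CategoryEncoding(num_tokens=4,
                                         output_mode="multi_hot")
print(layer([[0, 1], [0, 0], [1, 2], [3, 1]]))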
Example #2
    def __init__(self,
                 max_tokens=None,
                 output_mode=BINARY,
                 sparse=False,
                 **kwargs):
        # 'output_mode' must be one of (COUNT, BINARY, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(COUNT, BINARY,
                                                           TFIDF),
                                        layer_name="CategoryEncoding",
                                        arg_name="output_mode")

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("max_tokens must be greater than 1.")

        # We need to call super() before we call _add_state_variable().
        combiner = _CategoryEncodingCombiner(
            compute_max_element=max_tokens is None,
            compute_idf=output_mode == TFIDF)
        super(CategoryEncoding, self).__init__(combiner=combiner, **kwargs)
        base_preprocessing_layer._kpl_gauge.get_cell("V2").set(
            "CategoryEncoding")

        self._max_tokens = max_tokens
        self._output_mode = output_mode
        self._sparse = sparse
        self._called = False

        # We are adding these here instead of in build() since they do not depend
        # on the input shape at all.
        if max_tokens is None:
            self.num_elements = self._add_state_variable(
                name=_NUM_ELEMENTS_NAME,
                shape=(),
                dtype=dtypes.int32,
                initializer=init_ops.zeros_initializer)

        if self._output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) tensorshape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do), since the shape of the weight is not known ahead of time.
            # Hence the lambda shape, dtype: [0].
            if max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = init_ops.zeros_initializer

            self.tf_idf_weights = self._add_state_variable(
                name=_IDF_NAME,
                shape=tensor_shape.TensorShape((max_tokens, )),
                dtype=K.floatx(),
                initializer=initializer)

            self.input_spec = InputSpec(ndim=2)
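
The shape-independent initializer trick in the TFIDF branch above can be shown
in isolation; a standalone sketch (all names here are illustrative, not part
of the layer):

import tensorflow as tf

# A callable initializer that ignores the requested shape lets us create a
# 1-D variable whose length is unknown at construction time; any 1-D value
# can be assigned to it later.
initializer = lambda shape, dtype: [0]
weights = tf.Variable(initializer(None, tf.float32), dtype=tf.float32,
                      shape=tf.TensorShape([None]))
weights.assign([0.5, 1.2, 0.7])  # a different length is fine
print(weights.numpy())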
Example #3
    def __init__(self,
                 num_tokens=None,
                 output_mode=MULTI_HOT,
                 sparse=False,
                 **kwargs):
        # `max_tokens` is an old name for the `num_tokens` arg, which we
        # continue to support because of existing usage.
        if "max_tokens" in kwargs:
            logging.warning(
                "max_tokens is deprecated, please use num_tokens instead.")
            num_tokens = kwargs["max_tokens"]
            del kwargs["max_tokens"]

        super(CategoryEncoding, self).__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "CategoryEncoding").set(True)

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(COUNT, ONE_HOT,
                                                           MULTI_HOT),
                                        layer_name="CategoryEncoding",
                                        arg_name="output_mode")

        if num_tokens is None:
            raise ValueError(
                "num_tokens must be set to use this layer. If the "
                "number of tokens is not known beforehand, use the "
                "IntegerLookup layer instead.")
        if num_tokens < 1:
            raise ValueError("num_tokens must be >= 1.")

        self.num_tokens = num_tokens
        self.output_mode = output_mode
        self.sparse = sparse
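
A short sketch of two of the modes accepted here, assuming the public
tf.keras.layers.CategoryEncoding export (TF >= 2.6):

import tensorflow as tf

# ONE_HOT expects one category id per sample and emits a single 1 per row.
one_hot = tf.keras.layers.CategoryEncoding(num_tokens=4,
                                           output_mode="one_hot")
print(one_hot([3, 2, 0, 1]))

# COUNT emits per-row occurrence counts instead of a binary indicator.
count = tf.keras.layers.CategoryEncoding(num_tokens=4, output_mode="count")
print(count([[0, 1], [0, 0], [1, 2], [3, 1]]))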
Example #4
    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=True,
                 **kwargs):

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=(SPLIT_ON_WHITESPACE,),
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT, BINARY,
                                                           TFIDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int)
                or isinstance(ngrams, tuple)
                and all(isinstance(item, int) for item in ngrams)):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % output_sequence_length)

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("max_tokens must be greater than 1.")

        self._max_tokens = max_tokens

        # In INT mode, we have two reserved values (PAD and OOV). However, non-INT
        # modes don't have a PAD value, so we only need to reserve one value.
        self._reserved_values = 2 if output_mode == INT else 1

        # In INT mode, the zero value is reserved for padding (per Keras standard
        # padding approaches). In non-INT modes, there is no padding so we can set
        # the OOV value to zero instead of one.
        self._oov_value = 1 if output_mode == INT else 0

        # We always reduce max_tokens by 1 to account for the OOV token. The
        # index 0 that Keras reserves for padding in INT mode does not count
        # as a 'token' for vocabulary purposes, so we only reduce the vocab
        # size by 1 here rather than by 2.
        self._max_vocab_size = max_tokens - 1 if max_tokens is not None else None

        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        self._pad_to_max = pad_to_max_tokens
        self._vocab_size = 0
        self._called = False

        super(TextVectorization, self).__init__(
            combiner=_TextVectorizationCombiner(
                self._max_vocab_size, compute_idf=output_mode == TFIDF),
            **kwargs)
        self._supports_ragged_inputs = True

        reserve_zero = output_mode in [None, INT]
        self._index_lookup_layer = self._get_index_lookup_class()(
            max_tokens=max_tokens,
            reserve_zero=reserve_zero,
            dtype=dtypes.string)

        # If this layer is configured for string or integer output, we do not
        # create a vectorization layer (as the output is not vectorized).
        if self._output_mode in [None, INT]:
            return

        if max_tokens is not None and self._pad_to_max:
            max_elements = max_tokens
        else:
            max_elements = None
        self._vectorize_layer = self._get_vectorization_class()(
            max_tokens=max_elements, output_mode=self._output_mode)
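
End to end, this constructor is normally paired with adapt(); a minimal
sketch against the public tf.keras.layers.TextVectorization export (the toy
corpus is illustrative):

import tensorflow as tf

# Learn a vocabulary from a small corpus, then map strings to padded integer
# sequences; index 0 is reserved for padding and 1 for OOV tokens.
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=1000, output_mode="int", output_sequence_length=4)
vectorizer.adapt(["the cat sat", "the dog ran away"])
print(vectorizer(["the cat ran"]))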
Example #5
  def __init__(self,
               max_tokens=None,
               standardize=LOWER_AND_STRIP_PUNCTUATION,
               split=SPLIT_ON_WHITESPACE,
               ngrams=None,
               output_mode=INT,
               output_sequence_length=None,
               pad_to_max_tokens=True,
               vocabulary=None,
               **kwargs):

    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
      raise ValueError("TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
      kwargs["dtype"] = dtypes.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(SPLIT_ON_WHITESPACE,),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, BINARY, TFIDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or
            isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
      raise ValueError(("`ngrams` must be None, an integer, or a tuple of "
                        "integers. Got %s") % (ngrams,))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT and not (isinstance(output_sequence_length, int) or
                                    (output_sequence_length is None))):
      raise ValueError("`output_sequence_length` must be either None or an "
                       "integer when `output_mode` is 'int'. "
                       "Got %s" % output_sequence_length)

    if output_mode != INT and output_sequence_length is not None:
      raise ValueError("`output_sequence_length` must not be set if "
                       "`output_mode` is not 'int'.")

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
      raise ValueError("max_tokens must be greater than 1.")

    self._max_tokens = max_tokens

    # In INT mode, the zero value is reserved for padding (per Keras standard
    # padding approaches). In non-INT modes, there is no padding so we can set
    # the OOV value to zero instead of one.
    self._oov_value = 1 if output_mode == INT else 0

    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
      self._ngrams = tuple(range(1, ngrams + 1))
    else:
      self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length
    self._pad_to_max = pad_to_max_tokens
    self._vocab_size = 0

    super(TextVectorization, self).__init__(
        combiner=None,
        **kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell(
        "TextVectorization").set(True)

    mask_token = "" if output_mode in [None, INT] else None
    self._index_lookup_layer = self._get_index_lookup_class()(
        max_tokens=max_tokens,
        mask_token=mask_token,
        vocabulary=vocabulary,
        pad_to_max_tokens=pad_to_max_tokens,
        output_mode=output_mode if output_mode is not None else INT)
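
The vocabulary argument added in this revision skips adapt() entirely; a
sketch, again via the public layer:

import tensorflow as tf

# Supplying the vocabulary up front: in "int" mode indices 0 and 1 stay
# reserved for padding and OOV, so "cat" maps to 2 and "the" to 1 (OOV).
vectorizer = tf.keras.layers.TextVectorization(
    vocabulary=["cat", "dog", "ran"], output_mode="int")
print(vectorizer(["the cat ran"]))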
Example #6
  def __init__(self,
               max_tokens,
               num_oov_indices,
               mask_token,
               oov_token,
               vocabulary=None,
               invert=False,
               output_mode=INT,
               sparse=False,
               pad_to_max_tokens=False,
               **kwargs):

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
      raise ValueError("If set, `max_tokens` must be greater than 1. "
                       "You passed {}".format(max_tokens))

    if num_oov_indices < 0:
      raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
                       "You passed {}".format(num_oov_indices))

    # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT, TFIDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if invert and output_mode != INT:
      raise ValueError("`output_mode` must be {} when `invert` is true. You "
                       "passed {}".format(INT, output_mode))

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False
    self._vocab_size = 0
    # We need to keep track of our current vocab size outside of our layer weights
    # to support a static output shape when `output_mode != INT`. The bincount
    # ops do not set shape on their outputs, which means we have to set it
    # ourselves. We persist the current vocab size as a hidden part of the
    # config when serializing our model.
    if "vocab_size" in kwargs:
      self._vocab_size = kwargs["vocab_size"]
      del kwargs["vocab_size"]

    if max_tokens is not None:
      available_vocab_size = max_tokens - self._token_start_index()
    else:
      available_vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
        **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
      self._key_dtype = dtypes.int64
      self._value_dtype = self.dtype
      self._mask_key = 0
      self._mask_value = mask_token
      default_value = self.oov_token
      oov_indices = None
    else:
      self._key_dtype = self.dtype
      self._value_dtype = dtypes.int64
      self._mask_key = mask_token
      # Masks should map to 0 for int output and be dropped otherwise. Max ints
      # will be dropped from the bincount op.
      self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
      oov_start = self._oov_start_index()
      token_start = self._token_start_index()
      if self.num_oov_indices == 0:
        # If there are no OOV indices, we map OOV tokens to -1 for int output
        # and drop them from bagged output. Max ints will be dropped from the
        # bincount op.
        default_value = -1 if self.output_mode == INT else dtypes.int64.max
        oov_indices = None
      elif self.num_oov_indices == 1:
        # If there is only one OOV index, we can set that index as the default
        # value of the index_lookup table.
        default_value = oov_start
        oov_indices = None
      else:
        # If we have multiple OOV values, we need to do a further hashing step;
        # to make this easier, we set the OOV value to -1. (This lets us do a
        # vectorized add and cast to boolean to determine locations where we
        # need to do extra hashing.)
        default_value = -1
        oov_indices = list(range(oov_start, token_start))

    if vocabulary is not None and isinstance(vocabulary,
                                             lookup_ops.TextFileInitializer):
      self._table = self._static_table_class()(
          vocabulary, default_value=default_value)
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          mask_token=mask_token,
          oov_tokens=oov_indices,
          use_v1_apis=self._use_v1_apis())
      self.max_tokens = (
          self._table_handler.table_size() + self.num_oov_indices +
          (0 if mask_token is None else 1))
    else:
      self._table = lookup_ops.MutableHashTable(
          key_dtype=self._key_dtype,
          value_dtype=self._value_dtype,
          default_value=default_value,
          name=(self._name + "_index_table"))
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          oov_tokens=oov_indices,
          use_v1_apis=self._use_v1_apis())
      if vocabulary is not None:
        self.set_vocabulary(vocabulary)

    if self.output_mode == TFIDF:
      # The TF-IDF weight may have a (None,) tensorshape. This creates
      # a 1D variable with arbitrary shape, which we can assign any weight to
      # so long as it has 1 dimension. In order to properly initialize this
      # weight in Keras, we need to provide a custom callable initializer which
      # does not depend on the shape of the weight (as all other initializers
      # do), since the shape of the weight is not known ahead of time. Hence
      # the lambda shape, dtype: [0].
      if not self.pad_to_max_tokens or max_tokens is None:
        initializer = lambda shape, dtype: [0]
      else:
        initializer = init_ops.zeros_initializer

      # We are adding these here instead of in build() since they do not depend
      # on the input shape at all.
      idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
      self.tf_idf_weights = self._add_state_variable(
          name="idf",
          shape=tensor_shape.TensorShape(idf_shape),
          dtype=K.floatx(),
          initializer=initializer)

    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0,))
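
IndexLookup is the shared base of the public StringLookup and IntegerLookup
layers, so its behavior is easiest to demonstrate through one of them; a
sketch assuming tf.keras.layers.StringLookup (TF >= 2.6):

import tensorflow as tf

# Forward lookup: with no mask token and one OOV slot, the OOV index is 0
# and vocabulary tokens start at 1, so "horse" maps to 0 here.
lookup = tf.keras.layers.StringLookup(
    vocabulary=["cat", "dog", "ran"], num_oov_indices=1, mask_token=None)
print(lookup(["cat", "horse"]))

# Inverse lookup, as in the invert=True branch above: indices map back to
# tokens, and the OOV index renders as the configured oov_token.
inverse = tf.keras.layers.StringLookup(
    vocabulary=["cat", "dog", "ran"], invert=True, mask_token=None)
print(inverse([1, 0]))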
Example #7

    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 **kwargs):

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, max_tokens must be greater than 1. "
                             "You passed %s" % (max_tokens, ))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than 0. You passed "
                "%s" % (num_oov_indices, ))

        if invert and num_oov_indices != 1:
            raise ValueError(
                "`num_oov_tokens` must be 1 when `invert` is True.")

        # 'output_mode' must be one of (INT, BINARY, COUNT)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.mask_token = mask_token
        self.output_mode = output_mode
        self.sparse = sparse

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need to
        # do a further hashing step; to make this easier, we set the OOV value to
        # -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_indices == 1:
            self._oov_value = 0 if mask_token is None else 1
        else:
            self._oov_value = -1

        if max_tokens is not None:
            num_mask_tokens = (0 if mask_token is None else 1)
            vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
        else:
            vocab_size = None

        super(IndexLookup, self).__init__(
            combiner=_IndexLookupCombiner(vocab_size, self.mask_token),
            **kwargs)

        self._output_dtype = dtypes.int64

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = self._output_dtype
            value_dtype = self.dtype
            oov_value = self.oov_token
        else:
            self._key_dtype = self.dtype
            value_dtype = self._output_dtype
            oov_value = self._oov_value

        self._table = lookup_ops.MutableHashTable(key_dtype=self._key_dtype,
                                                  value_dtype=value_dtype,
                                                  default_value=oov_value,
                                                  name=(self._name +
                                                        "_index_table"))
        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        if self.num_oov_indices <= 1:
            oov_indices = None
        else:
            oov_start = 1 if mask_token is not None else 0
            oov_end = oov_start + num_oov_indices
            oov_indices = list(range(oov_start, oov_end))

        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())

        if vocabulary is not None:
            self.set_vocabulary(vocabulary)
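
The multi-OOV hashing path described in the comments above can also be
exercised through the public wrapper; a sketch:

import tensorflow as tf

# With num_oov_indices > 1, unknown tokens are hashed into one of the leading
# OOV slots (0 and 1 here) instead of a single default value; vocabulary
# tokens then start at index 2.
lookup = tf.keras.layers.StringLookup(
    vocabulary=["cat", "dog"], num_oov_indices=2, mask_token=None)
print(lookup(["cat", "horse", "zebra"]))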
Example #8

    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=True,
                 **kwargs):

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=[LOWER_AND_STRIP_PUNCTUATION],
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=[SPLIT_ON_WHITESPACE],
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=[INT, COUNT, BINARY, TFIDF],
            layer_name="TextVectorization",
            arg_name="output_mode",
            allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int)
                or isinstance(ngrams, tuple)
                and all(isinstance(item, int) for item in ngrams)):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % output_sequence_length)

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("max_tokens must be greater than 1.")

        self._max_tokens = max_tokens

        # In INT mode, we have two reserved values (PAD and OOV). However, non-INT
        # modes don't have a PAD value, so we only need to reserve one value.
        self._reserved_values = 2 if output_mode == INT else 1

        # In INT mode, the zero value is reserved for padding (per Keras standard
        # padding approaches). In non-INT modes, there is no padding so we can set
        # the OOV value to zero instead of one.
        self._oov_value = 1 if output_mode == INT else 0

        # We always reduce max_tokens by 1 to account for the OOV token. The
        # index 0 that Keras reserves for padding in INT mode does not count
        # as a 'token' for vocabulary purposes, so we only reduce the vocab
        # size by 1 here rather than by 2.
        self._max_vocab_size = max_tokens - 1 if max_tokens is not None else None

        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        self._pad_to_max = pad_to_max_tokens
        self._vocab_size = 0
        self._called = False

        super(TextVectorization, self).__init__(
            combiner=_TextVectorizationCombiner(
                self._max_vocab_size, compute_idf=output_mode == TFIDF),
            **kwargs)

        self._table = lookup_ops.MutableHashTable(
            key_dtype=dtypes.string,
            value_dtype=dtypes.int64,
            default_value=self._oov_value,
            name=(self._name + "_index_table"))

        def fail(_):
            raise NotImplementedError(
                "Saving is not yet supported for TextVectorization layers.")

        self._table._list_extra_dependencies_for_serialization = fail  # pylint: disable=protected-access

        tracked_table = self._add_trackable(self._table, trainable=False)

        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        # If this layer is configured for string or integer output, we do not
        # create a vectorization layer (as the output is not vectorized).
        if self._output_mode in [None, INT]:
            return

        if max_tokens is not None and self._pad_to_max:
            vectorize_max_tokens = max_tokens
        else:
            vectorize_max_tokens = None
        self._vectorize_layer = self._get_vectorization_class()(
            max_tokens=vectorize_max_tokens, output_mode=self._output_mode)
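
The ngrams normalization above (an int n expands to all orders 1..n, a tuple
is taken as-is) is visible in the learned vocabulary; a sketch via the public
layer:

import tensorflow as tf

# ngrams=2 is normalized to (1, 2), so the vocabulary contains unigrams plus
# space-joined bigrams such as "the cat".
vectorizer = tf.keras.layers.TextVectorization(ngrams=2,
                                               output_mode="count",
                                               max_tokens=10)
vectorizer.adapt(["the cat sat", "the cat ran"])
print(vectorizer.get_vocabulary())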
Example #9

  def __init__(self,
               max_tokens,
               num_oov_indices,
               mask_token,
               oov_token,
               vocabulary=None,
               invert=False,
               output_mode=INT,
               sparse=False,
               pad_to_max_tokens=False,
               **kwargs):
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
      raise ValueError("If set, `max_tokens` must be greater than 1. "
                       "You passed {}".format(max_tokens))

    if num_oov_indices < 0:
      raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
                       "You passed {}".format(num_oov_indices))

    # Support deprecated names for output_modes.
    if output_mode == "binary":
      output_mode = MULTI_HOT
    if output_mode == "tf-idf":
      output_mode = TF_IDF
    # 'output_mode' must be one of (INT, MULTI_HOT, COUNT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, MULTI_HOT, COUNT, TF_IDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if invert and output_mode != INT:
      raise ValueError("`output_mode` must be {} when `invert` is true. You "
                       "passed {}".format(INT, output_mode))

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False

    # A note on vocab_size: we need to always keep a non-Tensor representation
    # of vocab_size around to use in graph building. Because we might be
    # in a tf.function, we can't rely on evaluating the actual tables to
    # find the value either.
    self._vocab_size = None
    # We need to keep track of our current vocab size outside of our layer weights
    # to support a static output shape when `output_mode != INT`. The bincount
    # ops do not set shape on their outputs, which means we have to set it
    # ourselves. We persist the current vocab size as a hidden part of the
    # config when serializing our model.
    if "vocabulary_size" in kwargs:
      self._vocab_size = kwargs["vocabulary_size"]
      del kwargs["vocabulary_size"]

    restore_from_static_table = kwargs.pop("has_static_table", False)

    # Make sure the mask token and oov token are truly of the dtype we want. We
    # can ignore strings here, because they have only one dtype.
    dtype = kwargs["dtype"]
    if dtype == dtypes.int32:
      mask_token = None if mask_token is None else np.int32(mask_token)
      oov_token = None if oov_token is None else np.int32(oov_token)
    elif dtype == dtypes.int64:
      mask_token = None if mask_token is None else np.int64(mask_token)
      oov_token = None if oov_token is None else np.int64(oov_token)
    self.mask_token = mask_token
    self.oov_token = oov_token

    if max_tokens is not None:
      available_vocab_size = max_tokens - self._token_start_index()
    else:
      available_vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TF_IDF)),
        **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
      self._key_dtype = dtypes.int64
      self._value_dtype = self.dtype
      self._mask_key = 0
      self._mask_value = mask_token
      key_index = lookup_ops.TextFileIndex.LINE_NUMBER
      value_index = lookup_ops.TextFileIndex.WHOLE_LINE
      default_value = self.oov_token
      oov_indices = None
    else:
      self._key_dtype = self.dtype
      self._value_dtype = dtypes.int64
      self._mask_key = mask_token
      key_index = lookup_ops.TextFileIndex.WHOLE_LINE
      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
      # Masks should map to 0 for int output and be dropped otherwise. Max ints
      # will be dropped from the bincount op.
      self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
      oov_start = self._oov_start_index()
      token_start = self._token_start_index()
      if self.num_oov_indices == 0:
        # If there are no OOV indices, we map OOV tokens to -1 and error out
        # during call if we find a negative index.
        default_value = -1
        oov_indices = None
      elif self.num_oov_indices == 1:
        # If there is only one OOV index, we can set that index as the default
        # value of the index_lookup table.
        default_value = oov_start
        oov_indices = None
      else:
        # If we have multiple OOV values, we need to do a further hashing step;
        # to make this easier, we set the OOV value to -1. (This lets us do a
        # vectorized add and cast to boolean to determine locations where we
        # need to do extra hashing.)
        default_value = -1
        oov_indices = list(range(oov_start, token_start))

    self._static_vocabulary_path = None
    has_vocab_path = (vocabulary is not None and isinstance(vocabulary, str))
    if has_vocab_path or restore_from_static_table:
      self._has_static_table = True
      if vocabulary is None:
        # If we're restoring a layer that was saved with a static table
        # initializer, we create a fake initializer object to let the code
        # progress. The savedmodel restoration code will handle restoring
        # the actual data.
        initializer = _NullInitializer(self._key_dtype, self._value_dtype)
      else:
        if not gfile.Exists(vocabulary):
          raise ValueError("Vocabulary file %s does not exist." % (vocabulary,))
        self._static_vocabulary_path = vocabulary
        num_tokens = table_utils.num_tokens_in_file(vocabulary)
        self._vocab_size = self._token_start_index() + num_tokens

        initializer = lookup_ops.TextFileInitializer(
            filename=vocabulary,
            key_dtype=self._key_dtype,
            key_index=key_index,
            value_dtype=self._value_dtype,
            value_index=value_index,
            value_index_offset=self._token_start_index())

      self._table = lookup_ops.StaticHashTable(
          initializer, default_value=default_value)
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          mask_token=self._mask_key if self.mask_token is not None else None,
          mask_value=self._mask_value,
          oov_tokens=oov_indices)

      tracked_table = self._add_trackable(self._table, trainable=False)

    else:
      self._has_static_table = False
      self._table = lookup_ops.MutableHashTable(
          key_dtype=self._key_dtype,
          value_dtype=self._value_dtype,
          default_value=default_value,
          name=(self._name + "_index_table"))
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          oov_tokens=oov_indices)
      if vocabulary is not None:
        self.set_vocabulary(vocabulary)
      tracked_table = self._add_trackable(self._table, trainable=False)

    if self.output_mode == TF_IDF:
      # The TF-IDF weight may have a (None,) tensorshape. This creates
      # a 1D variable with arbitrary shape, which we can assign any weight to
      # so long as it has 1 dimension. In order to properly initialize this
      # weight in Keras, we need to provide a custom callable initializer which
      # does not depend on the shape of the weight (as all other initializers
      # do), since the shape of the weight is not known ahead of time. Hence
      # the lambda shape, dtype: [0].
      if not self.pad_to_max_tokens or max_tokens is None:
        initializer = lambda shape, dtype: [0]
      else:
        initializer = init_ops.zeros_initializer

      # We are adding these here instead of in build() since they do not depend
      # on the input shape at all.
      idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
      self.tf_idf_weights = self._add_state_variable(
          name="idf",
          shape=tensor_shape.TensorShape(idf_shape),
          dtype=backend.floatx(),
          initializer=initializer)

    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0,))
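
The static-table branch above is triggered by passing a file path as the
vocabulary; a sketch in which vocab.txt is a hypothetical one-token-per-line
file:

import tensorflow as tf

# A path string makes the layer build an immutable, file-backed
# StaticHashTable instead of a MutableHashTable.
with open("vocab.txt", "w") as f:
    f.write("cat\ndog\nran\n")
lookup = tf.keras.layers.StringLookup(vocabulary="vocab.txt")
print(lookup(["dog", "horse"]))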
Example #10
  def __init__(self,
               max_tokens,
               num_oov_indices,
               mask_token,
               oov_token,
               vocabulary=None,
               invert=False,
               output_mode=INT,
               sparse=False,
               pad_to_max_tokens=False,
               **kwargs):

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
      raise ValueError("If set, max_tokens must be greater than 1. "
                       "You passed %s" % (max_tokens,))

    if num_oov_indices < 0:
      raise ValueError(
          "num_oov_indices must be greater than or equal to 0. You passed %s" %
          (num_oov_indices,))

    if invert and num_oov_indices != 1:
      raise ValueError("`num_oov_tokens` must be 1 when `invert` is True.")

    # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT, TFIDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False

    # If there is only one OOV bucket, we can determine the OOV value (either 0
    # or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need to
    # do a further hashing step; to make this easier, we set the OOV value to
    # -1. (This lets us do a vectorized add and cast to boolean to determine
    # locations where we need to do extra hashing.)
    if self.num_oov_indices == 1:
      self._oov_value = 0 if mask_token is None else 1
    else:
      self._oov_value = -1

    if max_tokens is not None:
      num_mask_tokens = (0 if mask_token is None else 1)
      vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
    else:
      vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(
            vocab_size=vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
        **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
      self._key_dtype = dtypes.int64
      self._value_dtype = self.dtype
      oov_value = self.oov_token
    else:
      self._key_dtype = self.dtype
      self._value_dtype = dtypes.int64
      oov_value = self._oov_value

    if self.num_oov_indices <= 1:
      oov_indices = None
    else:
      oov_start = 1 if mask_token is not None else 0
      oov_end = oov_start + num_oov_indices
      oov_indices = list(range(oov_start, oov_end))

    if vocabulary is not None and isinstance(vocabulary,
                                             lookup_ops.TextFileInitializer):
      self._table = self._static_table_class()(
          vocabulary, default_value=oov_value)
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          mask_token=mask_token,
          oov_tokens=oov_indices,
          use_v1_apis=self._use_v1_apis())
      self.max_tokens = (
          self._table_handler.vocab_size() + self.num_oov_indices +
          (0 if mask_token is None else 1))
    else:
      self._table = lookup_ops.MutableHashTable(
          key_dtype=self._key_dtype,
          value_dtype=self._value_dtype,
          default_value=oov_value,
          name=(self._name + "_index_table"))
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          oov_tokens=oov_indices,
          use_v1_apis=self._use_v1_apis())
      if vocabulary is not None:
        self.set_vocabulary(vocabulary)

    if self.output_mode == TFIDF:
      # The TF-IDF weight may have a (None,) tensorshape. This creates
      # a 1D variable with arbitrary shape, which we can assign any weight to
      # so long as it has 1 dimension. In order to properly initialize this
      # weight in Keras, we need to provide a custom callable initializer which
      # does not depend on the shape of the weight (as all other initializers
      # do), since the shape of the weight is not known ahead of time. Hence
      # the lambda shape, dtype: [0].
      if not self.pad_to_max_tokens or max_tokens is None:
        initializer = lambda shape, dtype: [0]
      else:
        initializer = init_ops.zeros_initializer

      # We are adding these here instead of in build() since they do not depend
      # on the input shape at all.
      idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
      self.tf_idf_weights = self._add_state_variable(
          name="idf",
          shape=tensor_shape.TensorShape(idf_shape),
          dtype=K.floatx(),
          initializer=initializer)

    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0,))
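
The TFIDF branch above needs idf weights alongside the vocabulary; a sketch
through the public StringLookup layer (TF >= 2.6), with illustrative weights:

import tensorflow as tf

# In "tf_idf" mode, per-token counts are scaled by the supplied idf weights;
# set_vocabulary takes the weights together with the vocab.
lookup = tf.keras.layers.StringLookup(output_mode="tf_idf")
lookup.set_vocabulary(["cat", "dog", "ran"],
                      idf_weights=[0.5, 0.25, 0.75])
print(lookup([["cat", "dog", "cat"]]))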
Example #11

    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=False,
                 vocabulary=None,
                 **kwargs):

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=(SPLIT_ON_WHITESPACE,),
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT,
                                                           MULTI_HOT, TF_IDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int)
                or isinstance(ngrams, tuple)
                and all(isinstance(item, int) for item in ngrams)):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % output_sequence_length)

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        self._max_tokens = max_tokens
        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        vocabulary_size = 0
        # IndexLookup needs to keep track of the current vocab size outside of its
        # layer weights. We persist it as a hidden part of the config during
        # serialization.
        if "vocabulary_size" in kwargs:
            vocabulary_size = kwargs["vocabulary_size"]
            del kwargs["vocabulary_size"]

        super(TextVectorization, self).__init__(combiner=None, **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        self._index_lookup_layer = string_lookup.StringLookup(
            max_tokens=max_tokens,
            vocabulary=vocabulary,
            pad_to_max_tokens=pad_to_max_tokens,
            output_mode=output_mode if output_mode is not None else INT,
            vocabulary_size=vocabulary_size)
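
Since this revision delegates vocabulary handling to StringLookup, the
non-int output modes fall out directly; a closing sketch:

import tensorflow as tf

# Multi-hot document vectors over the learned vocabulary; the underlying
# StringLookup layer handles both the vocab and the binning.
vectorizer = tf.keras.layers.TextVectorization(output_mode="multi_hot",
                                               max_tokens=8)
vectorizer.adapt(["the cat sat", "the dog ran"])
print(vectorizer(["the cat ran"]))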