Example #1
    def set_vocabulary(self, vocab, idf_weights=None):
        """Sets vocabulary (and optionally document frequency) data for this layer.

    This method sets the vocabulary and idf weights for this layer directly,
    instead of analyzing a dataset through 'adapt'. It should be used whenever
    the vocab (and optionally document frequency) information is already known.
    If vocabulary data is already present in the layer, this method will replace
    it.

    Args:
      vocab: An array of hashable tokens.
      idf_weights: An array of inverse document frequency weights with equal
        length to vocab. Only necessary if the layer output_mode is TFIDF.

    Raises:
      ValueError: If there are too many inputs, the inputs do not match, or
        input data is missing.
      RuntimeError: If the vocabulary cannot be set when this function is
        called. This happens when "binary", "count", and "tfidf" modes,
        if "pad_to_max_tokens" is False and the layer itself has already been
        called.
    """
        if self.output_mode != TFIDF and idf_weights is not None:
            raise ValueError(
                "`idf_weights` should only be set if output_mode is "
                "TFIDF. output_mode is {}.".format(self.output_mode))

        if (self.output_mode in [BINARY, COUNT, TFIDF] and self._called
                and not self.pad_to_max_tokens):
            raise RuntimeError(
                "When using {} mode and `pad_to_max_tokens` is "
                "False, the vocabulary cannot be changed after the "
                "layer is called.".format(self.output_mode))

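        # A valid vocabulary may begin with special tokens: the mask token at
        # index 0 (if one is set), followed by `num_oov_indices` OOV slots.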
        should_have_mask = self.mask_token is not None
        has_mask = should_have_mask and vocab[0] == self.mask_token
        oov_start = 1 if should_have_mask else 0

        should_have_oov = (self.num_oov_indices > 0)
        if should_have_oov:
            oov_end = oov_start + self.num_oov_indices
            expected_oov = [self.oov_token] * self.num_oov_indices
            has_oov = vocab[oov_start:oov_end] == expected_oov
            # If we get a numpy array, then has_oov may end up being a numpy array
            # instead of a bool. Fix this by collapsing the variable if it's not bool.
            if not isinstance(has_oov, bool):
                has_oov = all(has_oov)
        else:
            has_oov = False

        if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
            raise ValueError(
                "Invalid vocabulary format. The layer was created with "
                "`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
                "included in the provided vocabulary. The passed vocabulary has the "
                "correct mask token `{mask}` at index 0, but does not have the OOV "
                "token `{oov}` in indices [{start}:{end}]. Instead, we found "
                "`{found}`. Was this vocabulary generated by a layer with "
                "incompatible settings?".format(
                    mask=self.mask_token,
                    oov=self.oov_token,
                    start=oov_start,
                    end=oov_end,
                    found=vocab[oov_start:oov_end]))

        if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
            raise ValueError(
                "Invalid vocabulary format. The layer was created with "
                "`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
                "included in the provided vocabulary. The passed vocabulary has the "
                "correct OOV token `{oov}` at indices [{start}:{end}], but does not "
                "have the mask token `{mask}` in index 0. Instead, we found "
                "`{found}`. Was this vocabulary generated by a layer with "
                "incompatible settings?".format(mask=self.mask_token,
                                                oov=self.oov_token,
                                                start=oov_start,
                                                end=oov_end,
                                                found=vocab[0]))

        found_special_tokens = has_oov or has_mask

        if found_special_tokens:
            tokens = vocab[self._num_special_tokens:]
        else:
            tokens = vocab

        repeated_tokens = table_utils.find_repeated_tokens(tokens)
        if repeated_tokens:
            raise ValueError(
                "The passed vocabulary has at least one repeated "
                "term. Please remove the repeated terms from your "
                "vocabulary. The repeated terms are {}".format(repeated_tokens))

        if self.mask_token in tokens:
            raise ValueError(
                "Reserved mask token {} was found in the passed "
                "vocabulary at index {}. Please either remove the "
                "reserved token from the vocabulary or change the "
                "mask token for this layer.".format(
                    self.mask_token, tokens.index(self.mask_token)))
        if self.oov_token in tokens:
            raise ValueError(
                "Reserved OOV token {} was found in the passed "
                "vocabulary at index {}. Please either remove the "
                "reserved token from the vocabulary or change the "
                "OOV token for this layer.".format(
                    self.oov_token, tokens.index(self.oov_token)))

        self._vocab_size = len(tokens) + self._num_special_tokens
        if self.max_tokens is not None and self._vocab_size > self.max_tokens:
            raise ValueError(
                "Attempted to set a vocabulary larger than the maximum vocab size. "
                "Passed vocab size is {}, max vocab size is {}.".format(
                    self._vocab_size, self.max_tokens))

        if self.output_mode == TFIDF:
            if idf_weights is None:
                raise ValueError(
                    "`idf_weights` must be set if output_mode is TFIDF")
            if len(vocab) != len(idf_weights):
                raise ValueError(
                    "`idf_weights` must be the same length as vocab. "
                    "len(idf_weights) is {}, len(vocab) is {}".format(
                        len(idf_weights), len(vocab)))
            idf_weights = self._convert_to_ndarray(idf_weights)
            if idf_weights.ndim != 1:
                raise ValueError(
                    "TF-IDF data must be a 1-dimensional array, but received "
                    "an array of rank {}.".format(idf_weights.ndim))

        # We add the non-special vocab tokens and optionally the mask_token to our
        # hash table. OOV tokens are handled with the hash table default value and
        # not added directly.
        self._table_handler.clear()
        start_index = self._num_special_tokens
        indices = np.arange(start_index,
                            len(tokens) + start_index,
                            dtype=np.int64)
        if self.invert:
            self._table_handler.insert(indices, tokens)
            if self.mask_token is not None:
                self._table_handler.insert([0], [self.mask_token])
        else:
            self._table_handler.insert(tokens, indices)
            if self.mask_token is not None:
                self._table_handler.insert([self.mask_token], [0])

        if self.output_mode == TFIDF:
            # If the passed vocabulary has no special tokens, we need to pad the front
            # of idf_weights. We don't have real document frequencies for these tokens
            # so we will use an average of all idf_weights passed in as a reasonable
            # default.
            if found_special_tokens:
                front_padding = 0
                front_padding_value = 0
            else:
                front_padding = self._num_special_tokens
                front_padding_value = np.average(idf_weights)
            # If pad_to_max_tokens is true, and max_tokens is greater than our total
            # vocab size, we need to pad the back of idf_weights with zeros as well.
            back_padding_value = 0
            if self.pad_to_max_tokens and self.max_tokens is not None:
                back_padding = self.max_tokens - front_padding - len(
                    idf_weights)
            else:
                back_padding = 0
            idf_weights = np.pad(idf_weights, (front_padding, back_padding),
                                 "constant",
                                 constant_values=(front_padding_value,
                                                  back_padding_value))
            K.set_value(self.tf_idf_weights, idf_weights)
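
A minimal usage sketch for the method above. This is an assumption, not part
of the source: it supposes a TF 2.x build in which `StringLookup` sits on top
of this `IndexLookup` base class and exposes `set_vocabulary`.

import tensorflow as tf

# Hypothetical usage; assumes TF 2.x, where StringLookup subclasses IndexLookup.
layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token="", oov_token="[UNK]")
# Supply the vocabulary directly instead of analyzing a dataset with `adapt`.
layer.set_vocabulary(["cat", "dog", "bird"])
# Index 0 is the mask token, index 1 the OOV bucket; real tokens start at 2.
print(layer(tf.constant([["cat", "fish"]])))  # [[2 1]]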
Example #2
  def set_vocabulary(self, vocabulary, idf_weights=None):
    """Sets vocabulary (and optionally document frequency) data for this layer.

    This method sets the vocabulary and idf weights for this layer directly,
    instead of analyzing a dataset through `adapt`. It should be used whenever
    the vocab (and optionally document frequency) information is already known.
    If vocabulary data is already present in the layer, this method will replace
    it.

    Args:
      vocabulary: An array, numpy array, or tensor of hashable tokens.
      idf_weights: An array, numpy array, or tensor of inverse document
        frequency weights equal in length to vocabulary. Only necessary if the
        layer output_mode is TF_IDF.

    Raises:
      ValueError: If there are too many inputs, the inputs do not match, or
        input data is missing.
      RuntimeError: If the vocabulary cannot be set when this function is
        called. This happens in `"multi_hot"`, `"count"`, and `"tf_idf"`
        modes when `pad_to_max_tokens` is False and the layer has already
        been called.
      RuntimeError: If a tensor vocabulary is passed outside of eager execution.
    """
    if self._has_static_table:
      raise RuntimeError("Layer {} was created with a static file-based table "
                         "because a file path was passed to the layer "
                         "init. Layers created with static file-based tables "
                         "do not support changing the vocabulary after "
                         "creation.".format(self.name))

    if self.output_mode != TF_IDF and idf_weights is not None:
      raise ValueError("`idf_weights` should only be set if output_mode is "
                       "TF_IDF. output_mode is {}.".format(self.output_mode))

    if (self.output_mode in [MULTI_HOT, COUNT, TF_IDF] and self._called and
        not self.pad_to_max_tokens):
      raise RuntimeError("When using {} mode and `pad_to_max_tokens` is "
                         "False, the vocabulary cannot be changed after the "
                         "layer is called.".format(self.output_mode))

    if not tf.executing_eagerly() and (tf.is_tensor(vocabulary) or
                                       tf.is_tensor(idf_weights)):
      raise RuntimeError(
          "Cannot set a tensor vocabulary on {} layer {} when not executing "
          "eagerly. Create this layer or call `set_vocabulary` outside of "
          "any `tf.function`s and with eager execution enabled.".format(
              self.__class__.__name__, self.name))

    # TODO(mattdangerw): for better performance we should rewrite this entire
    # function to operate on tensors and convert vocabulary to a tensor here.
    if tf.is_tensor(vocabulary):
      vocabulary = self._tensor_vocab_to_numpy(vocabulary)
    if tf.is_tensor(idf_weights):
      idf_weights = idf_weights.numpy()

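    # A valid vocabulary may begin with special tokens: the mask token at
    # index 0 (if one is set), followed by `num_oov_indices` OOV slots.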
    oov_start = self._oov_start_index()
    token_start = self._token_start_index()
    should_have_mask = (oov_start > 0)
    has_mask = should_have_mask and vocabulary[0] == self.mask_token

    should_have_oov = (self.num_oov_indices > 0)
    expected_oov = [self.oov_token] * self.num_oov_indices
    found_oov = vocabulary[oov_start:token_start]
    has_oov = should_have_oov and found_oov == expected_oov
    # If we get a numpy array, then has_oov may end up being a numpy array
    # instead of a bool. Fix this by collapsing the variable if it's not bool.
    if not isinstance(has_oov, bool):
      has_oov = all(has_oov)

    if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
      raise ValueError(
          "Invalid vocabulary format. The layer was created with "
          "`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
          "included in the provided vocabulary. The passed vocabulary has the "
          "correct mask token `{mask}` at index 0, but does not have the OOV "
          "token `{oov}` in indices [{start}:{end}]. Instead, we found "
          "`{found}`. Was this vocabulary generated by a layer with "
          "incompatible settings?".format(
              mask=self.mask_token,
              oov=self.oov_token,
              start=oov_start,
              end=token_start,
              found=found_oov))

    if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
      raise ValueError(
          "Invalid vocabulary format. The layer was created with "
          "`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
          "included in the provided vocabulary. The passed vocabulary has the "
          "correct OOV token `{oov}` at indices [{start}:{end}], but does not "
          "have the mask token `{mask}` in index 0. Instead, we found "
          "`{found}`. Was this vocabulary generated by a layer with "
          "incompatible settings?".format(
              mask=self.mask_token,
              oov=self.oov_token,
              start=oov_start,
              end=token_start,
              found=vocabulary[0]))

    found_special_tokens = has_oov or has_mask
    if found_special_tokens:
      tokens = vocabulary[token_start:]
    else:
      tokens = vocabulary

    repeated_tokens = table_utils.find_repeated_tokens(tokens)
    if repeated_tokens:
      raise ValueError("The passed vocabulary has at least one repeated "
                       "term. Please uniquify your dataset. The repeated terms "
                       "are {}".format(repeated_tokens))

    if self.mask_token in tokens:
      raise ValueError("Reserved mask token {} was found in the passed "
                       "vocabulary at index {}. Please either remove the "
                       "reserved token from the vocabulary or change the "
                       "mask token for this layer.".format(
                           self.mask_token, tokens.index(self.mask_token)))
    if self.oov_token in tokens:
      raise ValueError("Reserved OOV token {} was found in the passed "
                       "vocabulary at index {}. Please either remove the "
                       "reserved token from the vocabulary or change the "
                       "OOV token for this layer.".format(
                           self.oov_token, tokens.index(self.oov_token)))

    self._vocab_size = token_start + len(tokens)
    if self.max_tokens is not None and self._vocab_size > self.max_tokens:
      raise ValueError(
          "Attempted to set a vocabulary larger than the maximum vocab size. "
          "Passed vocab size is {}, max vocab size is {}.".format(
              self._vocab_size, self.max_tokens))

    if self.output_mode == TF_IDF:
      if idf_weights is None:
        raise ValueError("`idf_weights` must be set if output_mode is TF_IDF")
      if len(vocabulary) != len(idf_weights):
        raise ValueError("`idf_weights` must be the same length as vocabulary. "
                         "len(idf_weights) is {}, len(vocabulary) is {}".format(
                              len(idf_weights), len(vocabulary)))
      idf_weights = self._convert_to_ndarray(idf_weights)
      if idf_weights.ndim != 1:
        raise ValueError(
            "TF-IDF data must be a 1-dimensional array, but received "
            "an array of rank {}.".format(idf_weights.ndim))

    # We add the non-special vocab tokens and optionally the mask_token to our
    # hash table. OOV tokens are handled with the hash table default value and
    # not added directly.
    self._table_handler.clear()
    indices = np.arange(token_start, len(tokens) + token_start, dtype=np.int64)
    if self.invert:
      self._table_handler.insert(indices, tokens)
    else:
      self._table_handler.insert(tokens, indices)
    if self.mask_token is not None:
      self._table_handler.insert([self._mask_key], [self._mask_value])

    if self.output_mode == TF_IDF:
      # If the passed vocabulary has no special tokens, we need to pad the front
      # of idf_weights. We don't have real document frequencies for these tokens
      # so we will use an average of all idf_weights passed in as a reasonable
      # default.
      if found_special_tokens:
        front_padding = 0
        front_padding_value = 0
      else:
        front_padding = token_start
        front_padding_value = np.average(idf_weights)
      # If pad_to_max_tokens is true, and max_tokens is greater than our total
      # vocab size, we need to pad the back of idf_weights with zeros as well.
      back_padding_value = 0
      if self.pad_to_max_tokens and self.max_tokens is not None:
        back_padding = self.max_tokens - front_padding - len(idf_weights)
      else:
        back_padding = 0
      idf_weights = np.pad(
          idf_weights, (front_padding, back_padding),
          "constant",
          constant_values=(front_padding_value, back_padding_value))
      backend.set_value(self.tf_idf_weights, idf_weights)
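
The TF_IDF branch at the end of both examples pads `idf_weights` so it lines
up with the full output width. The following standalone numpy sketch uses
made-up values (none are from the source) to show the front padding with the
average idf weight for the special-token slots and the zero back padding
applied when `pad_to_max_tokens` is in effect.

import numpy as np

idf_weights = np.array([0.4, 1.2, 0.8])  # weights for the real tokens only
token_start = 2                          # e.g. one mask slot + one OOV slot
max_tokens = 8                           # pad_to_max_tokens=True target width

# No real document frequencies exist for the special tokens, so the front
# padding falls back to the average of the supplied weights.
front_padding = token_start
front_padding_value = np.average(idf_weights)
back_padding = max_tokens - front_padding - len(idf_weights)

padded = np.pad(idf_weights, (front_padding, back_padding),
                "constant", constant_values=(front_padding_value, 0))
print(padded)  # [0.8 0.8 0.4 1.2 0.8 0.  0.  0. ]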