Example #1
    def _set_inverse_vocabulary(self, vocab):
        """Sets vocabulary data for this layer when inverse is True."""
        table_utils.validate_vocabulary_is_unique(vocab)

        should_have_mask = self.mask_token is not None
        has_mask = vocab[0] == self.mask_token

        insert_special_tokens = should_have_mask and not has_mask
        special_tokens = [] if self.mask_token is None else [self.mask_token]

        num_special_tokens = len(special_tokens)
        tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
        if self.mask_token in tokens:
            raise ValueError(
                "Reserved mask token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "mask token for this layer." %
                (self.mask_token, tokens.index(self.mask_token)))

        if insert_special_tokens:
            total_vocab_size = len(vocab) + num_special_tokens
        else:
            total_vocab_size = len(vocab)
        if self.max_tokens is not None and total_vocab_size > self.max_tokens:
            raise ValueError(
                "Attempted to set a vocabulary larger than the maximum vocab size. "
                "Passed vocab size is %s, max vocab size is %s." %
                (total_vocab_size, self.max_tokens))

        start_index = num_special_tokens if insert_special_tokens else 0
        values = np.arange(start_index,
                           len(vocab) + start_index,
                           dtype=np.int64)

        self._table_handler.clear()
        self._table_handler.insert(values, vocab)

        if insert_special_tokens and num_special_tokens > 0:
            special_token_values = np.arange(num_special_tokens,
                                             dtype=np.int64)
            self._table_handler.insert(special_token_values, special_tokens)
        return total_vocab_size
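
The example above is the inverse (index-to-token) path: when the passed vocabulary does not already start with the mask token, all index values are shifted up so that index 0 stays reserved for it. A minimal sketch of that assignment logic, using a plain dict in place of the layer's table handler (the vocabulary and mask token below are made-up values):

import numpy as np

def build_inverse_table(vocab, mask_token=None):
    """Toy stand-in for _set_inverse_vocabulary: maps indices to tokens."""
    has_mask = mask_token is not None and vocab[0] == mask_token
    insert_special = mask_token is not None and not has_mask
    num_special = 0 if mask_token is None else 1

    # Shift the indices only when the mask token has to be inserted in front.
    start = num_special if insert_special else 0
    table = {int(i): token
             for i, token in zip(np.arange(start, len(vocab) + start,
                                           dtype=np.int64), vocab)}
    if insert_special:
        table[0] = mask_token  # the reserved mask slot stays at index 0
    return table

print(build_inverse_table(["a", "b", "c"], mask_token=""))
# {1: 'a', 2: 'b', 3: 'c', 0: ''}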
Example #2
    def _set_forward_vocabulary(self, vocab, idf_weights=None):
        """Sets vocabulary data for this layer when inverse is False."""
        table_utils.validate_vocabulary_is_unique(vocab)

        should_have_mask = self.mask_token is not None
        has_mask = vocab[0] == self.mask_token
        oov_start = 1 if should_have_mask else 0

        should_have_oov = (self.num_oov_indices > 0) and not self.invert
        if should_have_oov:
            oov_end = oov_start + self.num_oov_indices
            expected_oov = [self.oov_token] * self.num_oov_indices
            has_oov = vocab[oov_start:oov_end] == expected_oov
            # If we get a numpy array, then has_oov may end up being a numpy array
            # instead of a bool. Fix this by collapsing the variable if it's not bool.
            if not isinstance(has_oov, bool):
                has_oov = any(has_oov)
        else:
            has_oov = False

        if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
            raise ValueError(
                "Invalid vocabulary format. The layer was created with "
                "`mask_token=%s` and `oov_token=%s`. These tokens should"
                " be included in the provided vocabulary. "
                "The passed vocabulary has the correct mask token `%s` "
                "at index 0, but does not have the OOV token `%s` in "
                "indices [%s:%s]. Instead, we found `%s`. Was this "
                "vocabulary generated by a layer with incompatible "
                "settings?" %
                (self.mask_token, self.oov_token, self.mask_token,
                 self.oov_token, oov_start, oov_end, vocab[oov_start:oov_end]))

        if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
            raise ValueError(
                "Invalid vocabulary format. The layer was created with "
                "`mask_token=%s` and `oov_token=%s`. These tokens should "
                "be included in the provided vocabulary. "
                "The passed vocabulary has the correct OOV token `%s` at "
                "indices [%s:%s], but does not have the mask token `%s` in "
                "index 0. Instead, we found `%s`. Was this vocabulary "
                "generated by a layer with incompatible settings?" %
                (self.mask_token, self.oov_token, self.oov_token, oov_start,
                 oov_end, self.mask_token, vocab[0]))

        special_tokens = [] if self.mask_token is None else [self.mask_token]
        special_tokens.extend([self.oov_token] * self.num_oov_indices)

        insert_special_tokens = special_tokens and not has_oov and not has_mask
        num_special_tokens = len(special_tokens)
        tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
        if self.mask_token in tokens:
            raise ValueError(
                "Reserved mask token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "mask token for this layer." %
                (self.mask_token, tokens.index(self.mask_token)))
        if self.oov_token in tokens:
            raise ValueError(
                "Reserved OOV token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "OOV token for this layer." %
                (self.oov_token, tokens.index(self.oov_token)))

        total_vocab_size = len(tokens) + num_special_tokens
        if self.max_tokens is not None and total_vocab_size > self.max_tokens:
            raise ValueError(
                "Attempted to set a vocabulary larger than the maximum vocab size. "
                "Passed vocab size is %s, max vocab size is %s." %
                (total_vocab_size, self.max_tokens))

        self._table_handler.clear()
        if insert_special_tokens:
            start_index = num_special_tokens
            values = np.arange(start_index,
                               len(tokens) + start_index,
                               dtype=np.int64)
            self._table_handler.insert(tokens, values)
            special_token_values = np.arange(num_special_tokens,
                                             dtype=np.int64)
            self._table_handler.insert(special_tokens, special_token_values)
        else:
            values = np.arange(len(vocab), dtype=np.int64)
            self._table_handler.insert(vocab, values)

        if self.output_mode == TFIDF:
            if idf_weights is None:
                raise ValueError(
                    "idf_weights must be set if output_mode is TFIDF")
            if len(vocab) != len(idf_weights):
                raise ValueError(
                    "idf_weights must be the same length as vocab. "
                    "len(idf_weights) is %s, len(vocab) is %s" %
                    (len(idf_weights), len(vocab)))
            idf_weights = self._convert_to_ndarray(idf_weights)
            if idf_weights.ndim != 1:
                raise ValueError(
                    "TF-IDF data must be a 1-dimensional array, but received "
                    "an array of shape {}".format(idf_weights.shape))

            # If we inserted special tokens into the vocab, we need to pad the front
            # of idf_weights. We don't have real document frequencies for these tokens
            # so we will use an average of all idf_weights passed in as a reasonable
            # default.
            if insert_special_tokens:
                front_padding = num_special_tokens
                front_padding_value = np.average(idf_weights)
            else:
                front_padding = 0
                front_padding_value = 0
            # If pad_to_max_tokens is true, and max_tokens is greater than our total
            # vocab size, we need to pad the back of idf_weights with zeros as well.
            back_padding_value = 0
            if self.pad_to_max_tokens and self.max_tokens is not None:
                back_padding = self.max_tokens - total_vocab_size
            else:
                back_padding = 0
            idf_weights = np.pad(idf_weights, (front_padding, back_padding),
                                 "constant",
                                 constant_values=(front_padding_value,
                                                  back_padding_value))
            K.set_value(self.tf_idf_weights, idf_weights)

        return total_vocab_size
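
The TF-IDF branch above pads the weight vector so it lines up with the final table layout: inserted special tokens get the average of the supplied weights in front (they have no real document frequency), and zeros are appended at the back when pad_to_max_tokens is set. A small sketch of just that padding step; the weights, token counts, and max_tokens below are illustrative values, not taken from the layer:

import numpy as np

def pad_idf_weights(idf_weights, num_special_tokens, insert_special_tokens,
                    pad_to_max_tokens=False, max_tokens=None,
                    total_vocab_size=None):
    # Front padding: average weight for inserted special tokens.
    idf_weights = np.asarray(idf_weights, dtype=np.float64)
    if insert_special_tokens:
        front, front_value = num_special_tokens, np.average(idf_weights)
    else:
        front, front_value = 0, 0
    # Back padding: zeros out to max_tokens when pad_to_max_tokens is set.
    back = (max_tokens - total_vocab_size
            if pad_to_max_tokens and max_tokens is not None else 0)
    return np.pad(idf_weights, (front, back), "constant",
                  constant_values=(front_value, 0))

# Three real tokens, mask + one OOV inserted in front, padded out to 8 slots.
print(pad_idf_weights([0.5, 0.25, 0.75], num_special_tokens=2,
                      insert_special_tokens=True, pad_to_max_tokens=True,
                      max_tokens=8, total_vocab_size=5))
# [0.5  0.5  0.5  0.25 0.75 0.   0.   0.  ]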
Example #3
  def _set_forward_vocabulary(self, vocab):
    """Sets vocabulary data for this layer when inverse is False."""
    table_utils.validate_vocabulary_is_unique(vocab)

    should_have_mask = self.mask_token is not None
    has_mask = vocab[0] == self.mask_token
    oov_start = 1 if should_have_mask else 0

    should_have_oov = (self.num_oov_indices > 0) and not self.invert
    if should_have_oov:
      oov_end = oov_start + self.num_oov_indices
      expected_oov = [self.oov_token] * self.num_oov_indices
      has_oov = vocab[oov_start:oov_end] == expected_oov
      # If we get a numpy array, then has_oov may end up being a numpy array
      # instead of a bool. Fix this by collapsing the variable if it's not bool.
      if not isinstance(has_oov, bool):
        has_oov = any(has_oov)
    else:
      has_oov = False

    if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
      raise ValueError("Invalid vocabulary format. The layer was created with "
                       "`mask_token=%s` and `oov_token=%s`. These tokens should"
                       " be included in the provided vocabulary. "
                       "The passed vocabulary has the correct mask token `%s` "
                       "at index 0, but does not have the OOV token `%s` in "
                       "indices [%s:%s]. Instead, we found `%s`. Was this "
                       "vocabulary generated by a layer with incompatible "
                       "settings?" %
                       (self.mask_token, self.oov_token,
                        self.mask_token, self.oov_token, oov_start, oov_end,
                        vocab[oov_start:oov_end]))

    if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
      raise ValueError(
          "Invalid vocabulary format. The layer was created with "
          "`mask_token=%s` and `oov_token=%s`. These tokens should "
          "be included in the provided vocabulary. "
          "The passed vocabulary has the correct OOV token `%s` at "
          "indices [%s:%s], but does not have the mask token `%s` in "
          "index 0. Instead, we found `%s`. Was this vocabulary "
          "generated by a layer with incompatible settings?" %
          (self.mask_token, self.oov_token, self.oov_token,
           oov_start, oov_end, self.mask_token, vocab[0]))

    insert_special_tokens = not has_oov and not has_mask

    special_tokens = [] if self.mask_token is None else [self.mask_token]
    special_tokens.extend([self.oov_token] * self.num_oov_indices)

    num_special_tokens = len(special_tokens)
    tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
    if self.mask_token in tokens:
      raise ValueError("Reserved mask token %s was found in the passed "
                       "vocabulary at index %s. Please either remove the "
                       "reserved token from the vocabulary or change the "
                       "mask token for this layer." %
                       (self.mask_token, tokens.index(self.mask_token)))
    if self.oov_token in tokens:
      raise ValueError("Reserved OOV token %s was found in the passed "
                       "vocabulary at index %s. Please either remove the "
                       "reserved token from the vocabulary or change the "
                       "OOV token for this layer." %
                       (self.oov_token, tokens.index(self.oov_token)))

    if insert_special_tokens:
      total_vocab_size = len(vocab) + num_special_tokens
    else:
      total_vocab_size = len(vocab)
    if self.max_tokens is not None and total_vocab_size > self.max_tokens:
      raise ValueError(
          "Attempted to set a vocabulary larger than the maximum vocab size. "
          "Passed vocab size is %s, max vocab size is %s." %
          (total_vocab_size, self.max_tokens))

    start_index = num_special_tokens if insert_special_tokens else 0
    values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)

    self._table_handler.clear()
    self._table_handler.insert(vocab, values)

    if insert_special_tokens and num_special_tokens > 0:
      special_token_values = np.arange(num_special_tokens, dtype=np.int64)
      self._table_handler.insert(special_tokens, special_token_values)
    return total_vocab_size
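
In the forward direction the table maps tokens to indices instead, with the mask token at index 0 and the OOV token(s) immediately after it whenever they have to be inserted in front of the passed vocabulary. A minimal sketch with a dict standing in for the table handler; the tokens and settings are made up and assume the vocabulary does not already contain the special tokens:

import numpy as np

def build_forward_table(vocab, mask_token=None, oov_token="[UNK]",
                        num_oov_indices=1):
    """Toy stand-in for _set_forward_vocabulary: maps tokens to indices."""
    # Special tokens (mask, then OOV slots) occupy the lowest indices.
    special_tokens = ([] if mask_token is None else [mask_token])
    special_tokens += [oov_token] * num_oov_indices
    num_special = len(special_tokens)

    table = {token: i for i, token in enumerate(special_tokens)}
    # Real tokens are shifted up so they start right after the special tokens.
    for value, token in zip(
            np.arange(num_special, len(vocab) + num_special, dtype=np.int64),
            vocab):
        table[token] = int(value)
    return table

print(build_forward_table(["cat", "dog", "fish"], mask_token=""))
# {'': 0, '[UNK]': 1, 'cat': 2, 'dog': 3, 'fish': 4}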