Code Example #1
    def _set_inverse_vocabulary(self, vocab):
        """Sets vocabulary data for this layer when inverse is True."""
        table_utils.validate_vocabulary_is_unique(vocab)

        should_have_mask = self.mask_token is not None
        has_mask = vocab[0] == self.mask_token

        insert_special_tokens = should_have_mask and not has_mask
        special_tokens = [] if self.mask_token is None else [self.mask_token]

        num_special_tokens = len(special_tokens)
        tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
        if self.mask_token in tokens:
            raise ValueError(
                "Reserved mask token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "mask token for this layer." %
                (self.mask_token, tokens.index(self.mask_token)))

        if insert_special_tokens:
            total_vocab_size = len(vocab) + num_special_tokens
        else:
            total_vocab_size = len(vocab)
        if self.max_tokens is not None and total_vocab_size > self.max_tokens:
            raise ValueError(
                "Attempted to set a vocabulary larger than the maximum vocab size. "
                "Passed vocab size is %s, max vocab size is %s." %
                (total_vocab_size, self.max_tokens))

        start_index = num_special_tokens if insert_special_tokens else 0
        values = np.arange(start_index,
                           len(vocab) + start_index,
                           dtype=np.int64)

        self._table_handler.clear()
        self._table_handler.insert(values, vocab)

        if insert_special_tokens and num_special_tokens > 0:
            special_token_values = np.arange(num_special_tokens,
                                             dtype=np.int64)
            self._table_handler.insert(special_token_values, special_tokens)
        return total_vocab_size
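
A rough way to see what this inverse table ends up holding is to replay the same index/token pairing with numpy and a plain dict standing in for the layer's MutableHashTable. The helper name `build_inverse_table` and the dict are purely illustrative and not part of the snippet above.

import numpy as np

def build_inverse_table(vocab, mask_token=None):
    # Mirror of the insert calls above: the table maps index -> token.
    special_tokens = [] if mask_token is None else [mask_token]
    insert_special_tokens = bool(special_tokens) and vocab[0] != mask_token
    start_index = len(special_tokens) if insert_special_tokens else 0
    values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)
    table = {int(v): t for v, t in zip(values, vocab)}
    if insert_special_tokens:
        # Special tokens occupy the leading indices, exactly as in the snippet.
        table.update({i: t for i, t in enumerate(special_tokens)})
    return table

print(build_inverse_table(["cat", "dog"], mask_token=""))
# -> {1: 'cat', 2: 'dog', 0: ''}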
Code Example #2
    def __init__(self,
                 max_tokens=None,
                 num_oov_tokens=1,
                 vocabulary=None,
                 reserve_zero=True,
                 mask_zero=False,
                 **kwargs):
        invert = False
        if invert:
            allowed_dtypes = [dtypes.int32, dtypes.int64]
        else:
            allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]

        if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
            raise ValueError("TextVectorization may only have a dtype in %s." %
                             allowed_dtypes)

        if "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.int64 if invert else dtypes.string

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, max_tokens must be greater than 1.")

        if num_oov_tokens < 0:
            raise ValueError(
                "num_oov_tokens must be greater than 0. You passed %s" %
                num_oov_tokens)

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_tokens = num_oov_tokens
        self.reserve_zero = reserve_zero
        self.mask_zero = mask_zero

        # We need to reserve at least num_oov_tokens tokens, plus one additional
        # value if we are reserving the zero value in our output.
        if reserve_zero:
            self._reserved_values = (num_oov_tokens + 1)
        else:
            self._reserved_values = num_oov_tokens

        # We need to account for the OOV buckets in our vocabulary size.
        if max_tokens is not None:
            self._max_elements = max_tokens - num_oov_tokens
        else:
            self._max_elements = None

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need to
        # do a further hashing step; to make this easier, we set the OOV value to
        # -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_tokens == 1:
            self._oov_value = 1 if reserve_zero else 0
        else:
            self._oov_value = -1

        super(IndexLookup,
              self).__init__(combiner=_IndexLookupCombiner(self.max_tokens),
                             **kwargs)

        # If the layer's input type is int32, we can only output int32 values -
        # MutableHashTable doesn't allow us to map int32->int64.
        if self.dtype == dtypes.int32:
            self._output_dtype = dtypes.int32
        else:
            self._output_dtype = dtypes.int64
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self.dtype,
            value_dtype=self._output_dtype,
            default_value=self._oov_value,
            name=(self._name + "_index_table"))
        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        if self.num_oov_tokens <= 1:
            oov_tokens = None
        else:
            oov_start = 1 if reserve_zero else 0
            oov_tokens = list(range(oov_start, self._reserved_values))

        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_tokens,
            use_v1_apis=self._use_v1_apis())

        if vocabulary is not None:
            if isinstance(vocabulary, str):
                vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
            table_utils.validate_vocabulary_is_unique(vocabulary)

            self.set_vocabulary(vocabulary)
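
Only the bookkeeping at the top of this constructor decides where real vocabulary entries start. Below is a minimal sketch of just that arithmetic, assuming nothing beyond plain Python; the helper name `reserved_layout` is illustrative.

def reserved_layout(num_oov_tokens, reserve_zero=True):
    # Index 0 is optionally held back, then num_oov_tokens OOV slots follow.
    reserved_values = num_oov_tokens + 1 if reserve_zero else num_oov_tokens
    # With a single OOV bucket its index is fixed; with several, -1 signals
    # that a further hashing step is needed, as the comment above explains.
    if num_oov_tokens == 1:
        oov_value = 1 if reserve_zero else 0
    else:
        oov_value = -1
    return reserved_values, oov_value

print(reserved_layout(1))         # (2, 1): index 0 reserved, index 1 is OOV
print(reserved_layout(3, False))  # (3, -1): three OOV slots, hashed later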
Code Example #3
    def _set_forward_vocabulary(self, vocab):
        """Sets vocabulary data for this layer when inverse is False."""
        table_utils.validate_vocabulary_is_unique(vocab)

        should_have_mask = self.mask_token is not None
        has_mask = vocab[0] == self.mask_token
        oov_start = 1 if should_have_mask else 0

        should_have_oov = (self.num_oov_indices > 0) and not self.invert
        if should_have_oov:
            oov_end = oov_start + self.num_oov_indices
            expected_oov = [self.oov_token] * self.num_oov_indices
            has_oov = vocab[oov_start:oov_end] == expected_oov
            # If we get a numpy array, then has_oov may end up being a numpy array
            # instead of a bool. Fix this by collapsing the variable if it's not bool.
            if not isinstance(has_oov, bool):
                has_oov = any(has_oov)
        else:
            has_oov = False

        if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
            raise ValueError(
                "Invalid vocabulary format. The layer was created with "
                "`mask_token=%s` and `oov_token=%s`. These tokens should"
                " be included in the provided vocabulary. "
                "The passed vocabulary has the correct mask token `%s` "
                "at index 0, but does not have the OOV token `%s` in "
                "indices [%s:%s]. Instead, we found `%s`. Was this "
                "vocabulary generated by a layer with incompatible "
                "settings?" %
                (self.mask_token, self.oov_token, self.mask_token,
                 self.oov_token, oov_start, oov_end, vocab[oov_start:oov_end]))

        if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
            raise ValueError(
                "Invalid vocabulary format. The layer was created with "
                "`mask_token=%s` and `oov_token=%s`. These tokens should "
                "be included in the provided vocabulary. "
                "The passed vocabulary has the correct OOV token `%s` at "
                "indices [%s:%s], but does not have the mask token `%s` in "
                "index 0. Instead, we found `%s`. Was this vocabulary "
                "generated by a layer with incompatible settings?" %
                (self.mask_token, self.oov_token, self.oov_token, oov_start,
                 oov_end, self.mask_token, vocab[0]))

        insert_special_tokens = not has_oov and not has_mask

        special_tokens = [] if self.mask_token is None else [self.mask_token]
        special_tokens.extend([self.oov_token] * self.num_oov_indices)

        num_special_tokens = len(special_tokens)
        tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
        if self.mask_token in tokens:
            raise ValueError(
                "Reserved mask token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "mask token for this layer." %
                (self.mask_token, tokens.index(self.mask_token)))
        if self.oov_token in tokens:
            raise ValueError(
                "Reserved OOV token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "OOV token for this layer." %
                (self.oov_token, tokens.index(self.oov_token)))

        if insert_special_tokens:
            total_vocab_size = len(vocab) + num_special_tokens
        else:
            total_vocab_size = len(vocab)
        if self.max_tokens is not None and total_vocab_size > self.max_tokens:
            raise ValueError(
                "Attempted to set a vocabulary larger than the maximum vocab size. "
                "Passed vocab size is %s, max vocab size is %s." %
                (total_vocab_size, self.max_tokens))

        start_index = num_special_tokens
        values = np.arange(start_index,
                           len(vocab) + start_index,
                           dtype=np.int64)

        self._table_handler.clear()
        self._table_handler.insert(vocab, values)

        if insert_special_tokens and num_special_tokens > 0:
            special_token_values = np.arange(num_special_tokens,
                                             dtype=np.int64)
            self._table_handler.insert(special_tokens, special_token_values)
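
The forward path does the opposite of code example #1: tokens map to indices, with the mask and OOV tokens prepended when the vocabulary does not already carry them. A dict-based sketch of that layout follows; the `""` mask and `"[UNK]"` OOV defaults are just example values, and the real layer uses a mutable hash table rather than a dict.

import numpy as np

def build_forward_table(vocab, mask_token="", oov_token="[UNK]", num_oov_indices=1):
    special_tokens = [] if mask_token is None else [mask_token]
    special_tokens += [oov_token] * num_oov_indices
    # Special tokens take the leading indices, then the vocab follows.
    table = {token: index for index, token in enumerate(special_tokens)}
    start = len(special_tokens)
    values = np.arange(start, start + len(vocab), dtype=np.int64)
    table.update({token: int(value) for token, value in zip(vocab, values)})
    return table

print(build_forward_table(["cat", "dog"]))
# -> {'': 0, '[UNK]': 1, 'cat': 2, 'dog': 3}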
Code Example #4
  def _set_forward_vocabulary(self, vocab, idf_weights=None):
    """Sets vocabulary data for this layer when inverse is False."""
    table_utils.validate_vocabulary_is_unique(vocab)

    should_have_mask = self.mask_token is not None
    has_mask = vocab[0] == self.mask_token
    oov_start = 1 if should_have_mask else 0

    should_have_oov = (self.num_oov_indices > 0) and not self.invert
    if should_have_oov:
      oov_end = oov_start + self.num_oov_indices
      expected_oov = [self.oov_token] * self.num_oov_indices
      has_oov = vocab[oov_start:oov_end] == expected_oov
      # If we get a numpy array, then has_oov may end up being a numpy array
      # instead of a bool. Fix this by collapsing the variable if it's not bool.
      if not isinstance(has_oov, bool):
        has_oov = any(has_oov)
    else:
      has_oov = False

    if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
      raise ValueError("Invalid vocabulary format. The layer was created with "
                       "`mask_token=%s` and `oov_token=%s`. These tokens should"
                       " be included in the provided vocabulary. "
                       "The passed vocabulary has the correct mask token `%s` "
                       "at index 0, but does not have the OOV token `%s` in "
                       "indices [%s:%s]. Instead, we found `%s`. Was this "
                       "vocabulary generated by a layer with incompatible "
                       "settings?" %
                       (self.mask_token, self.oov_token,
                        self.mask_token, self.oov_token, oov_start, oov_end,
                        vocab[oov_start:oov_end]))

    if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
      raise ValueError(
          "Invalid vocabulary format. The layer was created with "
          "`mask_token=%s` and `oov_token=%s`. These tokens should "
          "be included in the provided vocabulary. "
          "The passed vocabulary has the correct OOV token `%s` at "
          "indices [%s:%s], but does not have the mask token `%s` in "
          "index 0. Instead, we found `%s`. Was this vocabulary "
          "generated by a layer with incompatible settings?" %
          (self.mask_token, self.oov_token, self.oov_token,
           oov_start, oov_end, self.mask_token, vocab[0]))

    special_tokens = [] if self.mask_token is None else [self.mask_token]
    special_tokens.extend([self.oov_token] * self.num_oov_indices)

    insert_special_tokens = special_tokens and not has_oov and not has_mask
    num_special_tokens = len(special_tokens)
    tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
    if self.mask_token in tokens:
      raise ValueError("Reserved mask token %s was found in the passed "
                       "vocabulary at index %s. Please either remove the "
                       "reserved token from the vocabulary or change the "
                       "mask token for this layer." %
                       (self.mask_token, tokens.index(self.mask_token)))
    if self.oov_token in tokens:
      raise ValueError("Reserved OOV token %s was found in the passed "
                       "vocabulary at index %s. Please either remove the "
                       "reserved token from the vocabulary or change the "
                       "OOV token for this layer." %
                       (self.oov_token, tokens.index(self.oov_token)))

    total_vocab_size = len(tokens) + num_special_tokens
    if self.max_tokens is not None and total_vocab_size > self.max_tokens:
      raise ValueError(
          "Attempted to set a vocabulary larger than the maximum vocab size. "
          "Passed vocab size is %s, max vocab size is %s." %
          (total_vocab_size, self.max_tokens))

    self._table_handler.clear()
    if insert_special_tokens:
      start_index = num_special_tokens
      values = np.arange(start_index, len(tokens) + start_index, dtype=np.int64)
      self._table_handler.insert(tokens, values)
      special_token_values = np.arange(num_special_tokens, dtype=np.int64)
      self._table_handler.insert(special_tokens, special_token_values)
    else:
      values = np.arange(len(vocab), dtype=np.int64)
      self._table_handler.insert(vocab, values)

    if self.output_mode == TFIDF:
      if idf_weights is None:
        raise ValueError("idf_weights must be set if output_mode is TFIDF")
      if len(vocab) != len(idf_weights):
        raise ValueError("idf_weights must be the same length as vocab. "
                         "len(idf_weights) is %s, len(vocab) is %s" %
                         (len(vocab), len(idf_weights)))
      idf_weights = self._convert_to_ndarray(idf_weights)
      if idf_weights.ndim != 1:
        raise ValueError(
            "TF-IDF data must be a 1-index array, but received {}".format(
                type(idf_weights)))

      # If we inserted special tokens into the vocab, we need to pad the front
      # of idf_weights. We don't have real document frequencies for these tokens
      # so we will use an average of all idf_weights passed in as a reasonable
      # default.
      if insert_special_tokens:
        front_padding = num_special_tokens
        front_padding_value = np.average(idf_weights)
      else:
        front_padding = 0
        front_padding_value = 0
      # If pad_to_max_tokens is true, and max_tokens is greater than our total
      # vocab size, we need to pad the back of idf_weights with zeros as well.
      back_padding_value = 0
      if self.pad_to_max_tokens and self.max_tokens is not None:
        back_padding = self.max_tokens - total_vocab_size
      else:
        back_padding = 0
      idf_weights = np.pad(
          idf_weights, (front_padding, back_padding),
          "constant",
          constant_values=(front_padding_value, back_padding_value))
      K.set_value(self.tf_idf_weights, idf_weights)

    return total_vocab_size
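
The TF-IDF branch above pads the weight vector on both sides: the front gets an averaged weight for any inserted special tokens (no real document frequencies exist for them) and the back gets zeros up to `max_tokens`. A numpy-only sketch of just that padding step; the concrete numbers are an illustration, not values from the snippet.

import numpy as np

idf_weights = np.array([1.2, 0.8, 0.5])    # weights for the real vocab entries
num_special_tokens = 2                     # e.g. mask token + one OOV slot
front_padding_value = np.average(idf_weights)
back_padding = 2                           # room left when padding to max_tokens

padded = np.pad(
    idf_weights, (num_special_tokens, back_padding),
    "constant",
    constant_values=(front_padding_value, 0))
print(padded)  # [0.8333 0.8333 1.2 0.8 0.5 0. 0.] (approximately)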
Code Example #5
File: index_lookup.py  Project: wyt97/tensorflow
    def set_vocabulary(self, vocab):
        """Sets vocabulary (and optionally document frequency) data for this layer.

    This method sets the vocabulary for this layer directly, instead of
    analyzing a dataset through 'adapt'. It should be used whenever the vocab
    information is already known. If vocabulary data is already present in the
    layer, this method will either replace it

    Arguments:
      vocab: An array of string tokens.

    Raises:
      ValueError: If there are too many inputs, the inputs do not match, or
        input data is missing.
    """

        table_utils.validate_vocabulary_is_unique(vocab)

        should_have_mask = self.mask_token is not None
        if should_have_mask:
            has_mask = vocab[0] == self.mask_token
            oov_start = 1
        else:
            has_mask = False
            oov_start = 0

        should_have_oov = self.num_oov_indices > 0
        if should_have_oov:
            oov_end = oov_start + self.num_oov_indices
            expected_oov = [self.oov_token] * self.num_oov_indices
            has_oov = vocab[oov_start:oov_end] == expected_oov
            # If we get a numpy array, then has_oov may end up being a numpy array
            # instead of a bool. Fix this by collapsing the variable if it's not bool.
            if not isinstance(has_oov, bool):
                has_oov = any(has_oov)
        else:
            has_oov = False

        if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
            raise ValueError(
                "The passed vocabulary has the correct mask token `%s` "
                "at index 0, but does not have the OOV token `%s` in "
                "indices [%s:%s]. Instead, we found `%s`. Was this "
                "vocabulary generated by a layer with incompatible "
                "settings?" % (self.mask_token, self.oov_token, oov_start,
                               oov_end, vocab[oov_start:oov_end]))

        if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
            raise ValueError(
                "The passed vocabulary has the correct OOV token `%s` at "
                "indices [%s:%s], but does not have the mask token `%s` in "
                "index 0. Instead, we found `%s`. Was this vocabulary "
                "generated by a layer with incompatible settings?" %
                (self.oov_token, oov_start, oov_end, self.mask_token,
                 vocab[0]))

        insert_special_tokens = not has_oov and not has_mask

        special_tokens = [] if self.mask_token is None else [self.mask_token]
        special_tokens.extend([self.oov_token] * self.num_oov_indices)

        num_special_tokens = len(special_tokens)
        tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
        if self.mask_token in tokens:
            raise ValueError(
                "Reserved mask token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "mask token for this layer." %
                (self.mask_token, tokens.index(self.mask_token)))
        if self.oov_token in tokens:
            raise ValueError(
                "Reserved OOV token %s was found in the passed "
                "vocabulary at index %s. Please either remove the "
                "reserved token from the vocabulary or change the "
                "OOV token for this layer." %
                (self.oov_token, tokens.index(self.oov_token)))

        if insert_special_tokens:
            total_vocab_size = len(vocab) + num_special_tokens
        else:
            total_vocab_size = len(vocab)
        if self.max_tokens is not None and total_vocab_size > self.max_tokens:
            raise ValueError(
                "Attempted to set a vocabulary larger than the maximum vocab size. "
                "Passed vocab size is %s, max vocab size is %s." %
                (total_vocab_size, self.max_tokens))

        start_index = num_special_tokens
        values = np.arange(start_index,
                           len(vocab) + start_index,
                           dtype=np.int64)

        self._table_handler.clear()
        self._table_handler.insert(vocab, values)

        if insert_special_tokens and num_special_tokens > 0:
            special_token_values = np.arange(num_special_tokens,
                                             dtype=np.int64)
            self._table_handler.insert(special_tokens, special_token_values)
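
For context, the public lookup layers built on this internal IndexLookup class (e.g. tf.keras.layers.experimental.preprocessing.StringLookup in the TF 2.x releases these snippets come from) expose the same behavior through set_vocabulary: special tokens are prepended automatically and out-of-vocabulary inputs fall into the OOV index. Exact defaults and module paths vary between TF versions, so treat this as a rough usage sketch rather than version-exact code.

import tensorflow as tf

layer = tf.keras.layers.experimental.preprocessing.StringLookup()
layer.set_vocabulary(["cat", "dog"])        # mask/OOV tokens are prepended
print(layer.get_vocabulary())               # e.g. ['', '[UNK]', 'cat', 'dog']
print(layer(tf.constant(["dog", "fish"])))  # 'fish' maps to the OOV index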