Code example #1
  def call(self, inputs, count_weights=None):
    inputs = utils.ensure_tensor(inputs)

    if count_weights is not None:
      if self.output_mode != COUNT:
        raise ValueError(
            "`count_weights` is not used when `output_mode` is not `'count'`. "
            "Received `count_weights={}`.".format(count_weights))
      count_weights = utils.ensure_tensor(count_weights, self.compute_dtype)

    depth = self.num_tokens
    if isinstance(inputs, tf.SparseTensor):
      max_value = tf.reduce_max(inputs.values)
      min_value = tf.reduce_min(inputs.values)
    else:
      max_value = tf.reduce_max(inputs)
      min_value = tf.reduce_min(inputs)
    condition = tf.logical_and(
        tf.greater(tf.cast(depth, max_value.dtype), max_value),
        tf.greater_equal(min_value, tf.cast(0, min_value.dtype)))
    assertion = tf.Assert(condition, [
        "Input values must be in the range 0 <= values < num_tokens"
        " with num_tokens={}".format(depth)
    ])
    with tf.control_dependencies([assertion]):
      return utils.encode_categorical_inputs(
          inputs,
          output_mode=self.output_mode,
          depth=depth,
          dtype=self.compute_dtype,
          sparse=self.sparse,
          count_weights=count_weights)
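This appears to be the call method of the CategoryEncoding layer. A minimal usage sketch, assuming TensorFlow 2.x and the public tf.keras.layers.CategoryEncoding API:

import tensorflow as tf

# Each output row counts how often each token id in [0, num_tokens) occurs.
layer = tf.keras.layers.CategoryEncoding(num_tokens=4, output_mode="count")
print(layer([[0, 1], [0, 0], [1, 2], [3, 1]]))

# With output_mode="count", count_weights lets each occurrence contribute a
# custom weight instead of 1; any other output_mode raises the ValueError above.
weights = [[0.1, 0.2], [0.1, 0.1], [0.2, 0.3], [0.4, 0.2]]
print(layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=weights))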
Code example #2
    def call(self, inputs):
        # Convert all inputs to tensors and check shape. This layer only
        # supports scalars and batches of scalars for the initial version.
        self._check_at_least_two_inputs(inputs)
        inputs = [utils.ensure_tensor(x) for x in inputs]
        self._check_input_shape_and_type(inputs)

        # Uprank to rank 2 for the cross_hashed op. `rank` is captured before
        # expanding, so rank-0 (scalar) inputs pass both checks and are
        # expanded twice.
        rank = inputs[0].shape.rank
        if rank < 2:
            inputs = [utils.expand_dims(x, -1) for x in inputs]
        if rank < 1:
            inputs = [utils.expand_dims(x, -1) for x in inputs]

        # Perform the cross and convert to dense
        outputs = tf.sparse.cross_hashed(inputs, self.num_bins)
        outputs = tf.sparse.to_dense(outputs)

        # Fix output shape and downrank to match input rank.
        if rank == 2:
            # tf.sparse.cross_hashed output shape will always be None on the last
            # dimension. Given our input shape restrictions, we want to force shape 1
            # instead.
            outputs = tf.reshape(outputs, [-1, 1])
        elif rank == 1:
            outputs = tf.reshape(outputs, [-1])
        elif rank == 0:
            outputs = tf.reshape(outputs, [])

        # Encode outputs.
        return utils.encode_categorical_inputs(outputs,
                                               output_mode=self.output_mode,
                                               depth=self.num_bins,
                                               sparse=self.sparse,
                                               dtype=self.compute_dtype)
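This looks like HashedCrossing.call. A short usage sketch, assuming the layer is exposed as tf.keras.layers.HashedCrossing in recent TensorFlow releases:

import tensorflow as tf

# Cross two scalar-per-example features into num_bins hashed buckets.
layer = tf.keras.layers.HashedCrossing(num_bins=5)
feat1 = tf.constant(["A", "B", "A", "B", "A"])
feat2 = tf.constant([101, 101, 101, 102, 102])
print(layer((feat1, feat2)))  # rank-1 output, bin ids in [0, 5)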
Code example #3
    def update_state(self, data):
        if self._has_input_vocabulary:
            raise ValueError(
                "Cannot adapt {} layer after setting a static vocabulary via "
                "init argument "
                "or `set_vocabulary`.".format(self.__class__.__name__))

        data = utils.ensure_tensor(data, dtype=self.vocabulary_dtype)
        if data.shape.rank == 0:
            data = tf.expand_dims(data, 0)
        if data.shape.rank == 1:
            # Expand dims on axis 0 for tf-idf. A 1-d tensor is a single
            # document.
            data = tf.expand_dims(data, 0)

        # Accumulate this batch's counts into the running totals table.
        tokens, counts = self._num_tokens(data)
        self.token_counts.insert(tokens,
                                 counts + self.token_counts.lookup(tokens))

        if self.output_mode == TF_IDF:
            # Dedupe each row of our dataset.
            deduped_doc_data = tf.map_fn(lambda x: tf.unique(x)[0], data)
            # Flatten and count tokens.
            tokens, doc_counts = self._num_tokens(deduped_doc_data)
            self.token_document_counts.insert(
                tokens, doc_counts + self.token_document_counts.lookup(tokens))
            if tf_utils.is_ragged(data):
                self.num_documents.assign_add(data.nrows())
            else:
                self.num_documents.assign_add(
                    tf.shape(data, out_type=tf.int64)[0])
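update_state is normally driven by adapt(), which streams batches through it to accumulate token counts (and, for tf-idf, per-document counts). A minimal sketch, assuming this is the IndexLookup base class behind public layers such as tf.keras.layers.StringLookup:

import tensorflow as tf

# adapt() calls update_state() once per batch, then finalizes the vocabulary.
layer = tf.keras.layers.StringLookup(output_mode="tf_idf")
layer.adapt([["a", "b", "b"], ["b", "c", "c"]])
print(layer.get_vocabulary())  # OOV token first, then tokens by frequency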
Code example #4
    def call(self, inputs):
        self._maybe_freeze_vocab_size()

        inputs = utils.ensure_tensor(inputs, dtype=self._key_dtype)
        original_shape = inputs.shape
        # Some ops will not handle scalar input, so uprank to rank 1.
        if inputs.shape.rank == 0:
            inputs = self._expand_dims(inputs, -1)

        if tf_utils.is_sparse(inputs):
            lookups = tf.SparseTensor(inputs.indices,
                                      self._lookup_dense(inputs.values),
                                      inputs.dense_shape)
        elif tf_utils.is_ragged(inputs):
            lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs)
        else:
            lookups = self._lookup_dense(inputs)

        if self.output_mode == INT:
            # If we received a scalar input, downrank back to a scalar.
            if original_shape.rank == 0:
                lookups = tf.squeeze(lookups, -1)
            return lookups

        depth = (self.max_tokens
                 if self.pad_to_max_tokens else self._frozen_vocab_size)
        idf_weights = self.idf_weights_const if self.output_mode == TF_IDF else None
        return utils.encode_categorical_inputs(lookups,
                                               output_mode=self.output_mode,
                                               depth=depth,
                                               dtype=self.compute_dtype,
                                               sparse=self.sparse,
                                               idf_weights=idf_weights)
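A usage sketch of the two output paths in this call, assuming tf.keras.layers.StringLookup is a public subclass of the IndexLookup layer shown here:

import tensorflow as tf

vocab = ["a", "b", "c", "d"]

# INT output: a plain index lookup; index 0 is the default OOV slot.
layer = tf.keras.layers.StringLookup(vocabulary=vocab)
print(layer([["a", "c"], ["d", "z"]]))  # [[1, 3], [4, 0]]

# Any other output_mode routes through encode_categorical_inputs, with
# depth equal to the (frozen) vocabulary size.
mh = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="multi_hot")
print(mh([["a", "c"], ["d", "z"]]))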
Code example #5
    def _preprocess(self, inputs):
        inputs = utils.ensure_tensor(inputs, dtype=tf.string)
        if self._standardize in (LOWER, LOWER_AND_STRIP_PUNCTUATION):
            inputs = tf.strings.lower(inputs)
        if self._standardize in (
                STRIP_PUNCTUATION,
                LOWER_AND_STRIP_PUNCTUATION,
        ):
            inputs = tf.strings.regex_replace(inputs, DEFAULT_STRIP_REGEX, "")
        if callable(self._standardize):
            inputs = self._standardize(inputs)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension
            # 1 and so can be squeezed out. We do this here instead of after
            # splitting for performance reasons - it's more expensive to squeeze
            # a ragged tensor.
            if inputs.shape.rank > 1:
                if inputs.shape[-1] != 1:
                    raise ValueError(
                        "When using `TextVectorization` to tokenize strings, "
                        "the input rank must be 1 or the last shape dimension "
                        f"must be 1. Received: inputs.shape={inputs.shape} "
                        f"with rank={inputs.shape.rank}")
                else:
                    inputs = tf.squeeze(inputs, axis=-1)
            if self._split == WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips
                # leading and trailing whitespace.
                inputs = tf.strings.split(inputs)
            elif self._split == CHARACTER:
                inputs = tf.strings.unicode_split(inputs, "UTF-8")
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting."
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however,
        # does support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = tf.strings.ngrams(inputs,
                                       ngram_width=self._ngrams,
                                       separator=" ")

        return inputs
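The standardize, split, and ngrams options exercised by _preprocess are constructor arguments. A minimal sketch, assuming this is TextVectorization from tf.keras.layers:

import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=2,  # emit unigrams and bigrams
)
layer.adapt(["The quick brown fox.", "The lazy dog."])
print(layer(["The quick brown fox."]))  # int token ids after preprocessing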
Code example #6
File: hashing.py Project: huaxz1986/keras
    def call(self, inputs):
        inputs = utils.ensure_tensor(inputs)
        if isinstance(inputs, tf.SparseTensor):
            indices = tf.SparseTensor(indices=inputs.indices,
                                      values=self._hash_values_to_bins(
                                          inputs.values),
                                      dense_shape=inputs.dense_shape)
        else:
            indices = self._hash_values_to_bins(inputs)
        return utils.encode_categorical_inputs(indices,
                                               output_mode=self.output_mode,
                                               depth=self.num_bins,
                                               sparse=self.sparse,
                                               dtype=self.compute_dtype)
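A short usage sketch, assuming tf.keras.layers.Hashing (the output_mode argument requires a fairly recent TensorFlow/Keras version):

import tensorflow as tf

# Deterministic (unsalted) hashing of values into num_bins buckets.
layer = tf.keras.layers.Hashing(num_bins=3)
print(layer([["A"], ["B"], ["C"], ["D"], ["E"]]))  # bin ids in [0, 3)

# Non-int output modes route through encode_categorical_inputs.
one_hot = tf.keras.layers.Hashing(num_bins=3, output_mode="one_hot")
print(one_hot(["A", "B", "C"]))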