Example 1
    def call(self, inputs):
        """Look up `inputs` in the vocabulary and encode per `output_mode`."""
        # A vocabulary must exist before lookup (set directly or via adapt).
        if not self.max_tokens and not self._vocab_size:
            raise ValueError(
                "You must set the layer's vocabulary before calling it. "
                "Either pass a `vocabulary` argument to the layer, or "
                "call `layer.adapt(dataset)` with some sample data.")
        self._called = True
        # The table keys may be int64 while the caller passed int32.
        if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
            inputs = tf.cast(inputs, tf.int64)
        lookups = self._table_handler.lookup(inputs)

        if self.output_mode == INT:
            return lookups

        is_binary = self.output_mode == BINARY
        # Prefer the exact vocabulary size unless padding out to max_tokens.
        depth = (self._vocab_size
                 if self._vocab_size and not self.pad_to_max_tokens
                 else self.max_tokens)
        encode = (category_encoding.sparse_bincount
                  if self.sparse else category_encoding.dense_bincount)
        counts = encode(lookups, depth, is_binary)

        if self.output_mode == TFIDF:
            return tf.multiply(counts, self.tf_idf_weights)

        return counts
Example 2
    def _encode_output(self, lookup_result):
        """Convert integer lookups into the configured categorical encoding."""
        input_shape = lookup_result.shape
        # Uprank scalar input so downstream ops see a single sample.
        if lookup_result.shape.rank == 0:
            lookup_result = self._expand_dims(lookup_result, -1)
        # ONE_HOT upranks only when the last axis is not already size 1.
        if self.output_mode == ONE_HOT and lookup_result.shape[-1] != 1:
            lookup_result = self._expand_dims(lookup_result, -1)

        # TODO(b/190445202): remove output rank restriction.
        if lookup_result.shape.rank > 2:
            raise ValueError(
                "Received input shape {}, which would result in output rank {}. "
                "Currently only outputs up to rank 2 are supported for "
                "`output_mode={}`.".format(input_shape,
                                           lookup_result.shape.rank,
                                           self.output_mode))

        binarize = self.output_mode in (MULTI_HOT, ONE_HOT)
        # Prefer the exact vocabulary size unless padding out to max_tokens.
        depth = (self._vocab_size
                 if self._vocab_size and not self.pad_to_max_tokens
                 else self.max_tokens)
        encode = (category_encoding.sparse_bincount
                  if self.sparse else category_encoding.dense_bincount)
        encoded = encode(lookup_result, depth, binarize)

        if self.output_mode == TF_IDF:
            encoded = tf.multiply(encoded, self.tf_idf_weights)
        return encoded
Example 3
    def call(self, inputs):
        """Look up `inputs` (dense, sparse, or ragged) and encode them."""
        self._maybe_freeze_vocab_size()

        inputs = self._standardize_inputs(inputs, self._key_dtype)
        input_shape = inputs.shape
        # Some ops cannot take scalars, so temporarily uprank to rank 1.
        if inputs.shape.rank == 0:
            inputs = self._expand_dims(inputs, -1)

        # Composite tensors are looked up through their flat values only.
        if tf_utils.is_sparse(inputs):
            lookups = tf.SparseTensor(inputs.indices,
                                      self._lookup_dense(inputs.values),
                                      inputs.dense_shape)
        elif tf_utils.is_ragged(inputs):
            lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs)
        else:
            lookups = self._lookup_dense(inputs)

        if self.output_mode == INT:
            # Undo the scalar uprank before returning.
            if input_shape.rank == 0:
                lookups = tf.squeeze(lookups, -1)
            return lookups

        # ONE_HOT upranks only when the last axis is not already size 1.
        if self.output_mode == ONE_HOT and lookups.shape[-1] != 1:
            lookups = self._expand_dims(lookups, -1)

        # TODO(b/190445202): remove output rank restriction.
        if lookups.shape.rank > 2:
            raise ValueError(
                "Received input shape {}, which would result in output rank {}. "
                "Currently only outputs up to rank 2 are supported for "
                "`output_mode={}`.".format(input_shape, lookups.shape.rank,
                                           self.output_mode))

        binarize = self.output_mode in (MULTI_HOT, ONE_HOT)
        # Depth is the frozen vocabulary size unless padding out to max_tokens.
        depth = (self.max_tokens if self.pad_to_max_tokens
                 else self._frozen_vocab_size)
        encode = (category_encoding.sparse_bincount
                  if self.sparse else category_encoding.dense_bincount)
        encoded = encode(lookups, depth, binarize)

        if self.output_mode == TF_IDF:
            encoded = tf.multiply(encoded, self.idf_weights_const)
        return encoded
Example 4
    def call(self, inputs):
        """Look up `inputs`, asserting no OOV values when none are allowed."""
        # A vocabulary must exist before lookup (set directly or via adapt).
        if not self.max_tokens and self._vocab_size is None:
            raise ValueError(
                "You must set the layer's vocabulary before calling it. "
                "Either pass a `vocabulary` argument to the layer, or "
                "call `layer.adapt(dataset)` with some sample data.")
        self._called = True
        # The table keys may be int64 while the caller passed int32.
        if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
            inputs = tf.cast(inputs, tf.int64)
        lookup_result = self._table_handler.lookup(inputs)

        lookup_checks = []

        # With no OOV buckets, a forward lookup that yields -1 means the
        # input token was missing from the vocabulary; surface that as a
        # graph-level assertion.
        if self.num_oov_indices == 0 and not self.invert:
            if tf_utils.is_sparse(inputs):
                flat_lookups = lookup_result.values
                flat_inputs = inputs.values
            elif tf_utils.is_ragged(inputs):
                flat_lookups = lookup_result.flat_values
                flat_inputs = inputs.flat_values
            else:
                flat_lookups = lookup_result
                flat_inputs = inputs
            missing = tf.where(tf.equal(flat_lookups, -1))
            missing_inputs = tf.compat.v1.gather_nd(flat_inputs, missing)
            msg = tf.strings.format(
                "When `num_oov_indices=0` all inputs should be in vocabulary, "
                "found OOV values {}, consider setting `num_oov_indices=1`.",
                (missing_inputs, ))
            lookup_checks.append(
                tf.Assert(tf.equal(tf.compat.v1.size(missing), 0), [msg]))

        # The encoding must run under the checks so the assertion fires.
        with tf.control_dependencies(lookup_checks):
            if self.output_mode == INT:
                return tf.identity(lookup_result)

            multi_hot = self.output_mode == MULTI_HOT
            # Prefer the exact vocabulary size unless padding to max_tokens.
            depth = (self._vocab_size
                     if self._vocab_size and not self.pad_to_max_tokens
                     else self.max_tokens)
            encode = (category_encoding.sparse_bincount
                      if self.sparse else category_encoding.dense_bincount)
            encoded = encode(lookup_result, depth, multi_hot)

            if self.output_mode == TF_IDF:
                return tf.multiply(encoded, self.tf_idf_weights)

            return encoded
Example 5
  def call(self, inputs):
    """Look up `inputs` and encode the result according to `output_mode`."""
    # A vocabulary must exist before lookup (set directly or via adapt).
    if not self.max_tokens:
      raise ValueError("You must set the layer's vocabulary before calling it. "
                       "Either pass a `vocabulary` argument to the layer, or "
                       "call `layer.adapt(dataset)` with some sample data.")
    # The table keys may be int64 while the caller passed int32.
    if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
      inputs = tf.cast(inputs, tf.int64)
    lookups = self._table_handler.lookup(inputs)
    if self.output_mode == INT:
      return lookups

    is_binary = self.output_mode == BINARY
    encode = (category_encoding.sparse_bincount
              if self.sparse else category_encoding.dense_bincount)
    return encode(lookups, self.max_tokens, is_binary)