def test_output_dtype(self):
     inputs = tf.constant([0, 1, 2], dtype=tf.dtypes.int32)
     outputs = preprocessing_utils.encode_categorical_inputs(
         inputs, output_mode="int", depth=4, dtype=tf.dtypes.int64)
     self.assertAllEqual(outputs.dtype, tf.dtypes.int64)
     outputs = preprocessing_utils.encode_categorical_inputs(
         inputs, output_mode="one_hot", depth=4, dtype=tf.dtypes.float64)
     self.assertAllEqual(outputs.dtype, tf.dtypes.float64)
Example #2
    def call(self, inputs):
        # Convert all inputs to tensors and check shape. This layer only
        # supports scalars and batches of scalars for the initial version.
        self._check_at_least_two_inputs(inputs)
        inputs = [utils.ensure_tensor(x) for x in inputs]
        self._check_input_shape_and_type(inputs)

        # Uprank to rank 2 for the cross_hashed op. A rank-0 (scalar) input
        # satisfies both checks below and is therefore expanded twice.
        rank = inputs[0].shape.rank
        if rank < 2:
            inputs = [utils.expand_dims(x, -1) for x in inputs]
        if rank < 1:
            inputs = [utils.expand_dims(x, -1) for x in inputs]

        # Perform the cross and convert to dense
        outputs = tf.sparse.cross_hashed(inputs, self.num_bins)
        outputs = tf.sparse.to_dense(outputs)

        # Fix output shape and downrank to match input rank.
        if rank == 2:
            # tf.sparse.cross_hashed output shape will always be None on the last
            # dimension. Given our input shape restrictions, we want to force shape 1
            # instead.
            outputs = tf.reshape(outputs, [-1, 1])
        elif rank == 1:
            outputs = tf.reshape(outputs, [-1])
        elif rank == 0:
            outputs = tf.reshape(outputs, [])

        # Encode outputs.
        return utils.encode_categorical_inputs(outputs,
                                               output_mode=self.output_mode,
                                               depth=self.num_bins,
                                               sparse=self.sparse,
                                               dtype=self.compute_dtype)
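
For context, the call above is from Keras's HashedCrossing layer, which crosses the hashed inputs and then delegates the final encoding to encode_categorical_inputs. A minimal usage sketch, assuming a recent TF 2.x release where the layer is exposed as tf.keras.layers.HashedCrossing (older releases place it under tf.keras.layers.experimental.preprocessing); the feature values are illustrative:

import tensorflow as tf

# Cross two aligned categorical features into a single hashed bucket id.
layer = tf.keras.layers.HashedCrossing(num_bins=10)
feat_a = tf.constant(["a", "b", "c"])
feat_b = tf.constant([1, 2, 3])
crossed = layer((feat_a, feat_b))    # shape (3,), int64 bucket ids in [0, 10)

# output_mode="one_hot" encodes each bucket id as a length-10 one-hot row.
one_hot = tf.keras.layers.HashedCrossing(num_bins=10, output_mode="one_hot")
encoded = one_hot((feat_a, feat_b))  # shape (3, 10)
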
Example #3
    def call(self, inputs):
        self._maybe_freeze_vocab_size()

        inputs = self._standardize_inputs(inputs, self._key_dtype)
        original_shape = inputs.shape
        # Some ops will not handle scalar input, so uprank to rank 1.
        if inputs.shape.rank == 0:
            inputs = self._expand_dims(inputs, -1)

        if tf_utils.is_sparse(inputs):
            lookups = tf.SparseTensor(inputs.indices,
                                      self._lookup_dense(inputs.values),
                                      inputs.dense_shape)
        elif tf_utils.is_ragged(inputs):
            lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs)
        else:
            lookups = self._lookup_dense(inputs)

        if self.output_mode == INT:
            # If we received a scalar input, downrank back to a scalar.
            if original_shape.rank == 0:
                lookups = tf.squeeze(lookups, -1)
            return lookups

        depth = (self.max_tokens
                 if self.pad_to_max_tokens else self._frozen_vocab_size)
        idf_weights = self.idf_weights_const if self.output_mode == TF_IDF else None
        return utils.encode_categorical_inputs(lookups,
                                               output_mode=self.output_mode,
                                               depth=depth,
                                               dtype=self.compute_dtype,
                                               sparse=self.sparse,
                                               idf_weights=idf_weights)
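
This call belongs to the shared index-lookup base behind StringLookup and IntegerLookup: values are mapped to vocabulary indices first, and every non-"int" output mode is then produced by encode_categorical_inputs. A brief sketch using the public layer, assuming TF 2.x; the vocabulary and inputs are illustrative:

import tensorflow as tf

lookup = tf.keras.layers.StringLookup(vocabulary=["a", "b", "c"])
ids = lookup(tf.constant([["a", "c", "z"]]))   # [[1, 3, 0]]; index 0 is the OOV bucket

multi_hot = tf.keras.layers.StringLookup(
    vocabulary=["a", "b", "c"], output_mode="multi_hot")
encoded = multi_hot(tf.constant([["a", "c", "z"]]))  # [[1., 1., 0., 1.]] (OOV, a, b, c)
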
Example #4
  def call(self, inputs, count_weights=None):
    inputs = utils.ensure_tensor(inputs)

    if count_weights is not None:
      if self.output_mode != COUNT:
        raise ValueError(
            "`count_weights` is not used when `output_mode` is not `'count'`. "
            "Received `count_weights={}`.".format(count_weights))
      count_weights = utils.ensure_tensor(count_weights, self.compute_dtype)

    depth = self.num_tokens
    if isinstance(inputs, tf.SparseTensor):
      max_value = tf.reduce_max(inputs.values)
      min_value = tf.reduce_min(inputs.values)
    else:
      max_value = tf.reduce_max(inputs)
      min_value = tf.reduce_min(inputs)
    condition = tf.logical_and(
        tf.greater(tf.cast(depth, max_value.dtype), max_value),
        tf.greater_equal(min_value, tf.cast(0, min_value.dtype)))
    assertion = tf.Assert(condition, [
        "Input values must be in the range 0 <= values < num_tokens"
        " with num_tokens={}".format(depth)
    ])
    with tf.control_dependencies([assertion]):
      return utils.encode_categorical_inputs(
          inputs,
          output_mode=self.output_mode,
          depth=depth,
          dtype=self.compute_dtype,
          sparse=self.sparse,
          count_weights=count_weights)
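
The snippet above is CategoryEncoding.call: it asserts that every index lies in [0, num_tokens) and then hands off to encode_categorical_inputs, forwarding the optional count_weights. A small usage sketch, assuming TF 2.x; the inputs are illustrative:

import tensorflow as tf

layer = tf.keras.layers.CategoryEncoding(num_tokens=4, output_mode="count")
counts = layer(tf.constant([0, 1, 1, 2, 2, 2]))   # [1., 2., 3., 0.]

# count_weights is only accepted for output_mode="count"; each occurrence
# contributes its weight instead of 1.
weighted = layer(tf.constant([0, 1, 1, 2]),
                 count_weights=tf.constant([0.5, 0.5, 0.5, 1.0]))  # [0.5, 1., 1., 0.]

Example #5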
 def test_count_encoding(self, sparse):
     inputs = tf.constant([0, 1, 1, 2, 2, 2])
     outputs = preprocessing_utils.encode_categorical_inputs(
         inputs, output_mode="count", depth=4, sparse=sparse)
     if sparse:
         outputs = tf.sparse.to_dense(outputs)
     self.assertAllEqual([1, 2, 3, 0], outputs)
Example #6
 def test_one_hot_encoding(self, sparse):
     inputs = tf.constant([0, 1, 2])
     outputs = preprocessing_utils.encode_categorical_inputs(
         inputs, output_mode='one_hot', depth=4, sparse=sparse)
     if sparse:
         outputs = tf.sparse.to_dense(outputs)
     self.assertAllEqual([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]],
                         outputs)
Example #7
 def test_tf_idf_encoding(self, sparse):
     inputs = tf.constant([0, 1, 1, 2, 2, 2])
     outputs = preprocessing_utils.encode_categorical_inputs(
         inputs,
         output_mode='tf_idf',
         depth=4,
         sparse=sparse,
         idf_weights=[0.1, 1.0, 10.0, 0])
     if sparse:
         outputs = tf.sparse.to_dense(outputs)
     self.assertAllClose([.1, 2, 30, 0], outputs)
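
For output_mode="tf_idf" the result is the per-index count scaled by the corresponding idf weight: index 0 appears once (1 × 0.1), index 1 twice (2 × 1.0), index 2 three times (3 × 10.0), and index 3 never (0 × 0), which gives the asserted [.1, 2, 30, 0].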
Example #8
 def call(self, inputs):
     inputs = utils.ensure_tensor(inputs)
     if isinstance(inputs, tf.SparseTensor):
         indices = tf.SparseTensor(indices=inputs.indices,
                                   values=self._hash_values_to_bins(
                                       inputs.values),
                                   dense_shape=inputs.dense_shape)
     else:
         indices = self._hash_values_to_bins(inputs)
     return utils.encode_categorical_inputs(indices,
                                            output_mode=self.output_mode,
                                            depth=self.num_bins,
                                            sparse=self.sparse,
                                            dtype=self.compute_dtype)
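
Here Hashing.call maps arbitrary values into num_bins buckets (preserving sparsity) before the shared encoding step. A minimal sketch, assuming TF 2.x; the strings are illustrative and the exact bucket assignment depends on the hash function:

import tensorflow as tf

layer = tf.keras.layers.Hashing(num_bins=3)
bins = layer(tf.constant(["A", "B", "C", "D"]))   # rank-1 int64 bucket ids in [0, 3)

# With output_mode="one_hot", each bucket id becomes a length-3 one-hot row.
one_hot = tf.keras.layers.Hashing(num_bins=3, output_mode="one_hot")
encoded = one_hot(tf.constant(["A", "B", "C"]))   # shape (3, 3)
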
Example #9
    def call(self, inputs):
        def bucketize(inputs):
            return tf.raw_ops.Bucketize(input=inputs,
                                        boundaries=self.bin_boundaries)

        if tf_utils.is_ragged(inputs):
            indices = tf.ragged.map_flat_values(bucketize, inputs)
        elif tf_utils.is_sparse(inputs):
            indices = tf.SparseTensor(indices=tf.identity(inputs.indices),
                                      values=bucketize(inputs.values),
                                      dense_shape=tf.identity(
                                          inputs.dense_shape))
        else:
            indices = bucketize(inputs)

        return utils.encode_categorical_inputs(indices,
                                               output_mode=self.output_mode,
                                               depth=len(self.bin_boundaries) + 1,
                                               sparse=self.sparse,
                                               dtype=self.compute_dtype)
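
Finally, Discretization.call bucketizes continuous values against bin_boundaries and encodes the resulting bucket indices; depth is len(self.bin_boundaries) + 1 because n boundaries define n + 1 buckets. A short sketch, assuming TF 2.x:

import tensorflow as tf

layer = tf.keras.layers.Discretization(bin_boundaries=[0.0, 1.0, 2.0])
buckets = layer(tf.constant([[-0.5, 0.5, 1.5, 2.5]]))  # [[0, 1, 2, 3]]: 3 boundaries -> 4 buckets
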
 def test_int_encoding(self):
     inputs = tf.constant([0, 1, 2])
     outputs = preprocessing_utils.encode_categorical_inputs(
         inputs, output_mode="int", depth=4)
     self.assertAllEqual([0, 1, 2], outputs)
 def test_tf_idf_output_with_no_weights_fails(self):
     inputs = tf.constant([0, 1, 2])
     with self.assertRaisesRegex(ValueError,
                                 "idf_weights must be provided"):
         preprocessing_utils.encode_categorical_inputs(
             inputs, "tf_idf", 4, "float32")
 def test_rank_3_output_fails(self):
     inputs = tf.constant([[[0]], [[1]], [[2]]])
     with self.assertRaisesRegex(ValueError,
                                 "maximum supported output rank is 2"):
         preprocessing_utils.encode_categorical_inputs(
             inputs, "multi_hot", 4, "float32")