def test_output_dtype(self):
    """Checks that the requested `dtype` is the dtype of the encoded output."""
    int32_ids = tf.constant([0, 1, 2], dtype=tf.dtypes.int32)

    # "int" mode: int32 inputs encoded with an int64 dtype requested.
    int_encoded = preprocessing_utils.encode_categorical_inputs(
        int32_ids,
        output_mode="int",
        depth=4,
        dtype=tf.dtypes.int64,
    )
    self.assertAllEqual(int_encoded.dtype, tf.dtypes.int64)

    # "one_hot" mode: same inputs encoded with a float64 dtype requested.
    one_hot_encoded = preprocessing_utils.encode_categorical_inputs(
        int32_ids,
        output_mode="one_hot",
        depth=4,
        dtype=tf.dtypes.float64,
    )
    self.assertAllEqual(one_hot_encoded.dtype, tf.dtypes.float64)
def call(self, inputs):
    """Hash-crosses the given inputs and encodes the crossed values.

    Args:
        inputs: The collection of features to cross. Each element is converted
            with `utils.ensure_tensor`; the rank of the first element decides
            how the output is reshaped.

    Returns:
        The value returned by `utils.encode_categorical_inputs` for the
        crossed values, using this layer's `output_mode`, `num_bins`, `sparse`
        and `compute_dtype`.
    """
    # Validate and convert the inputs. The initial version of this layer only
    # supports scalars and batches of scalars.
    self._check_at_least_two_inputs(inputs)
    tensors = [utils.ensure_tensor(item) for item in inputs]
    self._check_input_shape_and_type(tensors)

    # The cross_hashed op wants rank 2 inputs: append one trailing axis for
    # every rank that is missing (two for rank 0, one for rank 1).
    input_rank = tensors[0].shape.rank
    for _ in range(2 - input_rank):
        tensors = [utils.expand_dims(tensor, -1) for tensor in tensors]

    # Cross the features and densify the sparse result.
    crossed = tf.sparse.to_dense(
        tf.sparse.cross_hashed(tensors, self.num_bins))

    # Restore the rank of the original inputs. For rank 2, the last dimension
    # of the tf.sparse.cross_hashed output shape is always None; given the
    # input shape restrictions it is forced to 1 here.
    shape_for_rank = {2: [-1, 1], 1: [-1], 0: []}
    if input_rank in shape_for_rank:
        crossed = tf.reshape(crossed, shape_for_rank[input_rank])

    # Encode the crossed values.
    return utils.encode_categorical_inputs(
        crossed,
        output_mode=self.output_mode,
        depth=self.num_bins,
        sparse=self.sparse,
        dtype=self.compute_dtype,
    )
def call(self, inputs):
    """Looks up the inputs and returns them in the configured output mode.

    Args:
        inputs: The keys to look up. They are first passed through
            `_standardize_inputs` with this layer's `_key_dtype`; sparse,
            ragged and dense values each take their own lookup path.

    Returns:
        In `INT` mode, the looked-up values (squeezed back to a scalar when
        the standardized input had rank 0). Otherwise, the value returned by
        `utils.encode_categorical_inputs` for those lookups.
    """
    self._maybe_freeze_vocab_size()

    keys = self._standardize_inputs(inputs, self._key_dtype)
    # Some ops will not handle scalar input, so a scalar is upranked to rank 1
    # and the fact is remembered for the INT output path below.
    was_scalar = keys.shape.rank == 0
    if was_scalar:
        keys = self._expand_dims(keys, -1)

    if tf_utils.is_sparse(keys):
        # Only the values are looked up; indices and dense shape are reused.
        lookups = tf.SparseTensor(
            keys.indices,
            self._lookup_dense(keys.values),
            keys.dense_shape,
        )
    elif tf_utils.is_ragged(keys):
        lookups = tf.ragged.map_flat_values(self._lookup_dense, keys)
    else:
        lookups = self._lookup_dense(keys)

    if self.output_mode == INT:
        # A scalar input is downranked back to a scalar.
        if was_scalar:
            return tf.squeeze(lookups, -1)
        return lookups

    if self.pad_to_max_tokens:
        depth = self.max_tokens
    else:
        depth = self._frozen_vocab_size

    # IDF weights are only supplied in TF-IDF mode.
    idf_weights = None
    if self.output_mode == TF_IDF:
        idf_weights = self.idf_weights_const

    return utils.encode_categorical_inputs(
        lookups,
        output_mode=self.output_mode,
        depth=depth,
        dtype=self.compute_dtype,
        sparse=self.sparse,
        idf_weights=idf_weights,
    )
def call(self, inputs, count_weights=None):
    """Encodes the inputs after asserting they lie in `[0, num_tokens)`.

    Args:
        inputs: The values to encode; converted with `utils.ensure_tensor`.
        count_weights: Optional weights, only accepted when `output_mode` is
            `COUNT`. Converted with `utils.ensure_tensor` using this layer's
            `compute_dtype`.

    Returns:
        The value returned by `utils.encode_categorical_inputs`, produced
        under a control dependency on the range assertion.

    Raises:
        ValueError: If `count_weights` is given while `output_mode` is not
            `COUNT`.
    """
    tensor = utils.ensure_tensor(inputs)

    weights_given = count_weights is not None
    if weights_given and self.output_mode != COUNT:
        raise ValueError(
            "`count_weights` is not used when `output_mode` is not `'count'`. "
            "Received `count_weights={}`.".format(count_weights))
    if weights_given:
        count_weights = utils.ensure_tensor(count_weights, self.compute_dtype)

    depth = self.num_tokens

    # For a sparse tensor only the stored values are range-checked.
    if isinstance(tensor, tf.SparseTensor):
        checked_values = tensor.values
    else:
        checked_values = tensor
    max_value = tf.reduce_max(checked_values)
    min_value = tf.reduce_min(checked_values)

    # Both bounds are compared in the dtype of the reduced values.
    below_depth = tf.greater(tf.cast(depth, max_value.dtype), max_value)
    non_negative = tf.greater_equal(min_value, tf.cast(0, min_value.dtype))
    in_range = tf.logical_and(below_depth, non_negative)

    range_check = tf.Assert(in_range, [
        "Input values must be in the range 0 <= values < num_tokens"
        " with num_tokens={}".format(depth)
    ])

    with tf.control_dependencies([range_check]):
        return utils.encode_categorical_inputs(
            tensor,
            output_mode=self.output_mode,
            depth=depth,
            dtype=self.compute_dtype,
            sparse=self.sparse,
            count_weights=count_weights,
        )
def test_count_encoding(self, sparse):
    """Checks the "count" output for a rank-1 input, dense or sparse."""
    token_ids = tf.constant([0, 1, 1, 2, 2, 2])
    encoded = preprocessing_utils.encode_categorical_inputs(
        token_ids,
        output_mode="count",
        depth=4,
        sparse=sparse,
    )
    # A sparse result is densified so one expectation covers both cases.
    dense_counts = tf.sparse.to_dense(encoded) if sparse else encoded
    self.assertAllEqual([1, 2, 3, 0], dense_counts)
def test_one_hot_encoding(self, sparse):
    """Checks the "one_hot" output for a rank-1 input, dense or sparse."""
    token_ids = tf.constant([0, 1, 2])
    encoded = preprocessing_utils.encode_categorical_inputs(
        token_ids,
        output_mode="one_hot",
        depth=4,
        sparse=sparse,
    )
    # A sparse result is densified so one expectation covers both cases.
    dense_rows = tf.sparse.to_dense(encoded) if sparse else encoded
    expected_rows = [
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
    ]
    self.assertAllEqual(expected_rows, dense_rows)
def test_tf_idf_encoding(self, sparse):
    """Checks the "tf_idf" output for a rank-1 input, dense or sparse."""
    token_ids = tf.constant([0, 1, 1, 2, 2, 2])
    weights = [0.1, 1.0, 10.0, 0]
    encoded = preprocessing_utils.encode_categorical_inputs(
        token_ids,
        output_mode="tf_idf",
        depth=4,
        sparse=sparse,
        idf_weights=weights,
    )
    # A sparse result is densified so one expectation covers both cases.
    dense_scores = tf.sparse.to_dense(encoded) if sparse else encoded
    self.assertAllClose([.1, 2, 30, 0], dense_scores)
def call(self, inputs):
    """Hashes the inputs into bins and encodes the resulting bin indices.

    Args:
        inputs: The values to hash; converted with `utils.ensure_tensor`.
            A `tf.SparseTensor` has only its values hashed.

    Returns:
        The value returned by `utils.encode_categorical_inputs` for the bin
        indices, using this layer's `output_mode`, `num_bins`, `sparse` and
        `compute_dtype`.
    """
    tensor = utils.ensure_tensor(inputs)

    if not isinstance(tensor, tf.SparseTensor):
        bin_indices = self._hash_values_to_bins(tensor)
    else:
        # Keep the sparse layout; only the stored values are replaced.
        hashed_values = self._hash_values_to_bins(tensor.values)
        bin_indices = tf.SparseTensor(
            indices=tensor.indices,
            values=hashed_values,
            dense_shape=tensor.dense_shape,
        )

    return utils.encode_categorical_inputs(
        bin_indices,
        output_mode=self.output_mode,
        depth=self.num_bins,
        sparse=self.sparse,
        dtype=self.compute_dtype,
    )
def call(self, inputs):
    """Bucketizes the inputs by `bin_boundaries` and encodes the bin indices.

    Args:
        inputs: The values to bucketize. Ragged and sparse inputs keep their
            structure; only their flat/stored values are bucketized.

    Returns:
        The value returned by `utils.encode_categorical_inputs` for the bin
        indices, with a depth of `len(self.bin_boundaries) + 1`.
    """

    def to_bin_index(values):
        return tf.raw_ops.Bucketize(
            input=values, boundaries=self.bin_boundaries)

    if tf_utils.is_ragged(inputs):
        bin_indices = tf.ragged.map_flat_values(to_bin_index, inputs)
    elif tf_utils.is_sparse(inputs):
        # Indices and dense shape go through tf.identity; the stored values
        # are replaced by their bin indices.
        sparse_indices = tf.identity(inputs.indices)
        sparse_values = to_bin_index(inputs.values)
        sparse_shape = tf.identity(inputs.dense_shape)
        bin_indices = tf.SparseTensor(
            indices=sparse_indices,
            values=sparse_values,
            dense_shape=sparse_shape,
        )
    else:
        bin_indices = to_bin_index(inputs)

    # N boundaries give N + 1 bins.
    num_bins = len(self.bin_boundaries) + 1
    return utils.encode_categorical_inputs(
        bin_indices,
        output_mode=self.output_mode,
        depth=num_bins,
        sparse=self.sparse,
        dtype=self.compute_dtype,
    )
def test_int_encoding(self):
    """Checks that "int" mode returns values equal to the inputs."""
    token_ids = tf.constant([0, 1, 2])
    encoded = preprocessing_utils.encode_categorical_inputs(
        token_ids,
        output_mode="int",
        depth=4,
    )
    self.assertAllEqual([0, 1, 2], encoded)
def test_tf_idf_output_with_no_weights_fails(self):
    """Checks that "tf_idf" mode without `idf_weights` raises ValueError."""
    token_ids = tf.constant([0, 1, 2])
    expected_message = "idf_weights must be provided"
    with self.assertRaisesRegex(ValueError, expected_message):
        # Arguments stay positional, exactly as the call is exercised here.
        preprocessing_utils.encode_categorical_inputs(
            token_ids, "tf_idf", 4, "float32")
def test_rank_3_output_fails(self):
    """Checks that a rank-3 input in "multi_hot" mode raises ValueError."""
    rank_3_ids = tf.constant([[[0]], [[1]], [[2]]])
    expected_message = "maximum supported output rank is 2"
    with self.assertRaisesRegex(ValueError, expected_message):
        # Arguments stay positional, exactly as the call is exercised here.
        preprocessing_utils.encode_categorical_inputs(
            rank_3_ids, "multi_hot", 4, "float32")