def call(self, inputs):
    """Look up `inputs` in the layer's table and encode the result.

    Args:
        inputs: Keys to look up. If the layer's key dtype is `tf.int64` and
            `inputs` has dtype `tf.int32`, it is cast to `tf.int64` first.

    Returns:
        The raw lookup result when `output_mode` is `INT`. Otherwise the
        output of the sparse or dense bincount helper (chosen by
        `self.sparse`), multiplied by `self.tf_idf_weights` when
        `output_mode` is `TFIDF`.

    Raises:
        ValueError: If both `self.max_tokens` and `self._vocab_size` are
            falsy, i.e. no vocabulary has been configured.
    """
    if not self.max_tokens and not self._vocab_size:
        raise ValueError(
            "You must set the layer's vocabulary before calling it. "
            "Either pass a `vocabulary` argument to the layer, or "
            "call `layer.adapt(dataset)` with some sample data.")
    # Only flag the layer as called once the vocabulary check has passed.
    self._called = True

    if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
        inputs = tf.cast(inputs, tf.int64)
    lookup_result = self._table_handler.lookup(inputs)

    if self.output_mode == INT:
        return lookup_result

    binary_output = (self.output_mode == BINARY)
    # Use the actual vocabulary size unless padding to `max_tokens` was
    # requested (or no vocabulary size is available).
    if self._vocab_size and not self.pad_to_max_tokens:
        out_depth = self._vocab_size
    else:
        out_depth = self.max_tokens

    if self.sparse:
        bincounts = category_encoding.sparse_bincount(
            lookup_result, out_depth, binary_output)
    else:
        bincounts = category_encoding.dense_bincount(
            lookup_result, out_depth, binary_output)

    if self.output_mode == TFIDF:
        return tf.multiply(bincounts, self.tf_idf_weights)

    return bincounts
def _encode_output(self, lookup_result):
    """Turn a lookup result into a bincount-style encoding.

    Args:
        lookup_result: Result of a vocabulary lookup. It must expose
            `.shape` with a `.rank`; presumably a tensor of token indices
            (TODO confirm against callers).

    Returns:
        The output of the sparse or dense bincount helper (chosen by
        `self.sparse`). `binary_output` is true for `MULTI_HOT` and
        `ONE_HOT`. For `TF_IDF` the bincounts are multiplied by
        `self.tf_idf_weights`.

    Raises:
        ValueError: If, after any upranking, the lookup result has rank
            greater than 2.
    """
    # Keep the caller's shape so the error message reports what was passed
    # in, not the upranked shape.
    original_shape = lookup_result.shape

    # In all cases, we should uprank scalar input to a single sample.
    if lookup_result.shape.rank == 0:
        lookup_result = self._expand_dims(lookup_result, -1)

    # One hot will uprank only if the final output dimension is not already 1.
    if self.output_mode == ONE_HOT:
        if lookup_result.shape[-1] != 1:
            lookup_result = self._expand_dims(lookup_result, -1)

    # TODO(b/190445202): remove output rank restriction.
    if lookup_result.shape.rank > 2:
        raise ValueError(
            "Received input shape {}, which would result in output rank {}. "
            "Currently only outputs up to rank 2 are supported for "
            "`output_mode={}`.".format(original_shape,
                                       lookup_result.shape.rank,
                                       self.output_mode))

    binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
    # Use the actual vocabulary size unless padding to `max_tokens` was
    # requested (or no vocabulary size is available).
    if self._vocab_size and not self.pad_to_max_tokens:
        out_depth = self._vocab_size
    else:
        out_depth = self.max_tokens

    if self.sparse:
        bincounts = category_encoding.sparse_bincount(
            lookup_result, out_depth, binary_output)
    else:
        bincounts = category_encoding.dense_bincount(
            lookup_result, out_depth, binary_output)

    if self.output_mode == TF_IDF:
        return tf.multiply(bincounts, self.tf_idf_weights)

    return bincounts
def call(self, inputs):
    """Look up `inputs` in the vocabulary and encode per `output_mode`.

    Args:
        inputs: Keys to look up. They are first passed through
            `self._standardize_inputs` with the layer's key dtype. Sparse
            and ragged inputs are handled by looking up their values /
            flat values; anything else is looked up directly.

    Returns:
        For `INT` mode, the lookups (squeezed back to a scalar if the
        standardized input had rank 0). Otherwise the output of the sparse
        or dense bincount helper (chosen by `self.sparse`), multiplied by
        `self.idf_weights_const` when `output_mode` is `TF_IDF`.

    Raises:
        ValueError: If a non-`INT` mode would encode lookups of rank
            greater than 2.
    """
    self._maybe_freeze_vocab_size()

    inputs = self._standardize_inputs(inputs, self._key_dtype)
    # Keep the pre-uprank shape for the scalar round-trip and for the
    # error message below.
    original_shape = inputs.shape
    # Some ops will not handle scalar input, so uprank to rank 1.
    if inputs.shape.rank == 0:
        inputs = self._expand_dims(inputs, -1)

    if tf_utils.is_sparse(inputs):
        lookups = tf.SparseTensor(inputs.indices,
                                  self._lookup_dense(inputs.values),
                                  inputs.dense_shape)
    elif tf_utils.is_ragged(inputs):
        lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs)
    else:
        lookups = self._lookup_dense(inputs)

    if self.output_mode == INT:
        # If we received a scalar input, downrank back to a scalar.
        if original_shape.rank == 0:
            lookups = tf.squeeze(lookups, -1)
        return lookups

    # One hot will uprank only if the final output dimension is not already 1.
    if self.output_mode == ONE_HOT:
        if lookups.shape[-1] != 1:
            lookups = self._expand_dims(lookups, -1)

    # TODO(b/190445202): remove output rank restriction.
    if lookups.shape.rank > 2:
        raise ValueError(
            "Received input shape {}, which would result in output rank {}. "
            "Currently only outputs up to rank 2 are supported for "
            "`output_mode={}`.".format(original_shape, lookups.shape.rank,
                                       self.output_mode))

    binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
    if self.pad_to_max_tokens:
        out_depth = self.max_tokens
    else:
        out_depth = self._frozen_vocab_size

    if self.sparse:
        bincounts = category_encoding.sparse_bincount(
            lookups, out_depth, binary_output)
    else:
        bincounts = category_encoding.dense_bincount(
            lookups, out_depth, binary_output)

    if self.output_mode == TF_IDF:
        return tf.multiply(bincounts, self.idf_weights_const)

    return bincounts
def call(self, inputs):
    """Look up `inputs`, assert on OOV values if required, and encode.

    Args:
        inputs: Keys to look up. If the layer's key dtype is `tf.int64` and
            `inputs` has dtype `tf.int32`, it is cast to `tf.int64` first.

    Returns:
        For `INT` mode, `tf.identity` of the lookup result. Otherwise the
        output of the sparse or dense bincount helper (chosen by
        `self.sparse`), multiplied by `self.tf_idf_weights` when
        `output_mode` is `TF_IDF`. Every return value is produced under a
        control dependency on the OOV assertion, when one is created.

    Raises:
        ValueError: If `self.max_tokens` is falsy and `self._vocab_size`
            is `None`, i.e. no vocabulary has been configured.
    """
    if not self.max_tokens and self._vocab_size is None:
        raise ValueError(
            "You must set the layer's vocabulary before calling it. "
            "Either pass a `vocabulary` argument to the layer, or "
            "call `layer.adapt(dataset)` with some sample data.")
    # Only flag the layer as called once the vocabulary check has passed.
    self._called = True

    if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
        inputs = tf.cast(inputs, tf.int64)
    lookup_result = self._table_handler.lookup(inputs)

    lookup_checks = []

    if self.num_oov_indices == 0 and not self.invert:
        # Compare on the underlying value tensors so sparse and ragged
        # inputs are checked the same way as dense ones.
        if tf_utils.is_sparse(inputs):
            lookup_values = lookup_result.values
            input_values = inputs.values
        elif tf_utils.is_ragged(inputs):
            lookup_values = lookup_result.flat_values
            input_values = inputs.flat_values
        else:
            lookup_values = lookup_result
            input_values = inputs
        # A lookup value of -1 is treated as out-of-vocabulary here.
        oov_indices = tf.where(tf.equal(lookup_values, -1))
        oov_inputs = tf.compat.v1.gather_nd(input_values, oov_indices)
        msg = tf.strings.format(
            "When `num_oov_indices=0` all inputs should be in vocabulary, "
            "found OOV values {}, consider setting `num_oov_indices=1`.",
            (oov_inputs,))
        assertion = tf.Assert(
            tf.equal(tf.compat.v1.size(oov_indices), 0), [msg])
        lookup_checks.append(assertion)

    # Ops for the returned value are created inside this scope so they
    # depend on the assertion above.
    with tf.control_dependencies(lookup_checks):
        if self.output_mode == INT:
            return tf.identity(lookup_result)

        multi_hot_output = (self.output_mode == MULTI_HOT)
        # Use the actual vocabulary size unless padding to `max_tokens`
        # was requested (or the vocabulary size is falsy).
        if self._vocab_size and not self.pad_to_max_tokens:
            out_depth = self._vocab_size
        else:
            out_depth = self.max_tokens

        if self.sparse:
            bincounts = category_encoding.sparse_bincount(
                lookup_result, out_depth, multi_hot_output)
        else:
            bincounts = category_encoding.dense_bincount(
                lookup_result, out_depth, multi_hot_output)

        if self.output_mode == TF_IDF:
            return tf.multiply(bincounts, self.tf_idf_weights)

        return bincounts
def call(self, inputs):
    """Look up `inputs` in the layer's table and optionally bincount them.

    Args:
        inputs: Keys to look up. If the layer's key dtype is `tf.int64` and
            `inputs` has dtype `tf.int32`, it is cast to `tf.int64` first.

    Returns:
        The raw lookup result when `output_mode` is `INT`. Otherwise the
        output of the sparse or dense bincount helper (chosen by
        `self.sparse`) with depth `self.max_tokens`; `binary_output` is
        true only for `BINARY` mode.

    Raises:
        ValueError: If `self.max_tokens` is falsy, i.e. no vocabulary has
            been configured.
    """
    if not self.max_tokens:
        raise ValueError(
            "You must set the layer's vocabulary before calling it. "
            "Either pass a `vocabulary` argument to the layer, or "
            "call `layer.adapt(dataset)` with some sample data.")

    if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
        inputs = tf.cast(inputs, tf.int64)
    lookup_result = self._table_handler.lookup(inputs)

    if self.output_mode == INT:
        return lookup_result

    binary_output = (self.output_mode == BINARY)
    if self.sparse:
        return category_encoding.sparse_bincount(
            lookup_result, self.max_tokens, binary_output)
    return category_encoding.dense_bincount(
        lookup_result, self.max_tokens, binary_output)