def call(self, inputs, count_weights=None):
    """Validate the input range and encode the inputs categorically.

    Args:
        inputs: Values to encode. Converted with `utils.ensure_tensor`; may
            end up as a `tf.SparseTensor`, in which case only its `values`
            are range-checked.
        count_weights: Optional weights forwarded to the encoder. Only
            accepted when `self.output_mode` is `COUNT`.

    Returns:
        Whatever `utils.encode_categorical_inputs` produces for the layer's
        `output_mode`, `num_tokens`, `compute_dtype` and `sparse` settings.

    Raises:
        ValueError: If `count_weights` is passed while `output_mode` is not
            `COUNT`.
    """
    inputs = utils.ensure_tensor(inputs)

    if count_weights is not None:
        if self.output_mode != COUNT:
            message = (
                "`count_weights` is not used when `output_mode` is not `'count'`. "
                "Received `count_weights={}`.".format(count_weights)
            )
            raise ValueError(message)
        count_weights = utils.ensure_tensor(count_weights, self.compute_dtype)

    depth = self.num_tokens

    # For sparse inputs only the explicitly stored values are inspected.
    checked_values = (
        inputs.values if isinstance(inputs, tf.SparseTensor) else inputs
    )
    largest = tf.reduce_max(checked_values)
    smallest = tf.reduce_min(checked_values)

    # Both bounds are compared in the dtype of the reduced values.
    below_depth = tf.greater(tf.cast(depth, largest.dtype), largest)
    non_negative = tf.greater_equal(smallest, tf.cast(0, smallest.dtype))
    in_range = tf.logical_and(below_depth, non_negative)

    range_check = tf.Assert(
        in_range,
        [
            "Input values must be in the range 0 <= values < num_tokens"
            " with num_tokens={}".format(depth)
        ],
    )

    # The encoding is built under the assertion so the check runs first.
    with tf.control_dependencies([range_check]):
        encoded = utils.encode_categorical_inputs(
            inputs,
            output_mode=self.output_mode,
            depth=depth,
            dtype=self.compute_dtype,
            sparse=self.sparse,
            count_weights=count_weights,
        )
        return encoded
def call(self, inputs):
    """Cross the input features with a hashed cross and encode the result.

    Args:
        inputs: A collection of features to cross. Each element is converted
            with `utils.ensure_tensor`; the rank of the first element decides
            how the inputs are upranked and how the output is reshaped.

    Returns:
        The output of `utils.encode_categorical_inputs` applied to the
        hashed cross, using the layer's `output_mode`, `num_bins`, `sparse`
        and `compute_dtype`.
    """
    # Convert all inputs to tensors and check shape. This layer only supports
    # scalars and batches of scalars for the initial version.
    self._check_at_least_two_inputs(inputs)
    tensors = [utils.ensure_tensor(feature) for feature in inputs]
    self._check_input_shape_and_type(tensors)

    # Uprank to rank 2 for the cross_hashed op: one trailing axis is added
    # for rank-1 inputs and two for rank-0 inputs.
    rank = tensors[0].shape.rank
    for _ in range(2 - rank):
        tensors = [utils.expand_dims(feature, -1) for feature in tensors]

    # Perform the cross and convert to dense.
    crossed = tf.sparse.to_dense(tf.sparse.cross_hashed(tensors, self.num_bins))

    # Fix output shape and downrank to match input rank.
    # tf.sparse.cross_hashed output shape will always be None on the last
    # dimension. Given our input shape restrictions, we want to force shape 1
    # instead for rank-2 inputs.
    shape_for_rank = {2: [-1, 1], 1: [-1], 0: []}
    target_shape = shape_for_rank.get(rank)
    if target_shape is not None:
        crossed = tf.reshape(crossed, target_shape)

    # Encode outputs.
    return utils.encode_categorical_inputs(
        crossed,
        output_mode=self.output_mode,
        depth=self.num_bins,
        sparse=self.sparse,
        dtype=self.compute_dtype,
    )
def update_state(self, data):
    """Accumulate token statistics from one batch of data.

    Token counts are always merged into `self.token_counts`. When
    `self.output_mode` is `TF_IDF`, per-document token counts and the number
    of documents seen are updated as well.

    Args:
        data: Batch of tokens; converted with `utils.ensure_tensor` using
            `self.vocabulary_dtype`. Rank-0 and rank-1 inputs are upranked
            to rank 2 by prepending axes.

    Raises:
        ValueError: If the layer already has a static input vocabulary.
    """
    if self._has_input_vocabulary:
        raise ValueError(
            "Cannot adapt {} layer after setting a static vocabulary via "
            "init argument "
            "or `set_vocabulary`.".format(self.__class__.__name__)
        )

    data = utils.ensure_tensor(data, dtype=self.vocabulary_dtype)

    # Prepend axes until the data is at least rank 2. A 1-d tensor is a
    # single document, so tf-idf needs the extra leading axis.
    while data.shape.rank in (0, 1):
        data = tf.expand_dims(data, 0)

    # Merge this batch's counts into the running totals.
    tokens, counts = self._num_tokens(data)
    updated_counts = counts + self.token_counts.lookup(tokens)
    self.token_counts.insert(tokens, updated_counts)

    if self.output_mode != TF_IDF:
        return

    def _unique_in_row(row):
        return tf.unique(row)[0]

    # Dedupe each row of our dataset.
    deduped_doc_data = tf.map_fn(_unique_in_row, data)

    # Flatten and count tokens.
    tokens, doc_counts = self._num_tokens(deduped_doc_data)
    updated_doc_counts = doc_counts + self.token_document_counts.lookup(tokens)
    self.token_document_counts.insert(tokens, updated_doc_counts)

    # Each row of the batch counts as one document.
    if tf_utils.is_ragged(data):
        docs_in_batch = data.nrows()
    else:
        docs_in_batch = tf.shape(data, out_type=tf.int64)[0]
    self.num_documents.assign_add(docs_in_batch)
def call(self, inputs):
    """Look up inputs and, unless in `INT` mode, encode the lookups.

    Args:
        inputs: Keys to look up; converted with `utils.ensure_tensor` using
            `self._key_dtype`. Dense, sparse and ragged inputs each take
            their own lookup path.

    Returns:
        In `INT` mode, the lookup result with the same rank as the input.
        Otherwise, the output of `utils.encode_categorical_inputs` applied
        to the lookups.
    """
    self._maybe_freeze_vocab_size()

    inputs = utils.ensure_tensor(inputs, dtype=self._key_dtype)

    # Some ops will not handle scalar input, so uprank to rank 1.
    was_scalar = inputs.shape.rank == 0
    if was_scalar:
        inputs = self._expand_dims(inputs, -1)

    if tf_utils.is_sparse(inputs):
        # Only the stored values are looked up; the structure is kept.
        looked_up_values = self._lookup_dense(inputs.values)
        lookups = tf.SparseTensor(
            inputs.indices, looked_up_values, inputs.dense_shape
        )
    elif tf_utils.is_ragged(inputs):
        lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs)
    else:
        lookups = self._lookup_dense(inputs)

    if self.output_mode == INT:
        # If we received a scalar input, downrank back to a scalar.
        if was_scalar:
            lookups = tf.squeeze(lookups, -1)
        return lookups

    if self.pad_to_max_tokens:
        depth = self.max_tokens
    else:
        depth = self._frozen_vocab_size

    if self.output_mode == TF_IDF:
        idf_weights = self.idf_weights_const
    else:
        idf_weights = None

    return utils.encode_categorical_inputs(
        lookups,
        output_mode=self.output_mode,
        depth=depth,
        dtype=self.compute_dtype,
        sparse=self.sparse,
        idf_weights=idf_weights,
    )
def _preprocess(self, inputs):
    """Standardize, split and n-gram the raw string inputs.

    Steps are applied in this order, each only if configured:
    standardization (`self._standardize`), splitting (`self._split`) and
    n-gram generation (`self._ngrams`).

    Args:
        inputs: String data; converted with `utils.ensure_tensor` using
            `tf.string`.

    Returns:
        The processed string tensor. It can be ragged or dense depending on
        the layer configuration.

    Raises:
        ValueError: If splitting is enabled and the input has rank > 1 with
            a last dimension other than 1, or if `self._split` is not a
            supported option.
    """
    inputs = utils.ensure_tensor(inputs, dtype=tf.string)

    if self._standardize in (LOWER, LOWER_AND_STRIP_PUNCTUATION):
        inputs = tf.strings.lower(inputs)
    if self._standardize in (STRIP_PUNCTUATION, LOWER_AND_STRIP_PUNCTUATION):
        inputs = tf.strings.regex_replace(inputs, DEFAULT_STRIP_REGEX, "")
    if callable(self._standardize):
        inputs = self._standardize(inputs)

    if self._split is not None:
        # If we are splitting, we validate that the last axis is of dimension
        # 1 and so can be squeezed out. We do this here instead of after
        # splitting for performance reasons - it's more expensive to squeeze
        # a ragged tensor.
        if inputs.shape.rank > 1:
            if inputs.shape[-1] != 1:
                raise ValueError(
                    "When using `TextVectorization` to tokenize strings, "
                    "the input rank must be 1 or the last shape dimension "
                    f"must be 1. Received: inputs.shape={inputs.shape} "
                    f"with rank={inputs.shape.rank}")
            inputs = tf.squeeze(inputs, axis=-1)

        if self._split == WHITESPACE:
            # This treats multiple whitespaces as one whitespace, and strips
            # leading and trailing whitespace.
            inputs = tf.strings.split(inputs)
        elif self._split == CHARACTER:
            inputs = tf.strings.unicode_split(inputs, "UTF-8")
        elif callable(self._split):
            inputs = self._split(inputs)
        else:
            # An f-string is used instead of `%` so that a tuple-valued
            # `split` still produces this ValueError rather than a
            # formatting TypeError. The option list includes 'character'
            # because the CHARACTER branch above handles it.
            raise ValueError(
                f"{self._split} is not a supported splitting. "
                "TextVectorization supports the following options "
                "for `split`: None, 'whitespace', 'character', or a "
                "Callable.")

    # Note that 'inputs' here can be either ragged or dense depending on the
    # configuration choices for this Layer. The strings.ngrams op, however,
    # does support both ragged and dense inputs.
    if self._ngrams is not None:
        inputs = tf.strings.ngrams(
            inputs, ngram_width=self._ngrams, separator=" ")

    return inputs
def call(self, inputs):
    """Map inputs to bin indices and encode them categorically.

    Args:
        inputs: Values to bin; converted with `utils.ensure_tensor`. For a
            `tf.SparseTensor` only the stored values are passed through
            `self._hash_values_to_bins`; indices and dense shape are kept.

    Returns:
        The output of `utils.encode_categorical_inputs` for the binned
        indices, using the layer's `output_mode`, `num_bins`, `sparse` and
        `compute_dtype`.
    """
    inputs = utils.ensure_tensor(inputs)

    def _encode(bin_indices):
        return utils.encode_categorical_inputs(
            bin_indices,
            output_mode=self.output_mode,
            depth=self.num_bins,
            sparse=self.sparse,
            dtype=self.compute_dtype,
        )

    if not isinstance(inputs, tf.SparseTensor):
        return _encode(self._hash_values_to_bins(inputs))

    # Rebuild the sparse tensor around the binned values.
    binned_values = self._hash_values_to_bins(inputs.values)
    sparse_indices = tf.SparseTensor(
        indices=inputs.indices,
        values=binned_values,
        dense_shape=inputs.dense_shape,
    )
    return _encode(sparse_indices)