def tfidf(x, vocab_size, smooth=True, name=None):
  """Maps the terms in x to their term frequency * inverse document frequency.

  The inverse document frequency of a term is calculated as
  1 + log((corpus size + 1) / (document frequency of term + 1)) by default.

  Example usage:
    example strings: [["I", "like", "pie", "pie", "pie"], ["yum", "yum", "pie"]]
    in: SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                              [1, 0], [1, 1], [1, 2]],
                     values=[1, 2, 0, 0, 0, 3, 3, 0])
    out: SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                      values=[1, 2, 0, 3, 0])
         SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                      values=[(1/5)*(log(3/2)+1), (1/5)*(log(3/2)+1), (3/5),
                              (2/3)*(log(3/2)+1), (1/3)])
    NOTE that the first doc's duplicate "pie" strings have been combined to
    one output, as have the second doc's duplicate "yum" strings.

  Args:
    x: A `SparseTensor` representing int64 values (most likely that are the
        result of calling string_to_int on a tokenized string).
    vocab_size: An int - the count of vocab used to turn the string into int64s
        including any OOV buckets.
    smooth: A bool indicating if the inverse document frequency should be
        smoothed. If True, which is the default, then the idf is calculated as
        1 + log((corpus size + 1) / (document frequency of term + 1)).
        Otherwise, the idf is
        1 + log((corpus size) / (document frequency of term)), which could
        result in a division by zero error.
    name: (Optional) A name for this operation.

  Returns:
    Two `SparseTensor`s with indices [index_in_batch, index_in_bag_of_words].
    The first has values vocab_index, which is taken from input `x`.
    The second has values tfidf_weight.
  """

  def _to_vocab_range(x):
    """Enforces that the vocab_ids in x are positive."""
    return tf.SparseTensor(
        indices=x.indices,
        values=tf.mod(x.values, vocab_size),
        dense_shape=x.dense_shape)

  with tf.name_scope(name, 'tfidf'):
    cleaned_input = _to_vocab_range(x)

    term_frequencies = _to_term_frequency(cleaned_input, vocab_size)

    count_docs_with_term_column = _count_docs_with_term(term_frequencies)
    # Expand dims to get around the min_tensor_rank checks.
    sizes = tf.expand_dims(tf.shape(cleaned_input)[0], 0)
    # [batch, vocab] - tfidf
    tfidfs = _to_tfidf(term_frequencies,
                       analyzers.sum(count_docs_with_term_column,
                                     reduce_instance_dims=False),
                       analyzers.sum(sizes),
                       smooth)
    return _split_tfidfs_to_outputs(tfidfs)
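
# A minimal worked check (illustrative, not part of the original module) of
# the smooth-idf arithmetic in the `tfidf` docstring example above. It uses
# pure Python only; the helper name `_smooth_idf_example` is hypothetical.
import math


def _smooth_idf_example(corpus_size, doc_freq):
  """Returns the smoothed idf: 1 + log((corpus_size + 1) / (doc_freq + 1))."""
  return 1.0 + math.log((corpus_size + 1.0) / (doc_freq + 1.0))


# Corpus: [["I", "like", "pie", "pie", "pie"], ["yum", "yum", "pie"]].
# "pie" is in both docs (df=2); "I", "like" and "yum" are each in one (df=1).
assert _smooth_idf_example(2, 2) == 1.0  # idf("pie") = 1 + log(3/3) = 1
assert abs(_smooth_idf_example(2, 1) - (math.log(3. / 2.) + 1)) < 1e-9
# Doc 0's weight for "pie": tf = 3/5 and idf = 1, so tfidf = 3/5, matching
# the docstring example.
assert (3. / 5.) * _smooth_idf_example(2, 2) == 3. / 5.
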
def tfidf_weights(x, vocab_size):
  """Maps the terms in x to their (1/doc_length) * inverse document frequency.

  Args:
    x: A `SparseTensor` representing int64 values (most likely that are the
        result of calling string_to_int on a tokenized string).
    vocab_size: An int - the count of vocab used to turn the string into int64s
        including any OOV buckets.

  Returns:
    A `SparseTensor` where each int value is mapped to a double equal to
    ((1 if that term appears in that row, 0 otherwise) / the number of terms
    in that row) * the log of (the number of rows in `x` / (1 + the number of
    rows in `x` where the term appears at least once))

  NOTE: This is intended to be used with the feature_column 'sum' combiner to
  arrive at the true term frequencies.
  """

  def _to_vocab_range(x):
    """Enforces that the vocab_ids in x are positive."""
    return tf.SparseTensor(indices=x.indices,
                           values=tf.mod(x.values, vocab_size),
                           dense_shape=x.dense_shape)

  def _to_doc_contains_term(x):
    """Creates a SparseTensor with 1s at every doc/term pair index.

    Args:
      x: a SparseTensor of int64 representing string indices in vocab.

    Returns:
      a SparseTensor with 1s at indices <doc_index_in_batch>,
          <term_index_in_vocab> for every term/doc pair.
    """
    # Construct intermediary sparse tensor with indices
    # [<doc>, <term_index_in_doc>, <vocab_id>] and tf.ones values.
    split_indices = tf.to_int64(
        tf.split(x.indices, axis=1, num_or_size_splits=2))
    expanded_values = tf.to_int64(tf.expand_dims(x.values, 1))
    next_index = tf.concat(
        [split_indices[0], split_indices[1], expanded_values], axis=1)

    next_values = tf.ones_like(x.values)
    vocab_size_as_tensor = tf.constant([vocab_size], dtype=tf.int64)
    next_shape = tf.concat([x.dense_shape, vocab_size_as_tensor], 0)

    next_tensor = tf.SparseTensor(indices=tf.to_int64(next_index),
                                  values=next_values,
                                  dense_shape=next_shape)

    # Take the intermediary tensor and reduce over the term_index_in_doc
    # dimension. This produces a tensor with indices [<doc_id>, <term_id>]
    # and values [count_of_term_in_doc] and shape batch x vocab_size.
    term_count_per_doc = tf.sparse_reduce_sum_sparse(next_tensor, 1)

    one_if_doc_contains_term = tf.SparseTensor(
        indices=term_count_per_doc.indices,
        values=tf.to_double(tf.greater(term_count_per_doc.values, 0)),
        dense_shape=term_count_per_doc.dense_shape)

    return one_if_doc_contains_term

  def _to_idf_over_doc_size(x, reduced_term_freq, corpus_size):
    """Calculates the inverse document frequency of terms in the corpus.

    Args:
      x: a `SparseTensor` of int64 representing string indices in vocab.
      reduced_term_freq: A `Tensor` of shape (vocab_size,) that represents the
          count of the number of documents with each term.
      corpus_size: A scalar count of the number of documents in the corpus.

    Returns:
      The tf*idf values
    """
    # Add one to the reduced term frequencies to avoid dividing by zero.
    idf = tf.log(
        tf.to_double(corpus_size) / (1.0 + tf.to_double(reduced_term_freq)))

    dense_doc_sizes = tf.to_double(
        tf.sparse_reduce_sum(
            tf.SparseTensor(indices=x.indices,
                            values=tf.ones_like(x.values),
                            dense_shape=x.dense_shape), 1))

    # For every term in x, divide the idf by the doc size.
    # The two gathers both result in shape <sum_doc_sizes>.
    idf_over_doc_size = (tf.gather(idf, x.values) /
                         tf.gather(dense_doc_sizes, x.indices[:, 0]))

    return tf.SparseTensor(indices=x.indices,
                           values=tf.to_float(idf_over_doc_size),
                           dense_shape=x.dense_shape)

  cleaned_input = _to_vocab_range(x)

  docs_with_terms = _to_doc_contains_term(cleaned_input)

  def count_docs_with_term(term_frequency):
    # Sum w/in batch.
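    # Each present doc/term pair contributes a 1; summing over the batch
    # dimension (axis 0) then gives, for each vocab id, the number of
    # documents in this batch that contain that term.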
    count_of_doc_inter = tf.SparseTensor(
        indices=term_frequency.indices,
        values=tf.ones_like(term_frequency.values),
        dense_shape=term_frequency.dense_shape)
    out = tf.sparse_reduce_sum(count_of_doc_inter, axis=0)
    return tf.expand_dims(out, 0)

  count_docs_with_term_column = count_docs_with_term(docs_with_terms)
  # Expand dims to get around the min_tensor_rank checks.
  sizes = tf.expand_dims(tf.shape(cleaned_input)[0], 0)
  return _to_idf_over_doc_size(
      cleaned_input,
      analyzers.sum(count_docs_with_term_column, reduce_instance_dims=False),
      analyzers.sum(sizes))
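
# A hedged usage sketch, not a definitive recipe: how `tfidf` might be called
# from a tf.Transform preprocessing_fn, assuming this module is exposed via
# the `tensorflow_transform` (tft) package and that `string_to_int` (which
# the docstrings above reference) accepts a `top_k` vocabulary cap. The names
# `preprocessing_fn`, `review`, and `VOCAB_SIZE` are illustrative assumptions.
# Kept as a comment block to avoid a circular import inside this module.
#
#   import tensorflow as tf
#   import tensorflow_transform as tft
#
#   VOCAB_SIZE = 10000  # assumed vocabulary size
#
#   def preprocessing_fn(inputs):
#     tokens = tf.string_split(inputs['review'])
#     # string_to_int maps out-of-vocab tokens to -1; passing VOCAB_SIZE + 1
#     # below lets _to_vocab_range's tf.mod fold -1 into its own bucket.
#     indices = tft.string_to_int(tokens, top_k=VOCAB_SIZE)
#     vocab_indices, weights = tft.tfidf(indices, VOCAB_SIZE + 1)
#     return {'review_indices': vocab_indices, 'review_weights': weights}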