Example #1
import tensorflow as tf
from tensorflow_transform import analyzers
def tfidf(x, vocab_size, smooth=True, name=None):
  """Maps the terms in x to their term frequency * inverse document frequency.

  The inverse document frequency of a term is calculated as 1 +
  log((corpus size + 1) / (document frequency of term + 1)) by default.

  Example usage:
    example strings [["I", "like", "pie", "pie", "pie"], ["yum", "yum", "pie"]]
    in: SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                              [1, 0], [1, 1], [1, 2]],
                     values=[1, 2, 0, 0, 0, 3, 3, 0])
    out: SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                      values=[1, 2, 0, 3, 0])
         SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                      values=[(1/5)*(log(3/2)+1), (1/5)*(log(3/2)+1), (3/5),
                              (2/3)*(log(3/2)+1), (1/3)])
    NOTE that the first doc's duplicate "pie" strings have been combined to
    one output, as have the second doc's duplicate "yum" strings.

  Args:
    x: A `SparseTensor` representing int64 values (most likely the result of
        calling string_to_int on a tokenized string).
    vocab_size: An int; the size of the vocabulary used to map the strings to
        int64s, including any OOV buckets.
    smooth: A bool indicating if the inverse document frequency should be
        smoothed. If True, which is the default, then the idf is calculated as
        1 + log((corpus size + 1) / (document frequency of term + 1)).
        Otherwise, the idf is
        1 + log((corpus size) / (document frequency of term)), which could
        result in a division by zero error.
    name: (Optional) A name for this operation.

  Returns:
    Two `SparseTensor`s with indices [index_in_batch, index_in_bag_of_words].
    The first has values vocab_index, which is taken from input `x`.
    The second has values tfidf_weight.
  """

  def _to_vocab_range(x):
    """Enforces that the vocab_ids in x are positive."""
    return tf.SparseTensor(
        indices=x.indices,
        values=tf.mod(x.values, vocab_size),
        dense_shape=x.dense_shape)

  with tf.name_scope(name, 'tfidf'):
    cleaned_input = _to_vocab_range(x)

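    # [batch, vocab] - per-document term counts divided by document length.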
    term_frequencies = _to_term_frequency(cleaned_input, vocab_size)

    count_docs_with_term_column = _count_docs_with_term(term_frequencies)
    # Expand dims to get around the min_tensor_rank checks
    sizes = tf.expand_dims(tf.shape(cleaned_input)[0], 0)
    # [batch, vocab] - tfidf
    tfidfs = _to_tfidf(term_frequencies,
                       analyzers.sum(count_docs_with_term_column,
                                     reduce_instance_dims=False),
                       analyzers.sum(sizes),
                       smooth)
    return _split_tfidfs_to_outputs(tfidfs)
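
For context, a minimal sketch of how this mapper is typically wired into a
tf.Transform preprocessing_fn (assuming this is the tft.tfidf mapper from a
TF 1.x-era tensorflow_transform in which tft.string_to_int is also available;
the 'review' column name and VOCAB_SIZE below are purely illustrative):

import tensorflow as tf
import tensorflow_transform as tft

VOCAB_SIZE = 10000  # illustrative vocabulary size

def preprocessing_fn(inputs):
  # Tokenize the raw text column into a SparseTensor of string tokens.
  tokens = tf.string_split(inputs['review'])
  # Map tokens to int64 ids; tokens outside the top VOCAB_SIZE terms share an
  # out-of-vocabulary id, hence the "+ 1" passed to tfidf below.
  token_ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)
  # tfidf returns two SparseTensors with identical indices: the vocab ids
  # present in each document and the matching tf*idf weight for each id.
  bow_indices, weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)
  return {'review_bow_indices': bow_indices, 'review_weight': weights}
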
Example #2
import tensorflow as tf
from tensorflow_transform import analyzers
def tfidf_weights(x, vocab_size):
    """Maps the terms in x to their (1/doc_length) * inverse document frequency.

    Args:
      x: A `SparseTensor` representing int64 values (most likely the result of
          calling string_to_int on a tokenized string).
      vocab_size: An int; the size of the vocabulary used to map the strings to
          int64s, including any OOV buckets.

    Returns:
      A `SparseTensor` where each int value is mapped to a double equal to
      (1 if that term appears in that row, 0 otherwise) / (the number of terms
      in that row) * the log of (the number of rows in `x` / (1 + the number of
      rows in `x` where the term appears at least once)).

    NOTE:
      This is intended to be used with the feature_column 'sum' combiner to
      arrive at the true term frequencies.
    """
    def _to_vocab_range(x):
        """Enforces that the vocab_ids in x are positive."""
        return tf.SparseTensor(indices=x.indices,
                               values=tf.mod(x.values, vocab_size),
                               dense_shape=x.dense_shape)

    def _to_doc_contains_term(x):
        """Creates a SparseTensor with 1s at every doc/term pair index.

        Args:
          x: a SparseTensor of int64 representing string indices in vocab.

        Returns:
          a SparseTensor with 1s at indices <doc_index_in_batch>,
              <term_index_in_vocab> for every term/doc pair.
        """
        # Construct intermediary sparse tensor with indices
        # [<doc>, <term_index_in_doc>, <vocab_id>] and tf.ones values.
        split_indices = tf.to_int64(
            tf.split(x.indices, axis=1, num_or_size_splits=2))
        expanded_values = tf.to_int64(tf.expand_dims(x.values, 1))
        next_index = tf.concat(
            [split_indices[0], split_indices[1], expanded_values], axis=1)

        next_values = tf.ones_like(x.values)
        vocab_size_as_tensor = tf.constant([vocab_size], dtype=tf.int64)
        next_shape = tf.concat([x.dense_shape, vocab_size_as_tensor], 0)

        next_tensor = tf.SparseTensor(indices=tf.to_int64(next_index),
                                      values=next_values,
                                      dense_shape=next_shape)

        # Take the intermediary tensor and reduce over the term_index_in_doc
        # dimension. This produces a tensor with indices [<doc_id>, <term_id>],
        # values [count_of_term_in_doc] and shape batch x vocab_size.
        term_count_per_doc = tf.sparse_reduce_sum_sparse(next_tensor, 1)

        one_if_doc_contains_term = tf.SparseTensor(
            indices=term_count_per_doc.indices,
            values=tf.to_double(tf.greater(term_count_per_doc.values, 0)),
            dense_shape=term_count_per_doc.dense_shape)

        return one_if_doc_contains_term

    def _to_idf_over_doc_size(x, reduced_term_freq, corpus_size):
        """Calculates the inverse document frequency of terms in the corpus.

        Args:
          x: a `SparseTensor` of int64 representing string indices in vocab.
          reduced_term_freq: A `Tensor` of shape (vocab_size,) that represents
              the count of the number of documents containing each term.
          corpus_size: A scalar count of the number of documents in the corpus.

        Returns:
          A `SparseTensor` with the idf value of each term, divided by the size
          of the document in which that term occurrence appears.
        """
        # Add one to the reduced term frequencies to avoid dividing by zero.
        idf = tf.log(
            tf.to_double(corpus_size) /
            (1.0 + tf.to_double(reduced_term_freq)))

        dense_doc_sizes = tf.to_double(
            tf.sparse_reduce_sum(
                tf.SparseTensor(indices=x.indices,
                                values=tf.ones_like(x.values),
                                dense_shape=x.dense_shape), 1))

        # For every term in x, divide the idf by the doc size.
        # The two gathers both result in shape <sum_doc_sizes>
        idf_over_doc_size = (tf.gather(idf, x.values) /
                             tf.gather(dense_doc_sizes, x.indices[:, 0]))

        return tf.SparseTensor(indices=x.indices,
                               values=tf.to_float(idf_over_doc_size),
                               dense_shape=x.dense_shape)

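    # Map the raw ids into [0, vocab_size), then mark, per document, which
    # vocab terms appear at least once.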
    cleaned_input = _to_vocab_range(x)

    docs_with_terms = _to_doc_contains_term(cleaned_input)

    def count_docs_with_term(term_frequency):
        # Sum w/in batch.
        count_of_doc_inter = tf.SparseTensor(
            indices=term_frequency.indices,
            values=tf.ones_like(term_frequency.values),
            dense_shape=term_frequency.dense_shape)
        out = tf.sparse_reduce_sum(count_of_doc_inter, axis=0)
        return tf.expand_dims(out, 0)

    count_docs_with_term_column = count_docs_with_term(docs_with_terms)
    # Expand dims to get around the min_tensor_rank checks
    sizes = tf.expand_dims(tf.shape(cleaned_input)[0], 0)
    return _to_idf_over_doc_size(
        cleaned_input,
        analyzers.sum(count_docs_with_term_column, reduce_instance_dims=False),
        analyzers.sum(sizes))
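
A hedged usage sketch for this helper (the 'terms' column name and VOCAB_SIZE
are illustrative, and tft.string_to_int is assumed from a TF 1.x-era
tensorflow_transform). The returned weights share their indices with the id
tensor, so the pair can be fed to a model as an id column plus a weight column
whose 'sum' combiner accumulates the per-occurrence weights into the true
tf*idf value for each term:

import tensorflow as tf
import tensorflow_transform as tft

VOCAB_SIZE = 10000  # illustrative vocabulary size

def preprocessing_fn(inputs):
    # Tokenize and map to int64 ids; out-of-vocabulary tokens share one id.
    terms = tf.string_split(inputs['terms'])
    term_ids = tft.string_to_int(terms, top_k=VOCAB_SIZE)
    # tfidf_weights keeps the indices of term_ids and assigns each occurrence
    # (1 / doc_length) * idf; duplicate ids in a row are NOT combined here,
    # which is why a downstream 'sum' combiner recovers tf*idf.
    weights = tfidf_weights(term_ids, VOCAB_SIZE + 1)
    return {'term_ids': term_ids, 'term_weights': weights}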