Example #1
    def _initialize(self, documents):
        # change words into indices
        (documents, word_index) = reindex(documents)

        # figure out the number of topics: use n_topics if given, otherwise
        # fall back to the length of the doc-topic prior vector
        n_topics = (self.n_topics
                    if self.n_topics is not None
                    else len(self.doc_topic_prior))
        n_words = len(word_index)
        n_docs = len(documents)

        # build a doc-word count matrix
        doc_word_counts = np.zeros((n_docs, n_words))
        for (d, doc) in enumerate(documents):
            for (i, word) in enumerate(doc):
                doc_word_counts[d, word] += 1

        # build doc-topic and topic-word priors: a scalar prior is treated as
        # the concentration of a symmetric Dirichlet, a vector is used directly
        if isinstance(self.doc_topic_prior, Number):
            concentration = self.doc_topic_prior
            base = np.ones(n_topics) / n_topics
        else:
            concentration = 1.0
            base = self.doc_topic_prior
        doc_topic_prior = concentration * base

        if isinstance(self.topic_word_prior, Number):
            concentration = self.topic_word_prior
            base = np.ones(n_words) / n_words
        else:
            concentration = 1.0
            base = self.topic_word_prior
        topic_word_prior = concentration * base

        return (doc_word_counts, doc_topic_prior, topic_word_prior, word_index)
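Every snippet on this page calls a `reindex` helper that is not shown. Judging from how it is used (documents come back as lists of integer word ids and `len(word_index)` gives the vocabulary size), one plausible sketch is the following; the project's actual helper may differ in detail.

def reindex(documents):
    # Hypothetical reconstruction: assign each distinct word an integer id
    # and rewrite every document as a list of those ids.
    word_index = {}
    indexed = []
    for doc in documents:
        row = []
        for word in doc:
            if word not in word_index:
                word_index[word] = len(word_index)
            row.append(word_index[word])
        indexed.append(row)
    return (indexed, word_index)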
Example #2
  def infer(self, documents):
    # turn words into integer indices
    (documents, word_index) = reindex(documents)
    n_docs = len(documents)
    n_words = len(word_index)
    n_topics = self.n_topics

    # create document-word count matrix
    counts = doc_word_matrix(documents, n_words)

    # rank-k truncated SVD, keeping n_topics singular components
    (U, S, Vt) = linalg.svds(counts, k=n_topics)

    return {
        'topic_word': Vt,
        'doc_topic': U,
        'topic_weights': S,
        'word_index': word_index
    }
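The `doc_word_matrix` helper is also not shown. Based on how it is called here (the documents already hold integer word ids, and the result is fed to a truncated SVD), a minimal sketch might be:

import numpy as np

def doc_word_matrix(documents, n_words):
    # Hypothetical reconstruction: build an (n_docs, n_words) matrix of raw
    # word counts from documents given as lists of integer word ids.
    counts = np.zeros((len(documents), n_words))
    for (d, doc) in enumerate(documents):
        for word in doc:
            counts[d, word] += 1
    return counts

For a large corpus a scipy.sparse matrix would be the natural choice here, since a sparse truncated SVD can work directly on sparse input.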
Example #3
    def infer(self, documents, n_sweeps=5):
        # reindex words
        (documents, word_index) = reindex(documents)

        # initialize commonly used numbers
        n_docs = len(documents)
        n_words = len(word_index)
        n_topics = self.n_topics
        r = np.random.RandomState(0)

        # 1. Calculate moments (defer third till later)
        self.logger.debug("Constructing 1st and 2nd moments")
        m1 = self.moment1(n_words, documents)
        m2 = self.moment2(n_words, documents)

        # 2. Whiten
        self.logger.debug("Doing first SVD")
        pairs = self.pairs(n_words, documents, m1=m1, m2=m2)
        (A, Sigma, _) = linalg.svd(pairs)
        A = A[:, 0:n_topics]  # first k singular vectors
        Sigma = np.diag(Sigma[0:n_topics])  # first k singular values
        W = A.dot(np.sqrt(Sigma))

        # 3. SVD

        # # SVD via random projection
        # self.logger.debug("Constructing 3rd moment")
        # axis = r.randn(n_topics); axis /= linalg.norm(axis)  # random unit norm vector
        # triples = self.triples(n_words, documents, W.dot(axis), m1=m1, m2=m2)
        # self.logger.debug("Performing second SVD")
        # V = linalg.svd(W.T.dot(triples).dot(W))[0]   # columns are left singular vectors

        # SVD via power method
        self.logger.debug("Starting power iterations")
        V = r.randn(n_topics, n_topics)  # random starting matrix
        V = linalg.orth(V)               # orthonormalize it to get an initial basis
        for iteration in range(n_sweeps):
            self.logger.debug("iteration %d" % (iteration, ))
            for t in range(n_topics):
                Wv = W.dot(V[:, t])
                triples = self.triples(n_words, documents, Wv, m1=m1, m2=m2)
                V[:, t] = W.T.dot(triples).dot(Wv)
            V = linalg.orth(V)

        # 4. Reconstruct and Normalize
        self.logger.debug("Reconstructing topic-word vectors")
        W_inv = linalg.pinv(W)
        O = np.zeros((n_words, n_topics))
        for t in range(n_topics):
            O[:, t] = W_inv.T.dot(V[:, t])

            # change sign of singular vector
            i = np.argmax(np.abs(O[:, t]))
            O[:, t] = np.sign(O[i, t]) * O[:, t]

            # drop negative components and normalize
            O[O[:, t] < 0, t] = 0
            O[:, t] /= linalg.norm(O[:, t], 1)

        return {
            'topic_word': O.T,  # each row is a topic
            'word_index': word_index,
        }
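A short, hypothetical usage sketch for the result dictionary returned above. The `model` and `documents` names are placeholders, and `word_index` is assumed to map each word to its integer id (invert the lookup if the helper stores it the other way around).

import numpy as np

result = model.infer(documents)
id_to_word = {i: w for (w, i) in result['word_index'].items()}
for (t, row) in enumerate(result['topic_word']):    # each row is a topic
    top = np.argsort(row)[::-1][:10]                # ten highest-weight words
    print('topic %d:' % t, [id_to_word[i] for i in top])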
Example #4
    def infer(self, documents, n_sweeps=1000, word_topic=None):
        r = np.random.RandomState(0)

        # initialize counts for each doc-topic and topic-word pair using the prior
        (doc_topic_counts, topic_word_counts,
         word_index) = self._initialize(documents)
        topic_counts = np.sum(topic_word_counts, axis=1)
        n_topics = topic_word_counts.shape[0]
        n_docs = doc_topic_counts.shape[0]
        n_words = len(word_index)

        # transform documents into lists of word indices
        (documents, word_index) = reindex(documents)

        # initialize topics for all words uniformly at random
        if word_topic is None:
            word_topic = [[
                categorical(np.ones(n_topics) / n_topics, r) for word in doc
            ] for doc in documents]

        # initialize doc-topic and topic-word counts
        for (d, doc) in enumerate(documents):
            for (i, word) in enumerate(doc):
                # get topic for this word
                t = word_topic[d][i]

                # increment counts
                doc_topic_counts[d, t] += 1
                topic_word_counts[t, word] += 1
                topic_counts[t] += 1

        # resample word topics
        for sweep in range(n_sweeps):
            self.logger.debug('starting sweep #%d' % (sweep, ))
            for (d, doc) in enumerate(documents):

                if d % 100 == 0:
                    self.logger.debug('starting document #%d' % (d, ))

                for (i, word) in enumerate(doc):
                    # get topic for this word in this document
                    t = word_topic[d][i]

                    # remove it from counts
                    doc_topic_counts[d, t] -= 1
                    topic_word_counts[t, word] -= 1
                    topic_counts[t] -= 1

                    # calculate P(t | everything else)
                    prob = [
                        doc_topic_counts[d, t] * topic_word_counts[t, word] /
                        topic_counts[t] for t in range(n_topics)
                    ]
                    prob = np.array(prob) / np.sum(prob)

                    # select topic
                    t = categorical(prob, r)

                    # increment counts
                    doc_topic_counts[d, t] += 1
                    topic_word_counts[t, word] += 1
                    topic_counts[t] += 1

                    # set topic for word
                    word_topic[d][i] = t

            # sum of counts along each row
            topic_word_sums = topic_counts[:, np.newaxis]
            doc_topic_sums = np.sum(doc_topic_counts, axis=1)[:, np.newaxis]

            yield {
                'topic_word': np.copy(topic_word_counts) / topic_word_sums,
                'doc_topic': np.copy(doc_topic_counts) / doc_topic_sums,
                'word_topic': copy.deepcopy(word_topic),
                'word_index': word_index
            }
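The Gibbs sampler above relies on a `categorical` helper for drawing a topic index from a discrete distribution with the shared RandomState. It is not shown on this page; a minimal sketch consistent with how it is called could be:

import numpy as np

def categorical(p, r):
    # Hypothetical helper: draw one index from the discrete distribution p
    # (a 1-D array of probabilities summing to 1) using the RandomState r.
    return int(r.choice(len(p), p=p))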