import copy
from numbers import Number

import numpy as np
from scipy import linalg
from scipy.sparse import linalg as sparse_linalg


def _initialize(self, documents):
    # change words into indices
    (documents, word_index) = reindex(documents)

    # figure out the number of topics
    n_topics = (
        self.n_topics
        if self.n_topics is not None
        else len(self.doc_topic_prior)
    )
    n_words = len(word_index)
    n_docs = len(documents)

    # build a doc-word count matrix
    doc_word_counts = np.zeros((n_docs, n_words))
    for (d, doc) in enumerate(documents):
        for (i, word) in enumerate(doc):
            doc_word_counts[d, word] += 1

    # build doc-topic and topic-word priors: a scalar prior is treated as
    # the concentration of a symmetric Dirichlet; a vector prior is used
    # directly as the base measure
    if isinstance(self.doc_topic_prior, Number):
        concentration = self.doc_topic_prior
        base = np.ones(n_topics) / n_topics
    else:
        concentration = 1.0
        base = self.doc_topic_prior
    doc_topic_prior = concentration * base

    if isinstance(self.topic_word_prior, Number):
        concentration = self.topic_word_prior
        base = np.ones(n_words) / n_words
    else:
        concentration = 1.0
        base = self.topic_word_prior
    topic_word_prior = concentration * base

    return (doc_word_counts, doc_topic_prior, topic_word_prior, word_index)

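# Every method in this module calls a `reindex` helper that is not defined
# here. The sketch below is an assumption about its contract, inferred from
# the call sites: it replaces each word with an integer id and returns the
# re-encoded documents along with a word -> id mapping whose length is the
# vocabulary size.
def reindex(documents):
    """Minimal sketch (hypothetical): map words to integer indices."""
    word_index = {}
    indexed = []
    for doc in documents:
        indexed_doc = []
        for word in doc:
            if word not in word_index:
                word_index[word] = len(word_index)
            indexed_doc.append(word_index[word])
        indexed.append(indexed_doc)
    return (indexed, word_index)
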
def infer(self, documents):
    # turn words into integer indices
    (documents, word_index) = reindex(documents)
    n_docs = len(documents)
    n_words = len(word_index)
    n_topics = self.n_topics

    # create document-word count matrix
    counts = doc_word_matrix(documents, n_words)

    # run a truncated SVD, keeping the top n_topics singular triplets
    # (svds lives in scipy.sparse.linalg, not scipy.linalg)
    (U, S, Vt) = sparse_linalg.svds(counts, k=n_topics)

    return {
        'topic_word': Vt,
        'doc_topic': U,
        'topic_weights': S,
        'word_index': word_index,
    }

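# `doc_word_matrix` is also assumed rather than defined in this file. Since
# `svds` accepts sparse input, a plausible sketch builds a sparse count
# matrix; the name and signature are taken from the call site above, but
# the implementation itself is a guess.
import scipy.sparse

def doc_word_matrix(documents, n_words):
    """Hypothetical sketch: (n_docs, n_words) sparse matrix of word counts."""
    counts = scipy.sparse.lil_matrix((len(documents), n_words))
    for (d, doc) in enumerate(documents):
        for word in doc:
            counts[d, word] += 1
    return counts.tocsr()
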
def infer(self, documents, n_sweeps=5):
    # reindex words
    (documents, word_index) = reindex(documents)

    # initialize commonly used numbers
    n_docs = len(documents)
    n_words = len(word_index)
    n_topics = self.n_topics
    r = np.random.RandomState(0)

    # 1. Calculate moments (defer third till later)
    self.logger.debug("Constructing 1st and 2nd moments")
    m1 = self.moment1(n_words, documents)
    m2 = self.moment2(n_words, documents)

    # 2. Whiten
    self.logger.debug("Doing first SVD")
    pairs = self.pairs(n_words, documents, m1=m1, m2=m2)
    (A, Sigma, _) = linalg.svd(pairs)
    A = A[:, 0:n_topics]                # first k singular vectors
    Sigma = np.diag(Sigma[0:n_topics])  # first k singular values
    W = A.dot(np.sqrt(Sigma))

    # 3. SVD

    # # SVD via random projection
    # self.logger.debug("Constructing 3rd moment")
    # axis = r.randn(n_topics)
    # axis /= linalg.norm(axis)  # random unit norm vector
    # triples = self.triples(n_words, documents, W.dot(axis), m1=m1, m2=m2)
    # self.logger.debug("Performing second SVD")
    # V = linalg.svd(W.T.dot(triples).dot(W))[0]  # columns are left singular vectors

    # SVD via power method
    self.logger.debug("Starting power iterations")
    V = r.randn(n_topics, n_topics)  # initialize an orthonormal basis
    V = linalg.orth(V)
    for iteration in range(n_sweeps):
        self.logger.debug("iteration %d" % (iteration,))
        for t in range(n_topics):
            Wv = W.dot(V[:, t])
            triples = self.triples(n_words, documents, Wv, m1=m1, m2=m2)
            V[:, t] = W.T.dot(triples).dot(Wv)
        V = linalg.orth(V)

    # 4. Reconstruct and Normalize
    self.logger.debug("Reconstructing topic-word vectors")
    W_inv = linalg.pinv(W)
    O = np.zeros((n_words, n_topics))
    for t in range(n_topics):
        O[:, t] = W_inv.T.dot(V[:, t])

        # change sign of singular vector
        i = np.argmax(np.abs(O[:, t]))
        O[:, t] = np.sign(O[i, t]) * O[:, t]

        # drop negative components and normalize
        O[O[:, t] < 0, t] = 0
        O[:, t] /= linalg.norm(O[:, t], 1)

    return {
        'topic_word': O.T,  # each row is a topic
        'word_index': word_index,
    }

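# `moment1`, `moment2`, `pairs`, and `triples` are method calls whose
# definitions are not part of this file, and moment conventions differ
# between spectral-LDA write-ups. Purely as an illustration of the kind of
# quantity involved (an assumption, not this class's actual code), the
# first moment can be estimated as the average of the per-document
# empirical word distributions:
def moment1(n_words, documents):
    """Hypothetical sketch: empirical estimate of the expected one-hot
    word vector, averaged over documents."""
    m1 = np.zeros(n_words)
    for doc in documents:
        freq = np.zeros(n_words)
        for word in doc:
            freq[word] += 1
        m1 += freq / len(doc)
    return m1 / len(documents)
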
def infer(self, documents, n_sweeps=1000, word_topic=None):
    r = np.random.RandomState(0)

    # initialize counts for each doc-topic and topic-word pair using the prior
    (doc_topic_counts, topic_word_counts, word_index) = (
        self._initialize(documents)
    )
    topic_counts = np.sum(topic_word_counts, axis=1)
    n_topics = topic_word_counts.shape[0]
    n_docs = doc_topic_counts.shape[0]
    n_words = len(word_index)

    # transform documents into lists of word indices
    (documents, word_index) = reindex(documents)

    # initialize topics for all words uniformly at random
    if word_topic is None:
        word_topic = [
            [categorical(np.ones(n_topics) / n_topics, r) for word in doc]
            for doc in documents
        ]

    # initialize doc-topic and topic-word counts
    for (d, doc) in enumerate(documents):
        for (i, word) in enumerate(doc):
            # get topic for this word
            t = word_topic[d][i]

            # increment counts
            doc_topic_counts[d, t] += 1
            topic_word_counts[t, word] += 1
            topic_counts[t] += 1

    # resample word topics
    for sweep in range(n_sweeps):
        self.logger.debug('starting sweep #%d' % (sweep,))
        for (d, doc) in enumerate(documents):
            if d % 100 == 0:
                self.logger.debug('starting document #%d' % (d,))
            for (i, word) in enumerate(doc):
                # get topic for this word in this document
                t = word_topic[d][i]

                # remove it from counts
                doc_topic_counts[d, t] -= 1
                topic_word_counts[t, word] -= 1
                topic_counts[t] -= 1

                # calculate P(t | everything else)
                prob = [
                    doc_topic_counts[d, t]
                    * topic_word_counts[t, word]
                    / topic_counts[t]
                    for t in range(n_topics)
                ]
                prob = np.array(prob) / np.sum(prob)

                # select topic
                t = categorical(prob, r)

                # increment counts
                doc_topic_counts[d, t] += 1
                topic_word_counts[t, word] += 1
                topic_counts[t] += 1

                # set topic for word
                word_topic[d][i] = t

        # sum of counts along each row
        topic_word_sums = topic_counts[:, np.newaxis]
        doc_topic_sums = np.sum(doc_topic_counts, axis=1)[:, np.newaxis]

        yield {
            'topic_word': np.copy(topic_word_counts) / topic_word_sums,
            'doc_topic': np.copy(doc_topic_counts) / doc_topic_sums,
            'word_topic': copy.deepcopy(word_topic),
            'word_index': word_index,
        }

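# `categorical(prob, r)` is used above to draw a topic index from a
# discrete distribution with the given RandomState, but its definition is
# not in this file. A minimal sketch of that assumed contract:
def categorical(prob, r):
    """Hypothetical sketch: sample one index from the categorical
    distribution `prob` using the numpy RandomState `r`."""
    return int(r.choice(len(prob), p=prob))
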