def findAnchorTopics_Orig(Data, K=10, loss='L2', seed=0, lowerDim=1000,
                          minDocPerWord=0, eps=1e-4, doRecover=1):
    """ Estimate and return K topics using anchor word method

    Returns
    -------
    topics : numpy 2D array, size K x V
    """
    from Q_matrix import generate_Q_matrix
    from fastRecover import do_recovery
    params = Params(seed=seed, lowerDim=lowerDim,
                    minDocPerWord=minDocPerWord, eps=eps)
    assert isinstance(Data, bnpy.data.DataObj)
    DocWordMat = Data.getSparseDocTypeCountMatrix()
    if not str(type(DocWordMat)).count('csr_matrix') > 0:
        raise NotImplementedError('Need CSR matrix')
    Q = generate_Q_matrix(DocWordMat.copy().T)
    anchors = selectAnchorWords(DocWordMat.tocsc(), Q, K, params)
    if doRecover:
        topics, topic_likelihoods = do_recovery(Q, anchors, loss, params)
        topics = topics.T
        topics = topics / topics.sum(axis=1)[:, np.newaxis]
        return topics
    else:
        return Q, anchors
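A minimal usage sketch for the wrapper above, assuming a bnpy bag-of-words dataset has already been loaded as Data (a hypothetical variable); the return shapes follow the docstring, and doRecover=0 takes the else branch, returning the co-occurrence matrix and anchor indices.

# Hypothetical usage sketch: "Data" stands in for a bnpy DataObj loaded elsewhere.
topics = findAnchorTopics_Orig(Data, K=10, loss='L2', seed=0)
print(topics.shape)        # (K, V); each row is a normalized topic-word distribution
# With doRecover=0 the co-occurrence matrix and anchor word indices are returned instead.
Q, anchors = findAnchorTopics_Orig(Data, K=10, doRecover=0)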
var = np.zeros((M.shape))
row_sums = Q_emp.sum(1)
for i in xrange(len(Q_emp[:, 0])):
    Q_bar[i, :] = Q_emp[i, :] / float(row_sums[i])
    var[i, :] = var_analytical[i, :] / (float(row_sums[i])**2)

# find anchors
(anchors, anchor_indices) = gs.Projection_Find(Q_bar, K, candidate_anchors, var)
anchor_indices = [int(a) for a in anchor_indices]
print "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[int(a)]

# recover topics
A, topic_likelihoods, R = do_recovery(Q_emp, anchors, loss, params)
print "done recovering"
np.savetxt(outfile + ".A", A)
np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)
np.savetxt(outfile + ".R", R)
# np.savetxt(outfile+".Q", Q)

# display
f = file(outfile + ".topwords", 'w')
for k in xrange(K):
    mask = A[:, k] > 0.01
    topwords = [x for x in np.argsort(A[:, k]) if mask[x]][::-1]
    # topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
    # print params.top_words
    # print 'npargsort', A[:, k]
vocab = file(vocab_file).read().strip().split()

# check that Q sum is 1 or close to it
print "Q sum is", Q.sum()
V = Q.shape[0]
print "done reading documents"

# find anchors - this step uses a random projection
# into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]

# recover topics
A, topic_likelihoods = do_recovery(Q, anchors, loss, params)
print "done recovering"
np.savetxt(outfile + ".A", A)
np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)

# display
f = file(outfile + ".topwords", 'w')
for k in xrange(K):
    topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
    print vocab[anchors[k]], ':',
    print >>f, vocab[anchors[k]], ':',
    for w in topwords:
        print vocab[w],
        print >>f, vocab[w],
    print ""
def anchor_words(D, loss='L2', params=config.default_config()):
    Q = generate_Q_matrix(D * 100)
    anchors = findAnchors(Q, params['T'], params)
    W, topic_likelihoods = do_recovery(Q, anchors, loss, params)
    return W
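A short usage sketch for anchor_words, assuming D is a document-word count matrix accepted by generate_Q_matrix and that config.default_config() returns a dict-like params object supplying 'T' (the number of topics) plus the projection and recovery settings used by findAnchors and do_recovery; only names visible in the fragment above are used.

# Hypothetical usage sketch; D and the params contents are assumptions.
params = config.default_config()
params['T'] = 20                 # number of topics to recover
W = anchor_words(D, loss='L2', params=params)
print(W.shape)                   # topic-word weights as returned by do_recovery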
def run(self):
    params = self.params
    if isinstance(params.infile, str):
        M = scipy.io.loadmat(params.infile)['M']
    else:
        M = params.infile
    assert sparse.isspmatrix_csc(M), "Must provide a sparse CSC matrix"
    print("Input matrix shape: {}".format(M.shape))

    if isinstance(params.vocab_file, str):
        with open(params.vocab_file) as f:
            vocab = f.read().strip().split()
    else:
        vocab = params.vocab_file
    assert np.iterable(vocab), "Must provide an iterable vocab"
    assert M.shape[0] == len(vocab), \
        "Number of rows must correspond to vocab size: {} rows vs {} vocab words" \
        .format(M.shape[0], len(vocab))

    # only accept anchors that appear in a significant number of docs
    print("identifying candidate anchors")
    candidate_anchors = []
    for i in range(M.shape[0]):
        if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
            candidate_anchors.append(i)
    print(len(candidate_anchors), "candidates")

    # forms Q matrix from document-word matrix
    Q = generate_Q_matrix(M)
    # Save copy of unnormalized Q, before any normalizations happen
    self.Q_unnormalized = Q.copy()

    # check that Q sum is 1 or close to it
    print("Q sum is", Q.sum())
    V = Q.shape[0]
    print("done reading documents")

    # find anchors - this step uses a random projection
    # into low dimensional space
    anchors = findAnchors(Q, params, candidate_anchors)
    print("anchors are:")
    for i, a in enumerate(anchors):
        print(i, vocab[a])

    # recover topics
    A, topic_likelihoods = do_recovery(Q, anchors, params)
    print("done recovering")

    output_streams = [sys.stdout]
    output_file_handle = None
    if params.outfile is not None:
        np.savetxt(params.outfile + ".A", A)
        np.savetxt(params.outfile + ".topic_likelihoods", topic_likelihoods)
        output_file_handle = open(params.outfile + ".topwords", 'w')
        output_streams.append(output_file_handle)

    def print_multiple(*args, **kwargs):
        # Print the same info to multiple output streams
        for f in output_streams:
            print(*args, file=f, **kwargs)

    # Display top words per topic
    all_topwords = []
    for k in range(params.K):
        topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
        print_multiple(vocab[anchors[k]], ':', end=' ')
        for w in topwords:
            print_multiple(vocab[w], end=' ')
        print_multiple("")
        all_topwords.append(TopWordsSummary(
            topic_index=k,
            anchor_word_index=anchors[k],
            anchor_word=vocab[anchors[k]],
            top_word_indices=topwords,
            top_words=[vocab[w] for w in topwords]))

    if params.outfile is not None:
        output_file_handle.close()

    # make some results available as attributes of "self"
    self.Q = Q
    self.M = M
    self.A = A
    self._R = None
    self.topic_likelihoods = topic_likelihoods
    self.candidate_anchors = candidate_anchors
    self.anchors = anchors
    self.vocab = vocab
    self.all_topwords = all_topwords
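A hypothetical driver sketch for the run() method above; AnchorTopicEstimator is a placeholder name for whatever class owns run(), and the params object only needs the attributes read inside the method (infile, vocab_file, anchor_thresh, K, top_words, outfile).

# Hypothetical driver; class and parameter names are placeholders, not the codebase's API.
model = AnchorTopicEstimator(params)   # params: infile, vocab_file, anchor_thresh, K, top_words, outfile
model.run()
print(len(model.anchors), "anchors found")
print(model.A.shape)                   # word-by-topic matrix; topics indexed by column as A[:, k]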
print "done reading documents" #find anchors- this step uses a random projection #into low dimensional space anchor_logfile = file(params.log_prefix + '.anchors', 'w') anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile) print "anchors are:" print >> anchor_logfile, "anchors are:" for i, a in enumerate(anchors): print i, vocab[a] print >> anchor_logfile, i, vocab[a] anchor_logfile.close() #recover topics A, topic_likelihoods, objective = do_recovery(Q, anchors, loss, params) print "done recovering" print "avg objective function during recovery using", K, "topics:", objective np.savetxt(outfile + ".A", A) np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods) #display f = file(outfile + ".topwords", 'w') for k in xrange(K): topwords = np.argsort(A[:, k])[-params.top_words:][::-1] print vocab[anchors[k]], ':', print >> f, vocab[anchors[k]], ':', for w in topwords: print vocab[w], print >> f, vocab[w],
print "done reading documents" #find anchors- this step uses a random projection #into low dimensional space anchor_logfile = file(params.log_prefix+'.anchors', 'w') anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile) print "anchors are:" print >>anchor_logfile, "anchors are:" for i, a in enumerate(anchors): print i, vocab[a] print >>anchor_logfile, i, vocab[a] anchor_logfile.close() #recover topics A, topic_likelihoods,objective = do_recovery(Q, anchors, loss, params) print "done recovering" print "avg objective function during recovery using", K, "topics:", objective np.savetxt(outfile+".A", A) np.savetxt(outfile+".topic_likelihoods", topic_likelihoods) #display f = file(outfile+".topwords", 'w') for k in xrange(K): topwords = np.argsort(A[:, k])[-params.top_words:][::-1] print vocab[anchors[k]], ':', print >>f, vocab[anchors[k]], ':', for w in topwords: print vocab[w], print >>f, vocab[w],
vocab = open(vocab_file).read().strip().split()

# check that Q sum is 1 or close to it
print("Q sum is", Q.sum())
V = Q.shape[0]
print("done reading documents")

# find anchors - this step uses a random projection
# into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print("anchors are:")
for i, a in enumerate(anchors):
    print(i, vocab[a])

# recover topics
A, topic_likelihoods = do_recovery(Q, anchors, loss, params)
print("done recovering")
np.savetxt(outfile + ".A", A)
np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)

# display
with open(outfile + ".topwords", "w") as f:
    for k in range(K):
        topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
        print(vocab[anchors[k]], ":", end=" ")
        print(vocab[anchors[k]], ":", end=" ", file=f)
        for w in topwords:
            print(vocab[w], end=" ")
            print(vocab[w], end=" ", file=f)
        print("")