def selectAnchorWords(DocWordMat, Q, K, params):
    """Pick candidate anchor words and delegate anchor selection.

    Parameters
    ----------
    DocWordMat : scipy.sparse.csc_matrix
        Sparse document-word count matrix; must be CSC.
    Q : word co-occurrence matrix passed through to ``findAnchors``.
    K : number of anchors/topics to find.
    params : options object; only ``minDocPerWord`` is read here.

    Returns
    -------
    The anchor indices returned by ``anchors.findAnchors``.

    Raises
    ------
    NotImplementedError
        If ``DocWordMat`` is not a CSC sparse matrix.
    """
    from anchors import findAnchors
    import scipy.sparse

    # FIX: the original string-matched the type name
    # (str(type(...)).count('csc_matrix')), which breaks for subclasses or
    # renamed types. Use the official predicate instead, but keep raising
    # NotImplementedError so existing callers that catch it still work.
    if not scipy.sparse.isspmatrix_csc(DocWordMat):
        raise NotImplementedError('Need CSC matrix')

    # nnz per CSC column, i.e. how many documents each word occurs in
    # (assumes columns index words -- TODO confirm against callers).
    nDocsPerWord = np.diff(DocWordMat.indptr)

    # Candidate anchors must appear in strictly more than minDocPerWord docs.
    candidateWords = np.flatnonzero(nDocsPerWord > params.minDocPerWord)

    return findAnchors(Q, K, params, candidateWords.tolist())
# NOTE(review): Python 2-only fragment -- uses print statements, file(), and
# xrange. Free names (candidate_anchors, M, vocab_file, K, loss, outfile,
# params, generate_Q_matrix, findAnchors, do_recovery) come from earlier in
# the enclosing script, outside this view.
print len(candidate_anchors), "candidates"

#forms Q matrix from document-word matrix
Q = generate_Q_matrix(M)
# Vocabulary: one whitespace-separated token per word, row order matches Q.
vocab = file(vocab_file).read().strip().split()

#check that Q sum is 1 or close to it
print "Q sum is", Q.sum()

V = Q.shape[0]
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print "anchors are:"
for i, a in enumerate(anchors):
    # Each anchor is an index into vocab.
    print i, vocab[a]

#recover topics
A, topic_likelihoods = do_recovery(Q, anchors, loss, params)
print "done recovering"

# Persist the topic-word matrix and per-topic likelihoods as text.
np.savetxt(outfile+".A", A)
np.savetxt(outfile+".topic_likelihoods", topic_likelihoods)

#display
# NOTE(review): f is opened without a with-block; closing presumably happens
# past the end of this fragment -- confirm in the full file.
f = file(outfile+".topwords", 'w')
for k in xrange(K):
    # Indices of the top_words highest-weight words for topic k, descending.
    topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
def anchor_words(D, loss='L2', params=None):
    """Recover a topic-word matrix from document-word counts D.

    Parameters
    ----------
    D : document-word matrix accepted by ``generate_Q_matrix``.
    loss : recovery loss name passed through to ``do_recovery``.
    params : config mapping; must contain key ``'T'`` (number of topics).
        Defaults to a fresh ``config.default_config()``.

    Returns
    -------
    W : the recovered topic-word matrix from ``do_recovery``.
    """
    # FIX: the original signature was params=config.default_config(), which
    # is evaluated ONCE at definition time, so every default-using call shared
    # (and could mutate) the same config object. Evaluate lazily instead.
    if params is None:
        params = config.default_config()

    # Scale counts by 100 before forming the co-occurrence matrix
    # (presumably to emulate integer counts from fractional input -- TODO
    # confirm against generate_Q_matrix's contract).
    Q = generate_Q_matrix(D * 100)
    anchors = findAnchors(Q, params['T'], params)
    W, topic_likelihoods = do_recovery(Q, anchors, loss, params)
    return W
def run(self):
    """Run the full anchor-word topic-recovery pipeline.

    Loads the document-word matrix and vocabulary described by
    ``self.params``, finds anchor words, recovers the topic-word matrix,
    prints/saves the top words per topic, and exposes results as attributes
    on ``self`` (Q, M, A, topic_likelihoods, anchors, vocab, ...).
    """
    params = self.params

    # Accept either a path to a .mat file or an in-memory matrix.
    # BUG FIX: the original tested isinstance(..., basestr); 'basestr' is not
    # a builtin in Python 3 (this raised NameError) -- 'str' is correct here.
    if isinstance(params.infile, str):
        M = scipy.io.loadmat(params.infile)['M']
    else:
        M = params.infile
    assert sparse.isspmatrix_csc(M), "Must provide a sparse CSC matrix"
    print("Input matrix shape: {}".format(M.shape))

    # Vocabulary: a whitespace-separated file, or any iterable of words.
    # (Same basestr -> str fix as above.)
    if isinstance(params.vocab_file, str):
        with open(params.vocab_file) as f:
            vocab = f.read().strip().split()
    else:
        vocab = params.vocab_file
    assert np.iterable(vocab), "Must provide an iterable vocab"
    assert M.shape[0] == len(vocab), \
        "Number of rows must correspond to vocab size: {} rows vs {} vocab words" \
        .format(M.shape[0], len(vocab))

    # Only accept anchors that appear in a significant number of docs
    # (strictly more than params.anchor_thresh).
    print("identifying candidate anchors")
    candidate_anchors = []
    for i in range(M.shape[0]):
        if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
            candidate_anchors.append(i)
    print(len(candidate_anchors), "candidates")

    # Form the Q (word co-occurrence) matrix from the document-word matrix.
    Q = generate_Q_matrix(M)
    # Save a copy of unnormalized Q, before any normalizations happen.
    self.Q_unnormalized = Q.copy()

    # Sanity check: Q should sum to 1 (or close to it).
    print("Q sum is", Q.sum())
    print("done reading documents")

    # Find anchors -- this step uses a random projection into a
    # low-dimensional space.
    anchors = findAnchors(Q, params, candidate_anchors)
    print("anchors are:")
    for i, a in enumerate(anchors):
        print(i, vocab[a])

    # Recover topics: A is the topic-word matrix.
    A, topic_likelihoods = do_recovery(Q, anchors, params)
    print("done recovering")

    # Top words go to stdout always, and additionally to <outfile>.topwords
    # when an output file prefix was given.
    output_streams = [sys.stdout]
    output_file_handle = None
    if params.outfile is not None:
        np.savetxt(params.outfile + ".A", A)
        np.savetxt(params.outfile + ".topic_likelihoods", topic_likelihoods)
        output_file_handle = open(params.outfile + ".topwords", 'w')
        output_streams.append(output_file_handle)

    def print_multiple(*args, **kwargs):
        # Print the same info to every registered output stream.
        for f in output_streams:
            print(*args, file=f, **kwargs)

    # Display top words per topic, highest weight first.
    all_topwords = []
    for k in range(params.K):
        topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
        print_multiple(vocab[anchors[k]], ':', end=' ')
        for w in topwords:
            print_multiple(vocab[w], end=' ')
        print_multiple("")
        all_topwords.append(TopWordsSummary(
            topic_index=k,
            anchor_word_index=anchors[k],
            anchor_word=vocab[anchors[k]],
            top_word_indices=topwords,
            top_words=[vocab[w] for w in topwords]))

    if params.outfile is not None:
        output_file_handle.close()

    # Make some results available as attributes of "self".
    self.Q = Q
    self.M = M
    self.A = A
    self._R = None
    self.topic_likelihoods = topic_likelihoods
    self.candidate_anchors = candidate_anchors
    self.anchors = anchors
    self.vocab = vocab
    self.all_topwords = all_topwords
#forms Q matrix from document-word matrix Q = generate_Q_matrix(M) vocab = file(vocab_file).read().strip().split() #check that Q sum is 1 or close to it print "Q sum is", Q.sum() V = Q.shape[0] print "done reading documents" #find anchors- this step uses a random projection #into low dimensional space anchor_logfile = file(params.log_prefix + '.anchors', 'w') anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile) print "anchors are:" print >> anchor_logfile, "anchors are:" for i, a in enumerate(anchors): print i, vocab[a] print >> anchor_logfile, i, vocab[a] anchor_logfile.close() #recover topics A, topic_likelihoods, objective = do_recovery(Q, anchors, loss, params) print "done recovering" print "avg objective function during recovery using", K, "topics:", objective np.savetxt(outfile + ".A", A) np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)
#forms Q matrix from document-word matrix Q = generate_Q_matrix(M) vocab = file(vocab_file).read().strip().split() #check that Q sum is 1 or close to it print "Q sum is", Q.sum() V = Q.shape[0] print "done reading documents" #find anchors- this step uses a random projection #into low dimensional space anchor_logfile = file(params.log_prefix+'.anchors', 'w') anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile) print "anchors are:" print >>anchor_logfile, "anchors are:" for i, a in enumerate(anchors): print i, vocab[a] print >>anchor_logfile, i, vocab[a] anchor_logfile.close() #recover topics A, topic_likelihoods,objective = do_recovery(Q, anchors, loss, params) print "done recovering" print "avg objective function during recovery using", K, "topics:", objective np.savetxt(outfile+".A", A) np.savetxt(outfile+".topic_likelihoods", topic_likelihoods)
print(len(candidate_anchors), "candidates") # forms Q matrix from document-word matrix Q = generate_Q_matrix(M) vocab = open(vocab_file).read().strip().split() # check that Q sum is 1 or close to it print("Q sum is", Q.sum()) V = Q.shape[0] print("done reading documents") # find anchors- this step uses a random projection # into low dimensional space anchors = findAnchors(Q, K, params, candidate_anchors) print("anchors are:") for i, a in enumerate(anchors): print(i, vocab[a]) # recover topics A, topic_likelihoods = do_recovery(Q, anchors, loss, params) print("done recovering") np.savetxt(outfile + ".A", A) np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods) # display with open(outfile + ".topwords", "w") as f: for k in range(K): topwords = np.argsort(A[:, k])[-params.top_words:][::-1]