class get_topics2(object):
    """Wrapper that trains, persists and queries a Labeled LDA model.

    The actual model is an ``LLDA`` instance stored in ``self.model``; this
    class only drives it (``set_corpus`` / ``inference`` / ``perplexity`` /
    ``phi`` / ``theta`` / ``term_to_id`` / ``vocas``).

    Documents and labels use the gensim "chunk" layout: a list of lists,
    e.g. ``[[tok, tok, ...], [tok, ...], ...]``.
    """

    def __init__(self, n_topics=2, alpha=0.001, beta=0.001, max_iter=100, method='LLDA'):
        """Store the hyper-parameters and build the underlying ``LLDA`` model.

        n_topics, alpha, beta: forwarded unchanged to ``LLDA(...)``.
        max_iter: number of ``inference()`` sweeps performed by ``fit``.
        method: only ``'llda'`` (case-insensitive) is accepted.

        Raises AssertionError for any other ``method``.
        """
        self.method = method.lower()
        self.n_topics = n_topics
        self.alpha = alpha
        self.beta = beta
        self.iteration = max_iter
        # Explicit raise (not ``assert``) so the check survives ``python -O``.
        if self.method != 'llda':
            raise AssertionError('Only Labeled LDA method is implemented in thi library')
        self.model = LLDA(self.n_topics, self.alpha, self.beta)

    @staticmethod
    def _require_chunks(chunks, message):
        """Raise AssertionError(message) unless ``chunks[0]`` is a list."""
        if not isinstance(chunks[0], list):
            raise AssertionError(message)

    def fit(self, X, y=None, verbose=False):
        """Load the labelled corpus into the model and run inference.

        X: documents in gensim chunk form, ``[[doc1], [doc2], ...]``.
        y: labels in the same form, ``[[labels of doc1], ...]``. Required:
           the label set handed to the model is derived from it.
        verbose: when true, print the perplexity before every sweep and once
           more after the last one.

        Raises AssertionError when X or y is not a list of lists, and
        TypeError when y is None.
        """
        self._require_chunks(
            X, 'Not chuncks of lists for docs. Require [[doc1],[doc2],...] as gensim chunk format')
        if y is None:
            # Previously this surfaced as an opaque TypeError from reduce().
            raise TypeError('y is required: Labeled LDA needs one list of labels per document')
        self._require_chunks(
            y, 'Not chuncks of lists for labels. Require [[label1],[label2],...] as gensim chunk format')

        # Distinct labels across all documents (linear; no reduce() needed).
        self.labelset = list({label for labels in y for label in labels})
        self.model.set_corpus(self.labelset, X, y)

        for i in range(self.iteration):
            if verbose:
                print("-- %d : %.4f" % (i, self.model.perplexity()))
            self.model.inference()
        if verbose:
            print("Final perplexity : %.4f" % self.model.perplexity())

    def load(self, filename):
        """Replace ``self.model`` with the object pickled in ``filename``.

        Only ``self.model`` is restored; the other attributes of this wrapper
        are left untouched.
        """
        # SECURITY: unpickling executes arbitrary code; only load trusted files.
        with open(filename, 'rb') as source:
            self.model = pickle.load(source)

    def save(self, filename):
        """Pickle ``self.model`` to ``filename`` (created or truncated)."""
        with open(filename, 'wb') as output:
            pickle.dump(self.model, output, pickle.HIGHEST_PROTOCOL)

    def encode(self, X, topk=20, normalize=True):
        """Return one topic histogram accumulated over all documents in X.

        X: documents in gensim chunk form, ``[[...], [...], ...]``.
        topk: number of largest entries to keep; every other entry is zeroed.
        normalize: when true, divide the histogram by ``sum + 1e-6`` before
           the top-k selection; when false, raw accumulated weights are used.

        Returns a 1-D numpy array of length ``self.n_topics``.
        Raises AssertionError when X is not a list of lists.
        """
        self._require_chunks(
            X, 'Not chuncks of lists. Require [[],[],...] as gensim chunk format')
        docs = [[self.model.term_to_id(term) for term in doc] for doc in X]
        phi = self.model.phi()
        thetas = self.model.theta()

        hist = np.zeros(self.n_topics)
        # zip() pairs the i-th document with the i-th row returned by theta()
        # and stops at the shorter of the two.
        for doc, theta in zip(docs, thetas):
            for w in doc:
                hist += phi[:, w] * theta
        if normalize:
            hist = hist / (np.sum(hist, axis=0) + 1e-6)

        sort_ind = np.argsort(hist)[::-1]  # indices by descending weight
        hist[sort_ind[topk:]] = 0
        return hist

    def summary(self, n_topics=-1):
        """Print, for every row of ``phi``, its 20 highest-weighted terms.

        n_topics: accepted for interface compatibility; currently ignored.
        """
        phi = self.model.phi()
        for k in range(np.size(phi, axis=0)):
            pieces = ["\n%d: " % k]
            for w in np.argsort(-phi[k])[:20]:
                pieces.append("+ %.4f*%s" % (phi[k, w], self.model.vocas[w]))
            # The trailing empty piece plus the single-space join reproduce
            # the exact output of the former comma-terminated print statements.
            pieces.append('')
            print(" ".join(pieces))