-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_topics2.py
90 lines (64 loc) · 2.73 KB
/
get_topics2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
'''
Topic-modeling wrapper: learns topics with Labeled LDA and encodes documents as sparse topic codes.
Xiyang
'''
from LLDA import *
import numpy as np
import cPickle as pickle
class get_topics2(object):
    '''
    Wrapper around a Labeled LDA model.

    It fits the model on a labeled corpus, saves/loads the fitted model with
    pickle, encodes documents as a sparse topic histogram and prints the
    top terms of every topic.
    '''

    def __init__(self, n_topics=2, alpha=0.001, beta=0.001, max_iter=100, method='LLDA'):
        '''
        n_topics: number of topics handed to the LLDA constructor
        alpha, beta: hyper-parameters handed to the LLDA constructor
        max_iter: number of inference sweeps run by fit()
        method: only 'LLDA' (case-insensitive) is accepted
        '''
        self.method = method.lower()
        self.n_topics = n_topics
        self.alpha = alpha
        self.beta = beta
        self.iteration = max_iter
        assert self.method=='llda', 'Only Labeled LDA method is implemented in thi library'
        self.model = LLDA(self.n_topics, self.alpha, self.beta)

    def fit(self, X, y=None, verbose=False):
        '''
        Fit the model on a labeled corpus.

        X: documents as a list of token lists, [[doc1], [doc2], ...]
        y: labels as a list of label lists, [[labels1], [labels2], ...];
           required, despite the None default kept for signature compatibility
        verbose: print the perplexity before each sweep and once at the end

        Raises TypeError when y is None and AssertionError when X or y is
        not a list of lists.
        '''
        assert isinstance(X[0], list), 'Not chuncks of lists for docs. Require [[doc1],[doc2],...] as gensim chunk format'
        if y is None:
            # Labeled LDA cannot be trained without labels; fail with a clear
            # message instead of an unrelated error further down.
            raise TypeError('y (one list of labels per document) is required to fit Labeled LDA')
        assert isinstance(y[0], list), 'Not chuncks of lists for labels. Require [[label1],[label2],...] as gensim chunk format'
        # Distinct labels over the whole corpus; linear in the number of
        # labels, unlike repeated list concatenation.
        self.labelset = list({label for labels in y for label in labels})
        self.model.set_corpus(self.labelset, X, y)
        for i in range(self.iteration):
            if verbose:
                print("-- %d : %.4f" % (i, self.model.perplexity()))
            self.model.inference()
        if verbose:
            print("Final perplexity : %.4f" % self.model.perplexity())

    def load(self, filename):
        '''
        Replace self.model with the object pickled in filename.
        '''
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(filename, 'rb') as model_file:
            self.model = pickle.load(model_file)

    def save(self, filename):
        '''
        Pickle self.model to filename (overwriting it) with the highest protocol.
        '''
        with open(filename, 'wb') as model_file:
            pickle.dump(self.model, model_file, pickle.HIGHEST_PROTOCOL)

    def encode(self, X, topk=20, normalize=True):
        '''
        Encode documents as one sparse topic histogram.

        X: documents as a list of token lists, [[doc1], [doc2], ...]
        topk: number of largest entries kept; all others are set to 0
        normalize: when true, divide the histogram by its sum (plus 1e-6)
                   before the top-k selection

        Returns a 1-D numpy array with one entry per row of model.phi().
        '''
        assert isinstance(X[0], list), 'Not chuncks of lists. Require [[],[],...] as gensim chunk format'
        docs = [[self.model.term_to_id(term) for term in doc] for doc in X]
        phi = self.model.phi()
        # NOTE(review): thetas come from the model and are paired with the
        # documents of X by position; presumably they are the per-document
        # topic weights of the fitted corpus -- confirm against LLDA.
        thetas = self.model.theta()
        # Size the histogram from phi itself so that it always matches the
        # model, even when the model's topic count differs from self.n_topics
        # (e.g. after load()).
        hist = np.zeros(np.size(phi, axis=0))
        for doc, theta in zip(docs, thetas):
            for w in doc:
                hist += phi[:, w] * theta
        if normalize:
            # The small constant avoids a division by zero for an all-zero histogram.
            hist = hist / (np.sum(hist, axis=0) + 1e-6)
        sort_ind = np.argsort(hist)[::-1]  # indices from largest to smallest
        hist[sort_ind[topk:]] = 0
        return hist

    def summary(self, n_topics=-1):
        '''
        Print, for every topic, its 20 highest-weighted terms as
        "+ weight*term" entries on one line.

        n_topics: currently ignored; kept for interface compatibility.
        '''
        phi = self.model.phi()
        vocas = self.model.vocas
        for k in range(np.size(phi, axis=0)):
            top_terms = np.argsort(-phi[k])[:20]
            terms = " ".join("+ %.4f*%s" % (phi[k, w], vocas[w]) for w in top_terms)
            print("\n%d: %s" % (k, terms))