/
mlda.py
167 lines (135 loc) · 5.45 KB
/
mlda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import logging
import numpy as np
import scipy as sp
import utils
# Configure root logging once at import time so training progress is visible.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger used by MLDA.fit / MLDA._initialize to report progress.
logger = logging.getLogger(__name__)
class MLDA:
    """Latent Dirichlet Allocation trained with collapsed Gibbs sampling.

    After ``fit`` the model exposes:
      - ``doc_topic_``:  (n_doc, n_topic) document-topic distributions
      - ``topic_word_``: (n_topic, n_term) topic-word distributions
      - ``nzt_``:        raw topic-word count matrix
    """

    def __init__(self, n_topic: int, n_iter=2000, alpha=0.1, beta=0.01,
                 random_state=None, refresh=10):
        """
        :param n_topic: number of topics
        :param n_iter: number of Gibbs sampling sweeps
        :param alpha: Dirichlet hyperparameter of the document-topic prior
        :param beta: Dirichlet hyperparameter of the topic-word prior
        :param random_state: random seed
        :param refresh: log the current likelihood every `refresh` iterations
        """
        self.n_topic = n_topic
        self.n_iter = n_iter
        self.alpha = alpha
        self.beta = beta
        # if random_state is None, check_random_state(None) does nothing
        self.random_state = random_state
        self.refresh = refresh
        self.topic_word_ = None
        self.doc_topic_ = None
        self.nzt_ = None
        if alpha <= 0 or beta <= 0:
            # BUG FIX: message previously said 'eta' although the parameter
            # is named 'beta'.
            raise ValueError('alpha and beta must be greater than zero')
        # random numbers that are reused across sampling sweeps
        rng = utils.check_random_state(random_state)
        self._rands = rng.rand(1024 ** 2 // 8)  # 1MiB of random variates

    def fit(self, corpus):
        """Run collapsed Gibbs sampling on `corpus`.

        :param corpus: array-like, shape (n_samples, n_features); entry
            [d, t] is the count of term t in document d. Sparse matrix allowed.
        :return: self
        """
        random_state = utils.check_random_state(self.random_state)
        rands = self._rands.copy()
        self._initialize(corpus)  # build all count statistics
        for n_iter in range(self.n_iter):
            random_state.shuffle(rands)
            if n_iter % self.refresh == 0:
                pp = self.perplexity(corpus)
                logger.info('<{}> log likelihood: {:.0f}'.format(n_iter, pp))
            self._sample_topics(rands)
        pp = self.perplexity(corpus)
        logger.info('<{}> log likelihood: {:.0f}'.format(self.n_iter, pp))
        # compute the final document-topic and topic-word distributions
        self._count_distribution()
        # free intermediate sampling state to save memory
        del self.TS
        del self.DS
        del self.ZS
        del self.ndz_
        return self

    def _initialize(self, X: np.ndarray):
        """Initialise count matrices and assign a random topic to every word."""
        n_doc, n_term = X.shape
        n_word = int(X.sum())
        n_topics = self.n_topic
        n_iter = self.n_iter
        logger.info('n_documents: {}'.format(n_doc))
        logger.info('n_terms: {}'.format(n_term))
        logger.info('n_words: {}'.format(n_word))
        logger.info('n_topics: {}'.format(n_topics))
        logger.info('n_iter: {}'.format(n_iter))
        # BUG FIX: np.int was removed in NumPy 1.24; the builtin `int` is the
        # equivalent dtype.
        self.nzt_ = nzt_ = np.zeros((n_topics, n_term), dtype=int)  # topic-word counts
        self.ndz_ = ndz_ = np.zeros((n_doc, n_topics), dtype=int)   # doc-topic counts
        self.nz_ = nz_ = np.zeros(n_topics, dtype=int)              # total words per topic
        self.n_word = n_word  # total number of word instances
        # TS, DS, ZS: term ids, document ids, assigned topic ids, one per word
        self.TS, self.DS = TS, DS = utils.matrix_to_lists(X)
        # BUG FIX: use the seeded RNG so `random_state` makes runs
        # reproducible (previously global np.random was used unconditionally).
        rng = utils.check_random_state(self.random_state)
        self.ZS = ZS = rng.randint(n_topics, size=n_word)
        for i in range(n_word):
            t, d, z = TS[i], DS[i], ZS[i]
            nzt_[z, t] += 1
            ndz_[d, z] += 1
            nz_[z] += 1

    def perplexity(self, X: np.ndarray):
        """Return the average per-word log likelihood of `X` under the
        current doc-topic / topic-word distributions.

        NOTE(review): despite the name this is a log likelihood, not
        exp(-LL) perplexity; ``fit`` logs it as "log likelihood" accordingly.
        """
        self._count_distribution()  # refresh both distributions first
        pw = np.dot(self.doc_topic_, self.topic_word_)  # p(term | doc)
        log_pw = np.log(pw)
        return np.sum(log_pw * X) / self.n_word

    def _count_distribution(self):
        """Compute doc-topic and topic-word distributions from the counts.

        Results are stored in ``doc_topic_`` and ``topic_word_``.
        """
        self.doc_topic_ = (self.ndz_ + self.alpha).astype(float)
        self.doc_topic_ /= np.sum(self.doc_topic_, axis=1)[:, np.newaxis]
        # BUG FIX: the counts attribute is `nzt_`; `nzw_` never existed and
        # raised AttributeError on the first perplexity() call.
        self.topic_word_ = (self.nzt_ + self.beta).astype(float)
        self.topic_word_ /= np.sum(self.topic_word_, axis=1)[:, np.newaxis]

    def _sample_topics(self, rands):
        """One collapsed-Gibbs sweep: resample the topic of every word.

        :param rands: pre-generated uniform(0, 1) random numbers, reused
            cyclically across all words.
        """
        n_topics, n_term = self.nzt_.shape
        n_word = self.ZS.size
        TS = self.TS
        ZS = self.ZS
        DS = self.DS
        nzt_ = self.nzt_
        nz_ = self.nz_
        ndz_ = self.ndz_
        beta = self.beta
        alpha = self.alpha
        n_rand = rands.size
        for i in range(n_word):
            d, z, t = DS[i], ZS[i], TS[i]
            # remove word i from the counts before resampling its topic
            nzt_[z, t] -= 1
            nz_[z] -= 1
            ndz_[d, z] -= 1
            # unnormalised full conditional p(z_i = k | z_-i, w)
            p = (nzt_[:, t] + beta) / (nz_ + n_term * beta) * (ndz_[d] + alpha)
            # BUG FIX: build a real cumulative sum (the old list comprehension
            # produced a ragged array of slice objects) ...
            p_cum = np.cumsum(p)
            # BUG FIX: ... and index `rands` instead of calling it.
            r = rands[i % n_rand] * p_cum[-1]
            new_z = np.searchsorted(p_cum, r)
            # BUG FIX: persist the new assignment; without this the counts
            # diverge from ZS and go negative on the next sweep.
            ZS[i] = new_z
            # add the word back under its new topic
            nzt_[new_z, t] += 1
            nz_[new_z] += 1
            ndz_[d, new_z] += 1
if __name__ == '__main__':
    # Smoke test: 2 documents x 5 terms; each entry is a word count.
    model = MLDA(n_topic=20, random_state=10)
    # BUG FIX: np.int was removed in NumPy 1.24; use the builtin int dtype.
    corpus = np.array([[1, 3, 4, 5, 5], [3, 0, 0, 1, 2]], dtype=int)
    model.fit(corpus)