def step(self, d, w):
    """Resample the topic of the ``w``-th token of document ``d``.

    The token is first removed via ``self._remove_word``; a point ``u`` is
    then drawn uniformly from ``[0, s + r + q)`` and the interval it lands in
    decides which candidate set the new topic is drawn from:

    * ``u < s``       -- all topics, weighted by ``1 / self._cnst[k]``
    * ``u < s + r``   -- topics listed in ``self._doc_topics[d]``
    * otherwise       -- topics listed in ``self._word_topics[word]``

    Finally the token is re-inserted with the new topic through
    ``self._insert_word``.

    NOTE(review): this looks like the three-bucket SparseLDA decomposition;
    ``self._s`` / ``self._r`` are presumably kept current by
    ``_remove_word`` / ``_insert_word`` -- confirm against those methods.

    Parameters
    ----------
    d : int
        index of the document in ``self._docs``
    w : int
        position of the token inside ``self._docs[d]['w']``
    """
    word = self._docs[d]['w'][w]
    self._remove_word(d, w)

    # Mass of the word-specific interval, accumulated over the topics
    # currently associated with this word.
    word_mass = sum(
        self._q_cache[d, k] * self._n_kw[k, word]
        for k in self._word_topics[word]
    )
    threshold = random.random() * (self._s + self._r + word_mass)

    if threshold < self._s:
        # uniform: the drawn index is itself the topic id
        new_topic = discrete_draw(-np.log(self._cnst), logp=True)
    elif threshold < self._s + self._r:
        # document: restrict to topics recorded for this document
        candidates = list(self._doc_topics[d])
        weights = np.array(
            [
                log(self._n_dk[d, k]) + log(self._beta / self._cnst[k])
                for k in candidates
            ],
            dtype=float,
        )
        new_topic = candidates[discrete_draw(weights, logp=True)]
    else:
        # word: restrict to topics recorded for this word
        candidates = list(self._word_topics[word])
        weights = np.array(
            [
                log(self._n_kw[k, word]) + log(self._q_cache[d, k])
                for k in candidates
            ],
            dtype=float,
        )
        new_topic = candidates[discrete_draw(weights, logp=True)]

    self._insert_word(d, w, new_topic)
def test_discrete_draw_exp_uniform():
    """Draw from a two-outcome uniform distribution given as probabilities.

    Checks the number of draws, that none is NaN, and that the ``binerr``
    score against the expected counts stays below 0.05.
    """
    probs = np.array([0.5, 0.5])
    expected_counts = probs * N_DRAWS

    draws = utils.discrete_draw(probs, n=N_DRAWS)

    assert len(draws) == N_DRAWS
    assert not np.any(np.isnan(draws))
    assert binerr(draws, expected_counts) < .05
def test_discrete_draw_log_uniform():
    """Draw from a two-outcome uniform distribution given as log-probabilities.

    Checks the number of draws, that none is NaN, and that the ``binerr``
    score against the expected counts stays below 0.05.
    """
    probs = np.array([0.5, 0.5])
    log_probs = np.log(probs)
    expected_counts = probs * N_DRAWS

    draws = utils.discrete_draw(log_probs, n=N_DRAWS, logp=True)

    assert len(draws) == N_DRAWS
    assert not np.any(np.isnan(draws))
    assert binerr(draws, expected_counts) < .05
def test_discrete_draw_exp_point():
    """Draw from a distribution with all probability on index 1.

    Checks the number of draws, that none is NaN, and that index 0 (which
    has probability zero) is never returned.
    """
    point_mass = np.array([0., 1.])

    draws = utils.discrete_draw(point_mass, n=N_DRAWS)

    assert len(draws) == N_DRAWS
    assert not np.any(np.isnan(draws))
    drew_zero = np.any(draws == 0)
    assert not drew_zero
def test_discrete_draw_exp_nonuniform():
    """Draw from the distribution (0.2, 0.8) given as probabilities.

    Checks the number of draws, that none is NaN, and that the ``binerr``
    score against the expected counts stays below 0.05.
    """
    probs = np.array([.2, .8])
    expected_counts = probs * N_DRAWS

    draws = utils.discrete_draw(probs, n=N_DRAWS)

    assert len(draws) == N_DRAWS
    assert not np.any(np.isnan(draws))
    assert binerr(draws, expected_counts) < .05
def test_discrete_draw_exp_uniform():
    """Sample a fair two-way choice passed as plain probabilities.

    The sample must have ``N_DRAWS`` entries, contain no NaN, and score
    below 0.05 on ``binerr`` against the expected per-outcome counts.
    """
    weights = np.ones(2) * 0.5
    samples = utils.discrete_draw(weights, n=N_DRAWS)

    assert len(samples) == N_DRAWS

    has_nan = np.any(np.isnan(samples))
    assert not has_nan

    error = binerr(samples, weights * N_DRAWS)
    assert error < .05
def _forward_sample(ldactr, n_docs, n_topics, n_words, n_words_per_doc, alpha,
                    beta):
    """Forward-sample topic assignments and documents.

    For every document a topic distribution is drawn from a symmetric
    Dirichlet with parameter ``alpha``; one topic is then drawn per token.
    The words themselves are filled in by ``_geweke_draw_docs`` (called with
    ``init=True``) starting from placeholder documents of all-zero word ids.

    Parameters
    ----------
    ldactr : object
        not used by this function
    n_docs, n_topics, n_words, n_words_per_doc : int
        corpus dimensions
    alpha, beta : float
        ``alpha`` parameterises the per-document Dirichlet draw; both are
        forwarded to ``_geweke_draw_docs``

    Returns
    -------
    docs : list<dict>
        documents as returned by ``_geweke_draw_docs``
    z : list
        all topic assignments flattened in document order
    phi : object
        second return value of ``_geweke_draw_docs``
    """
    docs = [{'w': [0] * n_words_per_doc} for _ in range(n_docs)]
    n_dk = np.zeros((n_docs, n_topics))
    n_kw = np.zeros((n_topics, n_words))

    asgn = []
    z = []
    for d in range(n_docs):
        theta_d = dirichlet.rvs([alpha] * n_topics)[0]
        doc_topics = [discrete_draw(theta_d) for _ in range(n_words_per_doc)]
        for k in doc_topics:
            n_dk[d, k] += 1
        asgn.append(doc_topics)
        z.extend(doc_topics)

    docs, phi = _geweke_draw_docs(docs, asgn, n_dk, n_kw, alpha, beta,
                                  init=True)
    return docs, z, phi
def test_discrete_draw_exp_nonuniform():
    """Sample a 20/80 two-way choice passed as plain probabilities.

    The sample must have ``N_DRAWS`` entries, contain no NaN, and score
    below 0.05 on ``binerr`` against the expected per-outcome counts.
    """
    weights = np.array([.2, .8])
    samples = utils.discrete_draw(weights, n=N_DRAWS)

    assert len(samples) == N_DRAWS

    has_nan = np.any(np.isnan(samples))
    assert not has_nan

    error = binerr(samples, weights * N_DRAWS)
    assert error < .05
def test_discrete_draw_log_uniform():
    """Sample a fair two-way choice passed as log-probabilities.

    The sample must have ``N_DRAWS`` entries, contain no NaN, and score
    below 0.05 on ``binerr`` against the expected per-outcome counts.
    """
    weights = np.ones(2) * 0.5
    samples = utils.discrete_draw(np.log(weights), n=N_DRAWS, logp=True)

    assert len(samples) == N_DRAWS

    has_nan = np.any(np.isnan(samples))
    assert not has_nan

    error = binerr(samples, weights * N_DRAWS)
    assert error < .05
def test_discrete_draw_log_nonuniform():
    """Draw from the distribution (0.2, 0.8) given as log-probabilities.

    Checks the number of draws, that none is NaN, and that the ``binerr``
    score against the expected counts stays below 0.05.
    """
    probs = np.array([.2, .8])
    log_probs = np.log(probs)
    expected_counts = probs * N_DRAWS

    draws = utils.discrete_draw(log_probs, n=N_DRAWS, logp=True)

    assert len(draws) == N_DRAWS
    assert not np.any(np.isnan(draws))
    assert binerr(draws, expected_counts) < .05
def test_discrete_draw_log_nonuniform():
    """Sample a 20/80 two-way choice passed as log-probabilities.

    The sample must have ``N_DRAWS`` entries, contain no NaN, and score
    below 0.05 on ``binerr`` against the expected per-outcome counts.
    """
    weights = np.array([.2, .8])
    samples = utils.discrete_draw(np.log(weights), n=N_DRAWS, logp=True)

    assert len(samples) == N_DRAWS

    has_nan = np.any(np.isnan(samples))
    assert not has_nan

    error = binerr(samples, weights * N_DRAWS)
    assert error < .05
def test_discrete_draw_log_point():
    """Draw from a point mass on index 1 given as log-probabilities.

    The log-probability of index 0 is ``-inf``. The test checks the number
    of draws, that index 0 is never returned, and that no draw is NaN.
    """
    n_samples = 1000
    u = np.array([0, 1])

    # log(0) == -inf is the input this test is about; suppress NumPy's
    # divide-by-zero RuntimeWarning so the test stays clean (and does not
    # fail when warnings are promoted to errors).
    with np.errstate(divide='ignore'):
        log_u = np.log(u)

    x = utils.discrete_draw(log_u, n=n_samples, logp=True)

    assert len(x) == n_samples
    assert not np.any(x == 0)
    assert not np.any(np.isnan(x))
def __init__(self, docs, n_topics, n_words, alpha=1.0, beta=1.0,
             init_mode='prior', seed=None):
    """Set up the count tables and draw an initial topic for every token.

    Parameters
    ----------
    docs : list<dict>
        list of document data structures; the word ids of a document are
        read from its ``'w'`` entry and are used as column indices into
        the topic/word count table, so they must lie in ``[0, n_words)``
    n_topics : int
        number of topics
    n_words : int
        number of words in corpus (vocabulary)
    alpha : float (0, Inf), optional
        symmetric Dirichlet parameter; with ``init_mode='prior'`` it is the
        parameter of the per-document topic distribution the initial
        assignments are drawn from
    beta : float (0, Inf), optional
        symmetric Dirichlet parameter; only stored here (presumably the
        topic/word prior used by the sampler -- confirm against the
        conditional)
    init_mode : {'prior', 'random'}, optional
        ``'prior'`` draws each document's initial topic distribution from
        Dirichlet(alpha); ``'random'`` uses the uniform distribution over
        topics
    seed : optional
        accepted but not used by this constructor

    Raises
    ------
    ValueError
        if ``init_mode`` is neither ``'prior'`` nor ``'random'``
    """
    # Validate before touching any state so that a bad mode is reported even
    # when ``docs`` is empty (previously the check only ran inside the
    # per-document loop).
    if init_mode not in ('prior', 'random'):
        raise ValueError("init_mode must be 'random' or 'prior'")

    self._docs = docs
    self._n_docs = len(docs)
    self._n_topics = n_topics
    self._n_words = n_words
    self._alpha = alpha
    self._beta = beta

    # number of words assigned to topic k in doc d
    self._n_dk = np.zeros((self._n_docs, self._n_topics))
    # number of times word w is assigned to topic k
    self._n_kw = np.zeros((self._n_topics, self._n_words))
    # number of times any word is assigned to topic k
    self._n_k = np.zeros((1, self._n_topics))

    # Entry z[d][w] is the topic to which the w^th word in document d is
    # assigned
    self._z = []
    # One (document index, word id, token position) tuple per token, in
    # document order.
    self._key = []

    for d, doc in enumerate(self._docs):
        self._z.append([])
        if init_mode == 'prior':
            theta_k = dirichlet.rvs([self._alpha] * self._n_topics)[0]
        else:  # 'random'
            theta_k = np.ones(self._n_topics) / self._n_topics

        for w, wrd in enumerate(doc['w']):
            topic = int(discrete_draw(theta_k))
            self._z[d].append(topic)
            self._n_dk[d, topic] += 1.0
            self._n_kw[topic, wrd] += 1.0
            self._n_k[0, topic] += 1.0
            self._key.append((d, wrd, w))
def step(self, d, w):
    """Draw a new topic for token ``w`` of document ``d``.

    After removing the token with ``self._remove_word``, a uniform random
    point in ``[0, self._s + self._r + q)`` selects one of three candidate
    sets (all topics / this document's topics / this word's topics), a topic
    is drawn from that set, and the token is put back with
    ``self._insert_word``. ``q`` is the sum of
    ``self._q_cache[d, k] * self._n_kw[k, word]`` over the word's topics.

    NOTE(review): the split into ``s``, ``r`` and ``q`` resembles the
    SparseLDA bucket scheme -- confirm with the code that maintains
    ``self._s``, ``self._r``, ``self._cnst`` and ``self._q_cache``.

    Parameters
    ----------
    d : int
        document index
    w : int
        token position within the document's ``'w'`` list
    """
    token = self._docs[d]['w'][w]
    self._remove_word(d, w)

    topics_of_word = self._word_topics[token]
    q_total = sum(
        self._q_cache[d, k] * self._n_kw[k, token] for k in topics_of_word
    )

    s_edge = self._s
    point = random.random() * (s_edge + self._r + q_total)

    if point < s_edge:
        # uniform
        chosen = discrete_draw(-np.log(self._cnst), logp=True)
        self._insert_word(d, w, chosen)
        return

    if point < self._s + self._r:
        # document
        pool = list(self._doc_topics[d])
        scores = [
            log(self._n_dk[d, k]) + log(self._beta / self._cnst[k])
            for k in pool
        ]
    else:
        # word
        pool = list(self._word_topics[token])
        scores = [
            log(self._n_kw[k, token]) + log(self._q_cache[d, k])
            for k in pool
        ]

    chosen = pool[discrete_draw(np.array(scores, dtype=float), logp=True)]
    self._insert_word(d, w, chosen)
def step(self):
    """Run one sweep over every token recorded in ``self._key``.

    For each token the current assignment is subtracted from the three
    count tables, ``self.log_conditional`` is evaluated for every topic, a
    new topic is drawn from those log-weights, and the counts and
    ``self._z`` are updated with the new assignment.
    """
    for doc_idx, word_id, pos in self._key:
        # Take the token out of the counts under its current topic.
        old_topic = self._z[doc_idx][pos]
        self._n_dk[doc_idx, old_topic] -= 1.0
        self._n_kw[old_topic, word_id] -= 1.0
        self._n_k[0, old_topic] -= 1.0

        # Every slot is overwritten below, so no zero-fill is needed.
        log_weights = np.empty(self._n_topics)
        for k in range(self._n_topics):
            log_weights[k] = self.log_conditional(doc_idx, k, word_id)

        new_topic = discrete_draw(log_weights, logp=True)

        # Put the token back under the freshly drawn topic.
        self._z[doc_idx][pos] = new_topic
        self._n_dk[doc_idx, new_topic] += 1.0
        self._n_kw[new_topic, word_id] += 1.0
        self._n_k[0, new_topic] += 1.0
def _forward_sample(ldactr, n_docs, n_topics, n_words, n_words_per_doc, alpha,
                    beta):
    """Generate topic assignments and documents by forward sampling.

    Each document gets a topic distribution drawn from a symmetric
    Dirichlet(``alpha``) and one topic per token drawn from it. Word ids are
    then produced by ``_geweke_draw_docs`` (``init=True``) from placeholder
    documents whose words are all 0.

    Parameters
    ----------
    ldactr : object
        unused here
    n_docs, n_topics, n_words, n_words_per_doc : int
        sizes of the generated corpus
    alpha, beta : float
        passed on to ``_geweke_draw_docs``; ``alpha`` also parameterises
        the per-document Dirichlet draw

    Returns
    -------
    tuple
        ``(docs, z, phi)`` where ``docs`` and ``phi`` come from
        ``_geweke_draw_docs`` and ``z`` is the flat list of topic
        assignments in document order
    """
    placeholder_docs = [{'w': [0] * n_words_per_doc} for _ in range(n_docs)]
    doc_topic_counts = np.zeros((n_docs, n_topics))
    topic_word_counts = np.zeros((n_topics, n_words))

    assignments = []
    for d in range(n_docs):
        theta = dirichlet.rvs([alpha] * n_topics)[0]
        drawn = []
        for _ in range(n_words_per_doc):
            topic = discrete_draw(theta)
            doc_topic_counts[d, topic] += 1
            drawn.append(topic)
        assignments.append(drawn)

    z = []
    for drawn in assignments:
        z += drawn

    sampled_docs, phi = _geweke_draw_docs(
        placeholder_docs, assignments, doc_topic_counts, topic_word_counts,
        alpha, beta, init=True)
    return sampled_docs, z, phi
def _geweke_draw_docs(docs, asgn, n_dk, n_kw, alpha, beta, init=False):
    """Redraw every word of every document given fixed topic assignments.

    The count tables are copied with ``np.array`` before use, so the
    caller's ``n_dk`` / ``n_kw`` are not modified; ``docs`` IS modified in
    place (each ``docs[d]['w'][w]`` is overwritten with the new word).

    Parameters
    ----------
    docs : list<dict>
        documents whose ``'w'`` lists are rewritten
    asgn : list<list<int>>
        ``asgn[d][w]`` is the topic of token ``w`` in document ``d``
    n_dk : array-like
        document/topic counts
    n_kw : array-like
        topic/word counts
    alpha, beta : float
        smoothing terms added to the counts inside the log-weights
    init : bool, optional
        when True the old word is not subtracted from ``n_kw`` before
        redrawing (the table does not yet contain it)

    Returns
    -------
    docs : list<dict>
        the same list that was passed in
    phi : ndarray
        ``n_kw + beta`` with each row normalised to sum to one
    """
    n_dk = np.array(n_dk)
    n_kw = np.array(n_kw)
    n_topics = len(n_kw)
    n_words = len(n_kw[0])
    n_d = np.sum(n_dk, axis=1)
    assert len(n_d) == len(docs)

    for d, doc in enumerate(docs):
        words = doc['w']
        for w, old_word in enumerate(words):
            tpc = asgn[d][w]

            # Take the token out of the counts.
            n_dk[d, tpc] -= 1
            n_d[d] -= 1
            if not init:
                n_kw[tpc, old_word] -= 1

            logps = np.array(
                [
                    log(n_kw[tpc, v] + beta)
                    + log(n_dk[d, tpc] + alpha)
                    - log(n_d[d] + alpha * n_topics)
                    for v in range(n_words)
                ],
                dtype=float,
            )
            new_word = discrete_draw(logps, logp=True)
            words[w] = new_word

            # Put the token back with its new word.
            n_dk[d, tpc] += 1
            n_kw[tpc, new_word] += 1
            n_d[d] += 1

    assert np.sum(n_dk) == np.sum(n_kw)
    assert np.sum(n_d) == np.sum(n_dk)

    phi = n_kw + beta
    phi /= np.sum(phi, axis=1, keepdims=True)
    return docs, phi
def _geweke_draw_docs(docs, asgn, n_dk, n_kw, alpha, beta, init=False):
    """Resample the word at every token position, keeping topics fixed.

    Works on ``np.array`` copies of ``n_dk`` and ``n_kw``; the word lists
    inside ``docs`` are overwritten in place.

    Parameters
    ----------
    docs : list<dict>
        documents; ``docs[d]['w']`` holds the word ids and is rewritten
    asgn : list<list<int>>
        topic of each token, indexed ``asgn[d][w]``
    n_dk : array-like
        document/topic counts
    n_kw : array-like
        topic/word counts
    alpha, beta : float
        added to the counts when forming the log-weights
    init : bool, optional
        if True, skip decrementing ``n_kw`` for the word being replaced

    Returns
    -------
    tuple
        ``(docs, phi)``; ``phi`` is ``n_kw + beta`` normalised along each
        row
    """
    doc_topic = np.array(n_dk)
    topic_word = np.array(n_kw)
    n_topics = len(topic_word)
    n_words = len(topic_word[0])
    doc_totals = np.sum(doc_topic, axis=1)
    assert len(doc_totals) == len(docs)

    for d, doc in enumerate(docs):
        for pos, current in enumerate(doc['w']):
            topic = asgn[d][pos]

            doc_topic[d, topic] -= 1
            doc_totals[d] -= 1
            if not init:
                topic_word[topic, current] -= 1

            # Every slot is overwritten below, so no zero-fill is needed.
            scores = np.empty(n_words)
            for candidate in range(n_words):
                scores[candidate] = (
                    log(topic_word[topic, candidate] + beta)
                    + log(doc_topic[d, topic] + alpha)
                    - log(doc_totals[d] + alpha * n_topics)
                )

            drawn = discrete_draw(scores, logp=True)
            docs[d]['w'][pos] = drawn

            doc_topic[d, topic] += 1
            topic_word[topic, drawn] += 1
            doc_totals[d] += 1

    assert np.sum(doc_topic) == np.sum(topic_word)
    assert np.sum(doc_totals) == np.sum(doc_topic)

    phi = topic_word + beta
    row_sums = np.sum(phi, axis=1)
    phi /= row_sums[:, np.newaxis]
    return docs, phi
def __init__(self, docs, n_topics, n_words, alpha=1.0, beta=1.0,
             init_mode='prior', seed=None):
    """Build the count tables and assign a starting topic to each token.

    Parameters
    ----------
    docs : list<dict>
        list of document data structures; word ids are read from each
        document's ``'w'`` entry and index the second axis of the
        topic/word count table, so they must lie in ``[0, n_words)``
    n_topics : int
        number of topics
    n_words : int
        number of words in corpus (vocabulary)
    alpha : float (0, Inf), optional
        symmetric Dirichlet parameter; when ``init_mode='prior'`` the
        initial per-document topic distribution is drawn from
        Dirichlet(alpha)
    beta : float (0, Inf), optional
        symmetric Dirichlet parameter; stored only (presumably the
        topic/word prior -- confirm where ``self._beta`` is consumed)
    init_mode : {'prior', 'random'}, optional
        how the initial per-document topic distribution is chosen:
        ``'prior'`` -- a Dirichlet(alpha) draw; ``'random'`` -- uniform
        over topics
    seed : optional
        accepted but not used by this constructor

    Raises
    ------
    ValueError
        if ``init_mode`` is neither ``'prior'`` nor ``'random'``
    """
    # Reject an unknown mode up front; the original check sat inside the
    # document loop and was therefore skipped for an empty corpus.
    if init_mode not in ('prior', 'random'):
        raise ValueError("init_mode must be 'random' or 'prior'")

    self._docs = docs
    self._n_docs = len(docs)
    self._n_topics = n_topics
    self._n_words = n_words
    self._alpha = alpha
    self._beta = beta

    # number of words assigned to topic k in doc d
    self._n_dk = np.zeros((self._n_docs, self._n_topics))
    # number of times word w is assigned to topic k
    self._n_kw = np.zeros((self._n_topics, self._n_words))
    # number of times any word is assigned to topic k
    self._n_k = np.zeros((1, self._n_topics))

    # Entry z[d][w] is the topic to which the w^th word in document d is
    # assigned
    self._z = []
    # (document index, word id, token position) for every token, in
    # document order.
    self._key = []

    use_prior = init_mode == 'prior'
    for d, doc in enumerate(self._docs):
        if use_prior:
            theta_k = dirichlet.rvs([self._alpha] * self._n_topics)[0]
        else:
            theta_k = np.ones(self._n_topics) / self._n_topics

        doc_topics = []
        self._z.append(doc_topics)
        for w, wrd in enumerate(doc['w']):
            topic = int(discrete_draw(theta_k))
            doc_topics.append(topic)
            self._n_dk[d, topic] += 1.0
            self._n_kw[topic, wrd] += 1.0
            self._n_k[0, topic] += 1.0
            self._key.append((d, wrd, w))