def log_predictive_prob(self, new_corpus, num_samples):
    """
    Estimate the log probability of 'new_corpus' under the posterior
    implied by this instance's (training) counts, using a
    left-to-right style estimator with 'num_samples' particles.

    Documents are processed sequentially. For each particle, the
    component assignments of all previously processed new documents
    are first resampled (one Gibbs sweep), then the predictive
    probability of the current document (summed over components) is
    accumulated; the particles' contributions are averaged.

    Arguments:
    new_corpus -- held-out corpus; assumed to be iterable, to support
                  len(), and to expose its documents via a
                  'documents' list (TODO confirm against the corpus
                  class used elsewhere in this project)
    num_samples -- number of particles; larger values reduce the
                   variance of the estimate

    Returns the estimated log predictive probability (a float).
    """
    V, T = self.V, self.T

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Dt_plus_alpha_m = self.Dt_plus_alpha_m
    D_plus_alpha = self.D_plus_alpha

    # Per-particle counts for the *new* documents only; they are
    # added on top of the fixed training counts in the expressions
    # below.
    Nvt_new, Nt_new, Dt_new, z_new = [], [], [], []
    for r in xrange(num_samples):
        Nvt_new.append(zeros((T, V), dtype=int))
        Nt_new.append(zeros(T, dtype=int))
        Dt_new.append(zeros(T, dtype=int))
        z_new.append(zeros(len(new_corpus), dtype=int))

    log_p = 0

    for d, doc in enumerate(iterview(new_corpus)):
        tmp = zeros(num_samples, dtype=float)
        for r in xrange(num_samples):
            # Resample assignments of all previously processed new
            # documents (one Gibbs sweep for this particle).
            for prev_d in xrange(0, d):
                # BUG FIX: the original referenced the undefined
                # name 'corpus'; previously processed documents come
                # from 'new_corpus'.
                prev_doc = new_corpus.documents[prev_d]
                t = z_new[r][prev_d]
                # Remove prev_doc's counts from its current component.
                Nvt_new[r][t, :] -= prev_doc.Nv
                Nt_new[r][t] -= len(prev_doc)
                Dt_new[r][t] -= 1
                # Sample a new component from the (unnormalized) log
                # conditional posterior; log_sample normalizes.
                t = log_sample(gammaln(Nt_new[r] + Nt_plus_beta) -
                               gammaln(Nvt_new[r] +
                                       Nvt_plus_beta_n).sum(axis=1) +
                               gammaln(tile(prev_doc.Nv, (T, 1)) +
                                       Nvt_new[r] +
                                       Nvt_plus_beta_n).sum(axis=1) -
                               gammaln(len(prev_doc) * ones(T) +
                                       Nt_new[r] + Nt_plus_beta) +
                               log(Dt_new[r] + Dt_plus_alpha_m))
                Nvt_new[r][t, :] += prev_doc.Nv
                Nt_new[r][t] += len(prev_doc)
                Dt_new[r][t] += 1
                z_new[r][prev_d] = t
            # Log joint p(t, doc | previous docs) for every component;
            # subtracting log(d + D_plus_alpha) normalizes the prior
            # term over the total number of documents seen so far.
            log_dist = (gammaln(Nt_new[r] + Nt_plus_beta) -
                        gammaln(Nvt_new[r] +
                                Nvt_plus_beta_n).sum(axis=1) +
                        gammaln(tile(doc.Nv, (T, 1)) + Nvt_new[r] +
                                Nvt_plus_beta_n).sum(axis=1) -
                        gammaln(len(doc) * ones(T) + Nt_new[r] +
                                Nt_plus_beta) +
                        log(Dt_new[r] + Dt_plus_alpha_m) -
                        log(d + D_plus_alpha))
            # p(doc | previous docs) = sum over components.
            tmp[r] = log_sum_exp(log_dist)
            # Sample an assignment for doc and add its counts.
            t = log_sample(log_dist)
            Nvt_new[r][t, :] += doc.Nv
            Nt_new[r][t] += len(doc)
            Dt_new[r][t] += 1
            z_new[r][d] = t
        # Average the particles' predictive probabilities.
        log_p += log_sum_exp(tmp) - log(num_samples)

    return log_p
def gibbs_iteration(self, init=False):
    """
    Perform one sweep of collapsed Gibbs sampling over the
    document--component assignments given this instance's corpus.

    Each document is detached from its current component (its counts
    are subtracted), a new component is drawn from the conditional
    posterior, and the counts are restored under the new assignment.

    Keyword arguments:
    init -- if True, assignments (and counts) are being initialized,
            so no counts are subtracted before sampling
    """
    num_components = self.T
    smoothed_word_counts = self.Nvt_plus_beta_n   # T x V
    smoothed_token_counts = self.Nt_plus_beta     # length T
    doc_counts = self.Dt                          # documents per component
    prior = self.alpha_m
    assignments = self.z

    for idx, (doc, old_t) in enumerate(iterview(zip(self.corpus,
                                                    assignments))):
        if not init:
            # Remove this document's contribution to the counts.
            smoothed_word_counts[old_t, :] -= doc.Nv
            smoothed_token_counts[old_t] -= len(doc)
            doc_counts[old_t] -= 1

        # Unnormalized log conditional posterior over components;
        # log_sample normalizes and draws the new assignment.
        log_dist = (gammaln(smoothed_token_counts) -
                    gammaln(smoothed_word_counts).sum(axis=1) +
                    gammaln(tile(doc.Nv, (num_components, 1)) +
                            smoothed_word_counts).sum(axis=1) -
                    gammaln(len(doc) * ones(num_components) +
                            smoothed_token_counts) +
                    log(doc_counts + prior))
        new_t = log_sample(log_dist)

        # Restore the document's counts under its new assignment.
        smoothed_word_counts[new_t, :] += doc.Nv
        smoothed_token_counts[new_t] += len(doc)
        doc_counts[new_t] += 1
        assignments[idx] = new_t
def gibbs_iteration(self, init=False):
    """
    Perform one sweep of collapsed Gibbs sampling over the
    document--component assignments given this instance's corpus.

    Each document is detached from its current component (its counts
    are subtracted), a new component is drawn from the conditional
    posterior, and the counts are restored under the new assignment.
    Here the per-component document counts already include the
    smoothing term (self.Dt_plus_alpha_m).

    Keyword arguments:
    init -- if True, assignments (and counts) are being initialized,
            so no counts are subtracted before sampling
    """
    num_components = self.T
    smoothed_word_counts = self.Nvt_plus_beta_n    # T x V
    smoothed_token_counts = self.Nt_plus_beta      # length T
    smoothed_doc_counts = self.Dt_plus_alpha_m     # length T
    assignments = self.z

    for idx, (doc, old_t) in enumerate(iterview(zip(self.corpus,
                                                    assignments))):
        if not init:
            # Remove this document's contribution to the counts.
            smoothed_word_counts[old_t, :] -= doc.Nv
            smoothed_token_counts[old_t] -= len(doc)
            smoothed_doc_counts[old_t] -= 1

        # Unnormalized log conditional posterior over components;
        # log_sample normalizes and draws the new assignment.
        log_dist = (gammaln(smoothed_token_counts) -
                    gammaln(smoothed_word_counts).sum(axis=1) +
                    gammaln(tile(doc.Nv, (num_components, 1)) +
                            smoothed_word_counts).sum(axis=1) -
                    gammaln(len(doc) * ones(num_components) +
                            smoothed_token_counts) +
                    log(smoothed_doc_counts))
        new_t = log_sample(log_dist)

        # Restore the document's counts under its new assignment.
        smoothed_word_counts[new_t, :] += doc.Nv
        smoothed_token_counts[new_t] += len(doc)
        smoothed_doc_counts[new_t] += 1
        assignments[idx] = new_t
def log_predictive_prob(self, new_corpus, num_samples):
    """
    Estimate the log probability of 'new_corpus' under the posterior
    implied by this instance's (training) counts, using a
    left-to-right style estimator with 'num_samples' particles.

    Documents are processed sequentially. For each particle, the
    component assignments of all previously processed new documents
    are first resampled (one Gibbs sweep), then the predictive
    probability of the current document (summed over components) is
    accumulated; the particles' contributions are averaged.

    Arguments:
    new_corpus -- held-out corpus; assumed to be iterable, to support
                  len(), and to expose its documents via a
                  'documents' list (TODO confirm against the corpus
                  class used elsewhere in this project)
    num_samples -- number of particles; larger values reduce the
                   variance of the estimate

    Returns the estimated log predictive probability (a float).
    """
    V, T = self.V, self.T

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Dt_plus_alpha_m = self.Dt_plus_alpha_m
    D_plus_alpha = self.D_plus_alpha

    # Per-particle counts for the *new* documents only; they are
    # added on top of the fixed training counts in the expressions
    # below.
    Nvt_new, Nt_new, Dt_new, z_new = [], [], [], []
    for r in xrange(num_samples):
        Nvt_new.append(zeros((T, V), dtype=int))
        Nt_new.append(zeros(T, dtype=int))
        Dt_new.append(zeros(T, dtype=int))
        z_new.append(zeros(len(new_corpus), dtype=int))

    log_p = 0

    for d, doc in enumerate(iterview(new_corpus)):
        tmp = zeros(num_samples, dtype=float)
        for r in xrange(num_samples):
            # Resample assignments of all previously processed new
            # documents (one Gibbs sweep for this particle).
            for prev_d in xrange(0, d):
                # BUG FIX: the original referenced the undefined
                # name 'corpus'; previously processed documents come
                # from 'new_corpus'.
                prev_doc = new_corpus.documents[prev_d]
                t = z_new[r][prev_d]
                # Remove prev_doc's counts from its current component.
                Nvt_new[r][t, :] -= prev_doc.Nv
                Nt_new[r][t] -= len(prev_doc)
                Dt_new[r][t] -= 1
                # Sample a new component from the (unnormalized) log
                # conditional posterior; log_sample normalizes.
                t = log_sample(
                    gammaln(Nt_new[r] + Nt_plus_beta) -
                    gammaln(Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1) +
                    gammaln(
                        tile(prev_doc.Nv, (T, 1)) +
                        Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1) -
                    gammaln(
                        len(prev_doc) * ones(T) +
                        Nt_new[r] + Nt_plus_beta) +
                    log(Dt_new[r] + Dt_plus_alpha_m))
                Nvt_new[r][t, :] += prev_doc.Nv
                Nt_new[r][t] += len(prev_doc)
                Dt_new[r][t] += 1
                z_new[r][prev_d] = t
            # COMPLETED (replaces the 'YOUR CODE GOES HERE'
            # placeholder, which left 't' stale and 'tmp' all zeros):
            # log joint p(t, doc | previous docs) for every
            # component; subtracting log(d + D_plus_alpha) normalizes
            # the prior term over the total number of documents seen
            # so far.
            log_dist = (gammaln(Nt_new[r] + Nt_plus_beta) -
                        gammaln(Nvt_new[r] +
                                Nvt_plus_beta_n).sum(axis=1) +
                        gammaln(tile(doc.Nv, (T, 1)) + Nvt_new[r] +
                                Nvt_plus_beta_n).sum(axis=1) -
                        gammaln(len(doc) * ones(T) + Nt_new[r] +
                                Nt_plus_beta) +
                        log(Dt_new[r] + Dt_plus_alpha_m) -
                        log(d + D_plus_alpha))
            # p(doc | previous docs) = sum over components.
            tmp[r] = log_sum_exp(log_dist)
            # Sample an assignment for doc, then add its counts.
            t = log_sample(log_dist)
            Nvt_new[r][t, :] += doc.Nv
            Nt_new[r][t] += len(doc)
            Dt_new[r][t] += 1
            z_new[r][d] = t
        # Average the particles' predictive probabilities.
        log_p += log_sum_exp(tmp) - log(num_samples)

    return log_p