def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None): """ Nonconjugate split-merge. """ M = 10 # number of auxiliary samples D, V = N_DV.shape T = D + M - 1 # maximum number of topics N_D = N_DV.sum(1) # document lengths phi_TV = zeros((T, V)) # topic parameters inv_z_T = defaultdict(set) for d in xrange(D): inv_z_T[z_D[d]].add(d) # inverse mapping from topics to documents active_topics = set(unique(z_D)) inactive_topics = set(xrange(T)) - active_topics N_TV = zeros((T, V), dtype=int) N_T = zeros(T, dtype=int) for d in xrange(D): N_TV[z_D[d], :] += N_DV[d, :] N_T[z_D[d]] += N_D[d] D_T = bincount(z_D, minlength=T) # intialize topic parameters (necessary for Metropolis-Hastings only) for t in active_topics: phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V) for itn in xrange(num_itns): for _ in xrange(3): iteration( V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T, 6 ) algorithm_8_iteration( V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T ) if true_z_D is not None: v = vi(true_z_D, z_D) print "Itn. %d" % (itn + 1) print "%d topics" % len(active_topics) print "VI: %f bits (%f bits max.)" % (v, log2(D)) if v < 1e-6: break return phi_TV, z_D
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None): """ Nonconjugate split-merge. """ M = 10 # number of auxiliary samples D, V = N_DV.shape T = D + M - 1 # maximum number of topics N_D = N_DV.sum(1) # document lengths phi_TV = zeros((T, V)) # topic parameters inv_z_T = defaultdict(set) for d in xrange(D): inv_z_T[z_D[d]].add(d) # inverse mapping from topics to documents active_topics = set(unique(z_D)) inactive_topics = set(xrange(T)) - active_topics N_TV = zeros((T, V), dtype=int) N_T = zeros(T, dtype=int) for d in xrange(D): N_TV[z_D[d], :] += N_DV[d, :] N_T[z_D[d]] += N_D[d] D_T = bincount(z_D, minlength=T) # intialize topic parameters (necessary for Metropolis-Hastings only) for t in active_topics: phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V) for itn in xrange(num_itns): for _ in xrange(3): iteration(V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T, 6) algorithm_8_iteration(V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T) if true_z_D is not None: v = vi(true_z_D, z_D) print 'Itn. %d' % (itn + 1) print '%d topics' % len(active_topics) print 'VI: %f bits (%f bits max.)' % (v, log2(D)) if v < 1e-6: break return phi_TV, z_D
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None): """ Conjugate split-merge. """ D, V = N_DV.shape T = D # maximum number of topics N_D = N_DV.sum(1) # document lengths inv_z_T = defaultdict(set) for d in xrange(D): inv_z_T[z_D[d]].add(d) # inverse mapping from topics to documents active_topics = set(unique(z_D)) inactive_topics = set(xrange(T)) - active_topics N_TV = zeros((T, V), dtype=int) N_T = zeros(T, dtype=int) for d in xrange(D): N_TV[z_D[d], :] += N_DV[d, :] N_T[z_D[d]] += N_D[d] D_T = bincount(z_D, minlength=T) for itn in xrange(num_itns): for _ in xrange(3): iteration(V, D, N_DV, N_D, alpha, beta, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T, 6) algorithm_3_iteration(V, D, N_DV, N_D, alpha, beta, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T) if true_z_D is not None: v = vi(true_z_D, z_D) print 'Itn. %d' % (itn + 1) print '%d topics' % len(active_topics) print 'VI: %f bits (%f bits max.)' % (v, log2(D)) if v < 1e-6: break return z_D
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None): """ Algorithm 8. """ M = 10 # number of auxiliary samples D, V = N_DV.shape T = D + M - 1 # maximum number of topics N_D = N_DV.sum(1) # document lengths phi_TV = zeros((T, V)) # topic parameters active_topics = set(unique(z_D)) inactive_topics = set(xrange(T)) - active_topics N_TV = zeros((T, V), dtype=int) N_T = zeros(T, dtype=int) for d in xrange(D): N_TV[z_D[d], :] += N_DV[d, :] N_T[z_D[d]] += N_D[d] D_T = bincount(z_D, minlength=T) for itn in xrange(num_itns): iteration(V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, None, active_topics, inactive_topics, N_TV, N_T, D_T) if true_z_D is not None: v = vi(true_z_D, z_D) print 'Itn. %d' % (itn + 1) print '%d topics' % len(active_topics) print 'VI: %f bits (%f bits max.)' % (v, log2(D)) if v < 1e-6: break return phi_TV, z_D
def inference_algorithm_3(N_DV, alpha, beta, num_itns=250, true_z_D=None): """ Algorithm 3. """ D, V = N_DV.shape T = D # maximum number of topics N_D = N_DV.sum(1) # document lengths N_TV = zeros((T, V), dtype=int) N_T = zeros(T, dtype=int) z_D = range(D) # intialize every document to its own topic active_topics = set(unique(z_D)) inactive_topics = set(xrange(T)) - active_topics for d in xrange(D): N_TV[z_D[d], :] += N_DV[d, :] N_T[z_D[d]] += N_D[d] D_T = bincount(z_D, minlength=T) for itn in xrange(num_itns): for d in xrange(D): old_t = z_D[d] D_T[old_t] -= 1 N_TV[old_t, :] -= N_DV[d, :] N_T[old_t] -= N_D[d] log_dist = log(D_T) idx = old_t if D_T[old_t] == 0 else inactive_topics.pop() active_topics.add(idx) log_dist[idx] = log(alpha) for t in active_topics: log_dist[t] += gammaln(N_T[t] + beta) log_dist[t] -= gammaln(N_T[t] + N_D[d] + beta) tmp = N_TV[t, :] + beta / V log_dist[t] += gammaln(tmp + N_DV[d, :]).sum() log_dist[t] -= gammaln(tmp).sum() [t] = log_sample(log_dist) D_T[t] += 1 N_TV[t, :] += N_DV[d, :] N_T[t] += N_D[d] z_D[d] = t if t != idx: active_topics.remove(idx) inactive_topics.add(idx) if true_z_D is not None: print 'VI: %f bits (%f bits max.)' % (vi(true_z_D, z_D), log2(D)) for t in active_topics: print D_T[t], (N_TV[t, :] + beta / V) / (N_TV[t, :].sum() + beta) print len(active_topics) return z_D
def inference_algorithm_8(N_DV, alpha, beta, num_itns=250, true_z_D=None): """ Algorithm 8. """ M = 10 D, V = N_DV.shape T = D + M - 1 # maximum number of topics N_D = N_DV.sum(1) # document lengths N_TV = zeros((T, V), dtype=int) N_T = zeros(T, dtype=int) z_D = range(D) # intialize every document to its own topic phi_TV = zeros((T, V)) active_topics = set(unique(z_D)) inactive_topics = set(xrange(T)) - active_topics for d in xrange(D): N_TV[z_D[d], :] += N_DV[d, :] N_T[z_D[d]] += N_D[d] D_T = bincount(z_D, minlength=T) for itn in xrange(num_itns): for t in active_topics: phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V, 1) for d in xrange(D): old_t = z_D[d] D_T[old_t] -= 1 N_TV[old_t, :] -= N_DV[d, :] N_T[old_t] -= N_D[d] log_dist = log(D_T) idx = -1 * ones(M, dtype=int) idx[0] = old_t if D_T[old_t] == 0 else inactive_topics.pop() for m in xrange(1, M): idx[m] = inactive_topics.pop() active_topics |= set(idx) log_dist[idx] = log(alpha) - log(M) if idx[0] == old_t: phi_TV[idx[1:], :] = dirichlet(beta * ones(V) / V, M - 1) else: phi_TV[idx, :] = dirichlet(beta * ones(V) / V, M) for t in active_topics: log_dist[t] += (N_DV[d, :] * log(phi_TV[t, :])).sum() [t] = log_sample(log_dist) D_T[t] += 1 N_TV[t, :] += N_DV[d, :] N_T[t] += N_D[d] z_D[d] = t idx = set(idx) idx.discard(t) active_topics -= idx inactive_topics |= idx if true_z_D is not None: print 'VI: %f bits (%f bits max.)' % (vi(true_z_D, z_D), log2(D)) for t in active_topics: print D_T[t], (N_TV[t, :] + beta / V) / (N_TV[t, :].sum() + beta) print len(active_topics) return z_D
if true_z_D is not None: print 'VI: %f bits (%f bits max.)' % (vi(true_z_D, z_D), log2(D)) for t in active_topics: print D_T[t], (N_TV[t, :] + beta / V) / (N_TV[t, :].sum() + beta) print len(active_topics) return z_D if __name__ == '__main__': V = 5 D = 1000 l = 1000 alpha = 1.0 beta = 0.1 * V z_D, phi_TV, N_DV = generate_data(V, D, l, alpha, beta) for t in argsort(bincount(z_D))[::-1]: idx, = where(z_D[:] == t) print len(idx), phi_TV[t, :] inf_z_D = inference_algorithm_3(N_DV, alpha, beta, 250, z_D) print 'VI: %f bits (%f bits max.)' % (vi(z_D, inf_z_D), log2(D)) inf_z_D = inference_algorithm_8(N_DV, alpha, beta, 250, z_D)