def generate_data(V, D, l, alpha, beta):
    """
    Draws a synthetic corpus from a Dirichlet process mixture of
    multinomial topics, using a Chinese-restaurant-style sequential
    assignment of documents to topics. Each topic's word distribution
    is drawn from a symmetric Dirichlet prior the first time the
    topic is used.

    Arguments:

    V -- vocabulary size
    D -- number of documents
    l -- average document length (rate of the Poisson length draw)
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior

    Returns a tuple (phi_TV, z_D, N_DV):

    phi_TV -- D x V array of topic-word distributions; rows for
              topics that were never created remain all zero
    z_D -- length-D integer array of zero-based topic assignments
    N_DV -- D x V integer array of per-document word counts
    """

    max_topics = D  # at most one topic per document

    phi_TV = zeros((max_topics, V))
    N_DV = zeros((D, V), dtype=int)

    # While sampling, labels are one-based and 0 means "not assigned
    # yet"; they are shifted to zero-based just before returning.
    labels_D = zeros(D, dtype=int)

    for d in xrange(D):

        # Unnormalized assignment weights: slot 0 (which bincount
        # fills with the number of unassigned documents) is
        # overwritten with alpha and stands for "open a new topic";
        # slot k > 0 holds the current size of topic k.
        # NOTE(review): `sample` is assumed to accept unnormalized
        # weights and return a list of drawn indices -- confirm.
        weights = bincount(labels_D).astype(float)
        weights[0] = alpha
        new_label = len(weights)

        [drawn] = sample(weights)
        label = new_label if drawn == 0 else drawn
        labels_D[d] = label

        row = label - 1

        # a brand-new topic needs its word distribution drawn first
        if label == new_label:
            phi_TV[row, :] = dirichlet(beta * ones(V) / V)

        # draw the document length, then its tokens from the topic
        num_tokens = poisson(l)
        for v in sample(phi_TV[row, :], num_samples=num_tokens):
            N_DV[d, v] += 1

    return phi_TV, labels_D - 1, N_DV
def _topic_size_statistics(samples):
    """
    Computes the per-sample test statistics used by getting_it_right.

    Arguments:

    samples -- sequence of (z_D, N_DV) pairs; only z_D is inspected

    Returns four arrays with one entry per sample, in this order:
    number of distinct topics, maximum topic size, mean topic size,
    and standard deviation of topic sizes.
    """

    num_topics = []
    max_size = []
    mean_size = []
    std_size = []

    for z_D, _ in samples:
        topics = unique(z_D)
        sizes = array([(z_D == t).sum() for t in topics])
        num_topics.append(len(topics))
        max_size.append(sizes.max())
        mean_size.append(sizes.mean())
        std_size.append(sizes.std())

    return array(num_topics), array(max_size), array(mean_size), array(std_size)


# NOTE(review): another definition of getting_it_right with the same
# signature appears later in this file and is the one bound at import
# time; consider deleting one of the two.
def getting_it_right(algorithm, V, D, l, alpha, beta, num_itns, s):
    """
    Runs Geweke's "getting it right" test.

    Forward samples come straight from generate_data. Reverse samples
    come from a chain that alternates between redrawing the data given
    the current state and applying one call of algorithm.inference.
    Topic-size statistics of the two sample sets are then compared
    with P-P plots.

    Arguments:

    algorithm -- object exposing __name__ and inference(N_DV, alpha,
                 beta, z_D, 1); when __name__ is "algorithm_8" or
                 "nonconjugate_split_merge", inference is expected to
                 return (phi_TV, z_D), otherwise just z_D
    V -- vocabulary size
    D -- number of documents
    l -- average document length
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior
    num_itns -- number of forward samples and of reverse samples
    s -- value passed to seed() before any sampling

    Returns nothing; the result is the four P-P plots produced by
    pp_plot.
    """

    seed(s)

    # generate forward samples via the generative process

    print("Generating forward samples...")

    forward_samples = []

    for _ in iterview(xrange(num_itns)):
        forward_samples.append(generate_data(V, D, l, alpha, beta)[1:])

    # generate reverse samples via the inference algorithm

    print("Generating reverse samples...")

    # These algorithms keep explicit topic parameters, so the data
    # are redrawn from phi_TV instead of from the collapsed
    # predictive distribution. The name does not change between
    # iterations, so the check is done once.
    explicit_phi = algorithm.__name__ in ("algorithm_8",
                                          "nonconjugate_split_merge")

    reverse_samples = []

    phi_TV, z_D, _ = generate_data(V, D, l, alpha, beta)

    for _ in iterview(xrange(num_itns)):

        N_DV = zeros((D, V), dtype=int)

        if explicit_phi:
            for d in xrange(D):
                for v in sample(phi_TV[z_D[d], :], num_samples=poisson(l)):
                    N_DV[d, v] += 1

            phi_TV, z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)
        else:
            T = D  # maximum number of topics

            N_TV = zeros((T, V), dtype=int)
            N_T = zeros(T, dtype=int)

            for d in xrange(D):
                t = z_D[d]
                # Tokens are drawn one at a time because each draw
                # updates the counts that the next draw depends on.
                for _token in xrange(poisson(l)):
                    [v] = sample((N_TV[t, :] + beta / V) / (N_T[t] + beta))
                    N_DV[d, v] += 1
                    N_TV[t, v] += 1
                    N_T[t] += 1

            z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        # store a snapshot: z_D is handed back to inference on the
        # next iteration
        z_D_copy = empty_like(z_D)
        z_D_copy[:] = z_D

        reverse_samples.append((z_D_copy, N_DV))

    print("Computing test statistics...")

    # test statistics: number of topics, maximum topic size, mean
    # topic size, standard deviation of topic sizes

    forward_stats = _topic_size_statistics(forward_samples)
    reverse_stats = _topic_size_statistics(reverse_samples)

    # generate P-P plots, one per statistic

    for forward_stat, reverse_stat in zip(forward_stats, reverse_stats):
        pp_plot(forward_stat, reverse_stat)
def _topic_size_statistics(samples):
    """
    Computes the per-sample test statistics used by getting_it_right.

    Arguments:

    samples -- sequence of (z_D, N_DV) pairs; only z_D is inspected

    Returns four arrays with one entry per sample, in this order:
    number of distinct topics, maximum topic size, mean topic size,
    and standard deviation of topic sizes.
    """

    num_topics, max_size, mean_size, std_size = [], [], [], []

    for z_D, _ in samples:
        topics = unique(z_D)
        sizes = array([(z_D == t).sum() for t in topics])
        num_topics.append(len(topics))
        max_size.append(sizes.max())
        mean_size.append(sizes.mean())
        std_size.append(sizes.std())

    return array(num_topics), array(max_size), array(mean_size), array(std_size)


# NOTE(review): an earlier definition of getting_it_right with the
# same signature exists in this file; this later one is the binding
# that callers see. Consider deleting one of the two.
def getting_it_right(algorithm, V, D, l, alpha, beta, num_itns, s):
    """
    Runs Geweke's "getting it right" test.

    Two sets of (z_D, N_DV) samples are produced: forward samples,
    taken directly from generate_data, and reverse samples, taken
    from a chain that alternately redraws the data given the current
    state and applies one call of algorithm.inference. The
    distributions of several topic-size statistics under the two
    sets are compared with P-P plots.

    Arguments:

    algorithm -- object exposing __name__ and inference(N_DV, alpha,
                 beta, z_D, 1); when __name__ is 'algorithm_8' or
                 'nonconjugate_split_merge', inference is expected to
                 return (phi_TV, z_D), otherwise just z_D
    V -- vocabulary size
    D -- number of documents
    l -- average document length
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior
    num_itns -- number of forward samples and of reverse samples
    s -- value passed to seed() before any sampling

    Returns nothing; the result is the four P-P plots produced by
    pp_plot.
    """

    seed(s)

    # generate forward samples via the generative process

    print('Generating forward samples...')

    forward_samples = []

    for _ in iterview(xrange(num_itns)):
        _phi, forward_z_D, forward_N_DV = generate_data(V, D, l, alpha, beta)
        forward_samples.append((forward_z_D, forward_N_DV))

    # generate reverse samples via the inference algorithm

    print('Generating reverse samples...')

    # Algorithms that represent topic parameters explicitly need the
    # data redrawn from phi_TV; the others use the collapsed
    # predictive distribution. The name is fixed, so test it once.
    uses_phi = algorithm.__name__ in ('algorithm_8',
                                      'nonconjugate_split_merge')

    reverse_samples = []

    phi_TV, z_D, _ = generate_data(V, D, l, alpha, beta)

    for _ in iterview(xrange(num_itns)):

        N_DV = zeros((D, V), dtype=int)

        if uses_phi:
            for d in xrange(D):
                topic_dist = phi_TV[z_D[d], :]
                for v in sample(topic_dist, num_samples=poisson(l)):
                    N_DV[d, v] += 1

            phi_TV, z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)
        else:
            T = D  # maximum number of topics

            N_TV = zeros((T, V), dtype=int)
            N_T = zeros(T, dtype=int)

            for d in xrange(D):
                t = z_D[d]
                # One token at a time: every draw changes the counts
                # that define the distribution of the next draw.
                for _token in xrange(poisson(l)):
                    [v] = sample((N_TV[t, :] + beta / V) / (N_T[t] + beta))
                    N_DV[d, v] += 1
                    N_TV[t, v] += 1
                    N_T[t] += 1

            z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        # keep a snapshot, since z_D is passed back into inference on
        # the next iteration
        z_D_copy = empty_like(z_D)
        z_D_copy[:] = z_D

        reverse_samples.append((z_D_copy, N_DV))

    print('Computing test statistics...')

    # test statistics: number of topics, maximum topic size, mean
    # topic size, standard deviation of topic sizes

    (forward_num_topics, forward_max_topic_size,
     forward_mean_topic_size, forward_std_topic_size) = \
        _topic_size_statistics(forward_samples)

    (reverse_num_topics, reverse_max_topic_size,
     reverse_mean_topic_size, reverse_std_topic_size) = \
        _topic_size_statistics(reverse_samples)

    # generate P-P plots

    pp_plot(forward_num_topics, reverse_num_topics)
    pp_plot(forward_max_topic_size, reverse_max_topic_size)
    pp_plot(forward_mean_topic_size, reverse_mean_topic_size)
    pp_plot(forward_std_topic_size, reverse_std_topic_size)