コード例 #1
0
def generate_data(V, D, l, alpha, beta):
    """
    Draws a synthetic corpus of documents from a Dirichlet process
    mixture model whose mixture components (topics) are multinomials
    with a symmetric Dirichlet prior.

    Arguments:

    V -- vocabulary size
    D -- number of documents
    l -- average document length (passed to poisson() to draw the
         number of tokens in each document)
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior

    Returns a tuple (phi_TV, z_D, N_DV):

    phi_TV -- D x V array of topic parameters; rows for topics that
              were never created are left as zeros
    z_D -- zero-based topic assignment for each document
    N_DV -- D x V integer array of per-document token counts
    """

    max_topics = D # every document could end up in its own topic

    phi_TV = zeros((max_topics, V))
    N_DV = zeros((D, V), dtype=int)

    # inside the loop assignments are one-based; zero marks a
    # document that has not been assigned yet

    z_D = zeros(D, dtype=int)

    for d in xrange(D):

        # existing topics are weighted by their current size, while
        # slot 0 (the "new topic" option) is weighted by alpha

        weights = bincount(z_D).astype(float)
        weights[0] = alpha
        new_topic = len(weights)

        # NOTE(review): sample() is defined elsewhere; it presumably
        # returns indices drawn in proportion to the given weights

        [t] = sample(weights)

        if t == 0:
            t = new_topic

        z_D[d] = t

        # a brand new topic needs parameters drawn from the prior

        if t == new_topic:
            phi_TV[t - 1, :] = dirichlet(beta * ones(V) / V)

        # draw the document's tokens from the chosen topic

        num_tokens = poisson(l)

        for v in sample(phi_TV[t - 1, :], num_samples=num_tokens):
            N_DV[d, v] += 1

    # shift the assignments to zero-based topic indices

    return phi_TV, z_D - 1, N_DV
コード例 #2
0
ファイル: getting_it_right.py プロジェクト: hijbul/dpmm
def _topic_size_statistics(samples):
    """
    Computes the test statistics for a list of (z_D, N_DV) samples.

    Each sample's topic assignments z_D are summarized by the number
    of distinct topics and by the maximum, mean, and standard
    deviation of the topic sizes (number of documents per topic).
    The N_DV part of each sample is not used.

    Returns a tuple of four arrays (number of topics, maximum topic
    size, mean topic size, standard deviation of topic sizes), each
    with one entry per sample.
    """

    num_topics = []
    max_topic_size = []
    mean_topic_size = []
    std_topic_size = []

    for z_D, _ in samples:
        topics = unique(z_D)
        topic_sizes = array([(z_D == t).sum() for t in topics])
        num_topics.append(len(topics))
        max_topic_size.append(topic_sizes.max())
        mean_topic_size.append(topic_sizes.mean())
        std_topic_size.append(topic_sizes.std())

    return (array(num_topics), array(max_topic_size),
            array(mean_topic_size), array(std_topic_size))


def getting_it_right(algorithm, V, D, l, alpha, beta, num_itns, s):
    """
    Runs Geweke's "getting it right" test.

    Forward samples come straight from generate_data(). Reverse
    samples come from a chain that alternates between re-drawing the
    data given the current state and calling algorithm.inference().
    The same test statistics are computed for both sets of samples
    and compared using P-P plots.

    Arguments:

    algorithm -- object providing __name__ and inference(); when
                 __name__ is "algorithm_8" or
                 "nonconjugate_split_merge", inference() must return
                 (phi_TV, z_D), otherwise it must return z_D
    V -- vocabulary size
    D -- number of documents
    l -- average document length
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior
    num_itns -- number of forward samples and of reverse samples
    s -- seed passed to seed()

    Returns nothing; the results are passed to pp_plot().
    """

    seed(s)

    # generate forward samples via the generative process

    print("Generating forward samples...")

    forward_samples = []

    for _ in iterview(xrange(num_itns)):
        forward_samples.append(generate_data(V, D, l, alpha, beta)[1:])

    # generate reverse samples via the inference algorithm

    print("Generating reverse samples...")

    reverse_samples = []

    phi_TV, z_D, _ = generate_data(V, D, l, alpha, beta)

    # these algorithms carry the topic parameters as part of their
    # state, so data is drawn from phi_TV directly; the check does
    # not change between iterations

    uses_phi = algorithm.__name__ in ("algorithm_8",
                                      "nonconjugate_split_merge")

    T = D  # maximum number of topics

    # explicit floats keep the predictive probabilities below from
    # being truncated by integer division when beta is an int

    beta = float(beta)
    beta_per_type = beta / V

    for _ in iterview(xrange(num_itns)):

        N_DV = zeros((D, V), dtype=int)

        if uses_phi:

            for d in xrange(D):
                for v in sample(phi_TV[z_D[d], :], num_samples=poisson(l)):
                    N_DV[d, v] += 1

            # NOTE(review): the trailing 1 is presumably the number
            # of inference iterations -- confirm against inference()

            phi_TV, z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        else:

            # no explicit topic parameters: draw each token from the
            # predictive distribution given the tokens drawn so far
            # (in this iteration) for the same topic

            N_TV = zeros((T, V), dtype=int)
            N_T = zeros(T, dtype=int)

            for d in xrange(D):
                t = z_D[d]
                for n in xrange(poisson(l)):
                    [v] = sample((N_TV[t, :] + beta_per_type) /
                                 (N_T[t] + beta))
                    N_DV[d, v] += 1
                    N_TV[t, v] += 1
                    N_T[t] += 1

            z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        # store a copy so that the saved sample does not share memory
        # with the z_D that is passed to the next inference() call

        reverse_samples.append((array(z_D), N_DV))

    print("Computing test statistics...")

    # test statistics: number of topics, maximum topic size, mean
    # topic size, standard deviation of topic sizes

    forward_statistics = _topic_size_statistics(forward_samples)
    reverse_statistics = _topic_size_statistics(reverse_samples)

    # generate P-P plots (one per test statistic)

    for forward, reverse in zip(forward_statistics, reverse_statistics):
        pp_plot(forward, reverse)
コード例 #3
0
def _topic_size_statistics(samples):
    """
    Computes the test statistics for a list of (z_D, N_DV) samples.

    Each sample's topic assignments z_D are summarized by the number
    of distinct topics and by the maximum, mean, and standard
    deviation of the topic sizes (number of documents per topic).
    The N_DV part of each sample is not used.

    Returns a tuple of four arrays (number of topics, maximum topic
    size, mean topic size, standard deviation of topic sizes), each
    with one entry per sample.
    """

    num_topics = []
    max_topic_size = []
    mean_topic_size = []
    std_topic_size = []

    for z_D, _ in samples:
        topics = unique(z_D)
        topic_sizes = array([(z_D == t).sum() for t in topics])
        num_topics.append(len(topics))
        max_topic_size.append(topic_sizes.max())
        mean_topic_size.append(topic_sizes.mean())
        std_topic_size.append(topic_sizes.std())

    return (array(num_topics), array(max_topic_size),
            array(mean_topic_size), array(std_topic_size))


def getting_it_right(algorithm, V, D, l, alpha, beta, num_itns, s):
    """
    Runs Geweke's "getting it right" test.

    Forward samples come straight from generate_data(). Reverse
    samples come from a chain that alternates between re-drawing the
    data given the current state and calling algorithm.inference().
    The same test statistics are computed for both sets of samples
    and compared using P-P plots.

    Arguments:

    algorithm -- object providing __name__ and inference(); when
                 __name__ is 'algorithm_8' or
                 'nonconjugate_split_merge', inference() must return
                 (phi_TV, z_D), otherwise it must return z_D
    V -- vocabulary size
    D -- number of documents
    l -- average document length
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior
    num_itns -- number of forward samples and of reverse samples
    s -- seed passed to seed()

    Returns nothing; the results are passed to pp_plot().
    """

    seed(s)

    # generate forward samples via the generative process

    print('Generating forward samples...')

    forward_samples = []

    for _ in iterview(xrange(num_itns)):
        forward_samples.append(generate_data(V, D, l, alpha, beta)[1:])

    # generate reverse samples via the inference algorithm

    print('Generating reverse samples...')

    reverse_samples = []

    phi_TV, z_D, _ = generate_data(V, D, l, alpha, beta)

    # these algorithms carry the topic parameters as part of their
    # state, so data is drawn from phi_TV directly; the check does
    # not change between iterations

    uses_phi = algorithm.__name__ in ('algorithm_8',
                                      'nonconjugate_split_merge')

    T = D  # maximum number of topics

    # explicit floats keep the predictive probabilities below from
    # being truncated by integer division when beta is an int

    beta = float(beta)
    beta_per_type = beta / V

    for _ in iterview(xrange(num_itns)):

        N_DV = zeros((D, V), dtype=int)

        if uses_phi:

            for d in xrange(D):
                for v in sample(phi_TV[z_D[d], :], num_samples=poisson(l)):
                    N_DV[d, v] += 1

            # NOTE(review): the trailing 1 is presumably the number
            # of inference iterations -- confirm against inference()

            phi_TV, z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        else:

            # no explicit topic parameters: draw each token from the
            # predictive distribution given the tokens drawn so far
            # (in this iteration) for the same topic

            N_TV = zeros((T, V), dtype=int)
            N_T = zeros(T, dtype=int)

            for d in xrange(D):
                t = z_D[d]
                for n in xrange(poisson(l)):
                    [v] = sample((N_TV[t, :] + beta_per_type) /
                                 (N_T[t] + beta))
                    N_DV[d, v] += 1
                    N_TV[t, v] += 1
                    N_T[t] += 1

            z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        # store a copy so that the saved sample does not share memory
        # with the z_D that is passed to the next inference() call

        reverse_samples.append((array(z_D), N_DV))

    print('Computing test statistics...')

    # test statistics: number of topics, maximum topic size, mean
    # topic size, standard deviation of topic sizes

    forward_statistics = _topic_size_statistics(forward_samples)
    reverse_statistics = _topic_size_statistics(reverse_samples)

    # generate P-P plots (one per test statistic)

    for forward, reverse in zip(forward_statistics, reverse_statistics):
        pp_plot(forward, reverse)