Esempio n. 1
0
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    """
    Nonconjugate split-merge.

    Builds the count statistics and initial topic parameters implied by
    z_D, then performs num_itns outer iterations.  Each outer iteration
    makes three calls to iteration() followed by one call to
    algorithm_8_iteration(); both routines are defined elsewhere and
    receive the shared state built here.

    Arguments:

    N_DV -- 2-D count array with one row per document and one column
            per word type
    alpha -- forwarded unchanged to the sampling routines
    beta -- prior mass; beta / V is added to the per-topic counts when
            drawing the initial topic parameters
    z_D -- topic index of each document; the same object is forwarded
           to the sampling routines and returned
    num_itns -- number of outer iterations
    true_z_D -- optional reference assignment; when given, progress is
                printed after every outer iteration and the loop stops
                early once vi(true_z_D, z_D) drops below 1e-6

    Returns the tuple (phi_TV, z_D).
    """

    M = 10  # number of auxiliary samples

    D, V = N_DV.shape

    T = D + M - 1  # maximum number of topics

    N_D = N_DV.sum(1)  # document lengths

    # float() keeps the prior from being truncated to zero by Python 2
    # integer division when beta is passed as an int
    beta_V = beta / float(V)

    phi_TV = zeros((T, V))  # topic parameters

    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)

    inv_z_T = defaultdict(set)

    for d in range(D):
        t = z_D[d]
        inv_z_T[t].add(d)  # inverse mapping from topics to documents
        N_TV[t, :] += N_DV[d, :]
        N_T[t] += N_D[d]

    D_T = bincount(z_D, minlength=T)

    active_topics = set(unique(z_D))
    inactive_topics = set(range(T)) - active_topics

    # initialize topic parameters (necessary for Metropolis-Hastings only)

    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta_V)

    for itn in range(num_itns):

        for _ in range(3):
            # NOTE(review): the meaning of the trailing 6 is defined by
            # iteration(), which is not visible here
            iteration(
                V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T,
                active_topics, inactive_topics, N_TV, N_T, D_T, 6
            )

        algorithm_8_iteration(
            V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T,
            active_topics, inactive_topics, N_TV, N_T, D_T
        )

        if true_z_D is None:
            continue

        v = vi(true_z_D, z_D)

        # single parenthesized argument: prints identically on Python 2 and 3
        print("Itn. %d" % (itn + 1))
        print("%d topics" % len(active_topics))
        print("VI: %f bits (%f bits max.)" % (v, log2(D)))

        if v < 1e-6:
            break

    return phi_TV, z_D
Esempio n. 2
0
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    """
    Nonconjugate split-merge.

    Prepares the per-topic counts, the topic/document bookkeeping and
    the initial topic parameters for the assignment z_D, then runs
    num_itns outer iterations.  An outer iteration consists of three
    iteration() calls and one algorithm_8_iteration() call; both are
    defined elsewhere and operate on the state prepared here.

    Arguments:

    N_DV -- 2-D count array (documents by word types)
    alpha -- forwarded unchanged to the sampling routines
    beta -- prior mass; beta / V is added to the per-topic counts when
            the initial topic parameters are drawn
    z_D -- topic index of each document; forwarded to the sampling
           routines and returned as the same object
    num_itns -- number of outer iterations
    true_z_D -- optional reference assignment; if supplied, diagnostics
                are printed per outer iteration and sampling stops once
                vi(true_z_D, z_D) is below 1e-6

    Returns the tuple (phi_TV, z_D).
    """

    M = 10 # number of auxiliary samples

    D, V = N_DV.shape

    T = D + M - 1 # maximum number of topics

    N_D = N_DV.sum(1) # document lengths

    # an int beta would be floor-divided to 0 under Python 2 without
    # the explicit float conversion
    beta_V = beta / float(V)

    phi_TV = zeros((T, V)) # topic parameters

    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)

    inv_z_T = defaultdict(set)

    for d in range(D):
        t = z_D[d]
        inv_z_T[t].add(d) # inverse mapping from topics to documents
        N_TV[t, :] += N_DV[d, :]
        N_T[t] += N_D[d]

    D_T = bincount(z_D, minlength=T)

    active_topics = set(unique(z_D))
    inactive_topics = set(range(T)) - active_topics

    # initialize topic parameters (necessary for Metropolis-Hastings only)

    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta_V)

    for itn in range(num_itns):

        for _ in range(3):
            # NOTE(review): the trailing 6 is interpreted by iteration(),
            # which is not visible here
            iteration(V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T,
                      active_topics, inactive_topics, N_TV, N_T, D_T, 6)

        algorithm_8_iteration(V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D,
                              inv_z_T, active_topics, inactive_topics,
                              N_TV, N_T, D_T)

        if true_z_D is None:
            continue

        v = vi(true_z_D, z_D)

        # one parenthesized argument prints the same on Python 2 and 3
        print('Itn. %d' % (itn + 1))
        print('%d topics' % len(active_topics))
        print('VI: %f bits (%f bits max.)' % (v, log2(D)))

        if v < 1e-6:
            break

    return phi_TV, z_D
Esempio n. 3
0
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    """
    Conjugate split-merge.

    Derives the per-topic statistics implied by z_D and then runs
    num_itns outer iterations, each consisting of three iteration()
    calls followed by one algorithm_3_iteration() call (both defined
    elsewhere).  When true_z_D is given, diagnostics are printed after
    every outer iteration and the loop ends early once
    vi(true_z_D, z_D) falls below 1e-6.

    Returns z_D, the same object that was passed in.
    """

    D, V = N_DV.shape
    T = D  # maximum number of topics
    N_D = N_DV.sum(1)  # document lengths

    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)
    inv_z_T = defaultdict(set)  # inverse mapping from topics to documents

    for doc in xrange(D):
        topic = z_D[doc]
        inv_z_T[topic].add(doc)
        N_TV[topic, :] += N_DV[doc, :]
        N_T[topic] += N_D[doc]

    D_T = bincount(z_D, minlength=T)

    active_topics = set(unique(z_D))
    inactive_topics = set(xrange(T)) - active_topics

    # positional arguments shared by both sampling routines
    shared = (V, D, N_DV, N_D, alpha, beta, z_D, inv_z_T,
              active_topics, inactive_topics, N_TV, N_T, D_T)

    for itn in xrange(num_itns):

        for _ in xrange(3):
            # iteration() takes one extra trailing argument, 6
            iteration(*(shared + (6,)))

        algorithm_3_iteration(*shared)

        if true_z_D is None:
            continue

        v = vi(true_z_D, z_D)

        print('Itn. %d' % (itn + 1))
        print('%d topics' % len(active_topics))
        print('VI: %f bits (%f bits max.)' % (v, log2(D)))

        if v < 1e-6:
            break

    return z_D
Esempio n. 4
0
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    """
    Conjugate split-merge.

    Sets up the topic bookkeeping for the assignment z_D, then repeats
    num_itns times: three iteration() calls and one
    algorithm_3_iteration() call (both defined elsewhere).  If true_z_D
    is supplied, the iteration number, the number of active topics and
    vi(true_z_D, z_D) are printed each time, and sampling stops as soon
    as that value is below 1e-6.

    Returns z_D (the object passed in).
    """

    D, V = N_DV.shape

    T = D # maximum number of topics

    N_D = N_DV.sum(1) # document lengths

    active_topics = set(unique(z_D))
    inactive_topics = set(xrange(T)) - active_topics

    D_T = bincount(z_D, minlength=T)

    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)

    inv_z_T = defaultdict(set)

    for n in xrange(D):
        k = z_D[n]
        inv_z_T[k].add(n) # inverse mapping from topics to documents
        N_TV[k, :] += N_DV[n, :]
        N_T[k] += N_D[n]

    for itn in xrange(num_itns):

        for _ in xrange(3):
            iteration(
                V, D, N_DV, N_D, alpha, beta, z_D, inv_z_T,
                active_topics, inactive_topics, N_TV, N_T, D_T, 6,
            )

        algorithm_3_iteration(
            V, D, N_DV, N_D, alpha, beta, z_D, inv_z_T,
            active_topics, inactive_topics, N_TV, N_T, D_T,
        )

        if true_z_D is None:
            continue

        v = vi(true_z_D, z_D)

        print('Itn. %d' % (itn + 1))
        print('%d topics' % len(active_topics))
        print('VI: %f bits (%f bits max.)' % (v, log2(D)))

        if v < 1e-6:
            break

    return z_D
Esempio n. 5
0
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    """
    Algorithm 8.

    Builds the per-topic counts for the assignment z_D and calls
    iteration() (defined elsewhere) once per outer iteration, passing
    None in the position that follows z_D.  If true_z_D is supplied,
    diagnostics are printed after each iteration and the loop stops
    once vi(true_z_D, z_D) is below 1e-6.

    Returns the tuple (phi_TV, z_D); phi_TV is created here as an
    all-zero array and handed to iteration().
    """

    M = 10 # number of auxiliary samples

    D, V = N_DV.shape
    T = D + M - 1 # maximum number of topics

    N_D = N_DV.sum(1) # document lengths

    phi_TV = zeros((T, V)) # topic parameters
    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)

    for doc in xrange(D):
        topic = z_D[doc]
        N_TV[topic, :] += N_DV[doc, :]
        N_T[topic] += N_D[doc]

    D_T = bincount(z_D, minlength=T)

    active_topics = set(unique(z_D))
    inactive_topics = set(xrange(T)) - active_topics

    for itn in xrange(num_itns):

        iteration(
            V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, None,
            active_topics, inactive_topics, N_TV, N_T, D_T,
        )

        if true_z_D is None:
            continue

        v = vi(true_z_D, z_D)

        print('Itn. %d' % (itn + 1))
        print('%d topics' % len(active_topics))
        print('VI: %f bits (%f bits max.)' % (v, log2(D)))

        if v < 1e-6:
            break

    return phi_TV, z_D
Esempio n. 6
0
File: crp.py Progetto: aschein/dpmm
def inference_algorithm_3(N_DV, alpha, beta, num_itns=250, true_z_D=None):
    """
    Algorithm 3.

    Every document starts in its own topic.  In each of the num_itns
    sweeps, every document is removed from its topic, a log weight is
    computed for each active topic plus exactly one empty topic, a new
    topic is drawn with log_sample() (defined elsewhere) and the counts
    are updated.  After each sweep diagnostics are printed: the VI to
    true_z_D (if given), then for every active topic its document count
    and smoothed word distribution, then the number of active topics.

    Arguments:

    N_DV -- 2-D count array with one row per document and one column
            per word type
    alpha -- weight of the empty topic (enters as log(alpha))
    beta -- prior mass; beta / V is added to every per-word count
    num_itns -- number of sweeps over the documents
    true_z_D -- optional reference assignment used only for reporting

    Returns z_D, a list holding the topic index of every document.
    """

    D, V = N_DV.shape

    T = D # maximum number of topics

    N_D = N_DV.sum(1) # document lengths

    # loop invariants; float() keeps an int beta from being floor-divided
    # to zero under Python 2
    beta_V = beta / float(V)
    log_alpha = log(alpha)

    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)

    z_D = list(range(D)) # initialize every document to its own topic

    active_topics = set(unique(z_D))
    inactive_topics = set(range(T)) - active_topics

    for d in range(D):
        N_TV[z_D[d], :] += N_DV[d, :]
        N_T[z_D[d]] += N_D[d]

    D_T = bincount(z_D, minlength=T)

    for itn in range(num_itns):
        for d in range(D):

            old_t = z_D[d]

            # take document d out of the counts
            D_T[old_t] -= 1
            N_TV[old_t, :] -= N_DV[d, :]
            N_T[old_t] -= N_D[d]

            # empty topics get log(0) = -inf and so cannot be drawn
            log_dist = log(D_T)

            # offer exactly one empty topic: reuse the one just vacated,
            # otherwise borrow one from the inactive pool
            idx = old_t if D_T[old_t] == 0 else inactive_topics.pop()
            active_topics.add(idx)
            log_dist[idx] = log_alpha

            for t in active_topics:
                log_dist[t] += gammaln(N_T[t] + beta)
                log_dist[t] -= gammaln(N_T[t] + N_D[d] + beta)
                tmp = N_TV[t, :] + beta_V
                log_dist[t] += gammaln(tmp + N_DV[d, :]).sum()
                log_dist[t] -= gammaln(tmp).sum()

            [t] = log_sample(log_dist)

            D_T[t] += 1
            N_TV[t, :] += N_DV[d, :]
            N_T[t] += N_D[d]

            z_D[d] = t

            # the offered empty topic goes back to the pool if unused
            if t != idx:
                active_topics.remove(idx)
                inactive_topics.add(idx)

        if true_z_D is not None:
            print('VI: %f bits (%f bits max.)' % (vi(true_z_D, z_D), log2(D)))

        for t in active_topics:
            # '%s %s' reproduces the space-separated two-item print
            print('%s %s' % (
                D_T[t], (N_TV[t, :] + beta_V) / (N_TV[t, :].sum() + beta)))

        print(len(active_topics))

    return z_D
Esempio n. 7
0
File: crp.py Progetto: aschein/dpmm
def inference_algorithm_8(N_DV, alpha, beta, num_itns=250, true_z_D=None):
    """
    Algorithm 8.

    Every document starts in its own topic.  Each of the num_itns sweeps
    first redraws the parameters of every active topic from a Dirichlet
    whose parameters are that topic's counts plus beta / V.  Then, for
    every document, it is removed from its topic, M candidate topics
    with fresh parameters are made available (the document's own topic
    is reused as the first candidate, keeping its parameters, if the
    removal emptied it), a topic is drawn with log_sample() (defined
    elsewhere) and unused candidates are returned to the inactive pool.
    After each sweep diagnostics are printed: the VI to true_z_D (if
    given), then for every active topic its document count and smoothed
    word distribution, then the number of active topics.

    Arguments:

    N_DV -- 2-D count array with one row per document and one column
            per word type
    alpha -- total weight of the candidate topics; each candidate gets
             log(alpha) - log(M)
    beta -- prior mass; beta / V is the per-word Dirichlet parameter
    num_itns -- number of sweeps over the documents
    true_z_D -- optional reference assignment used only for reporting

    Returns z_D, a list holding the topic index of every document.
    """

    M = 10 # number of candidate topics offered to each document

    D, V = N_DV.shape

    T = D + M - 1 # maximum number of topics

    # loop invariants; float() keeps an int beta from being floor-divided
    # to zero under Python 2
    beta_V = beta / float(V)
    log_aux = log(alpha) - log(M)

    N_TV = zeros((T, V), dtype=int)

    z_D = list(range(D)) # initialize every document to its own topic

    phi_TV = zeros((T, V))

    active_topics = set(unique(z_D))
    inactive_topics = set(range(T)) - active_topics

    for d in range(D):
        N_TV[z_D[d], :] += N_DV[d, :]

    D_T = bincount(z_D, minlength=T)

    for itn in range(num_itns):

        for t in active_topics:
            phi_TV[t, :] = dirichlet(N_TV[t, :] + beta_V, 1)

        for d in range(D):

            old_t = z_D[d]

            # take document d out of the counts
            D_T[old_t] -= 1
            N_TV[old_t, :] -= N_DV[d, :]

            # empty topics get log(0) = -inf and so cannot be drawn
            log_dist = log(D_T)

            # candidate topics: the first is the vacated topic when the
            # removal emptied it, the rest come from the inactive pool
            idx = -1 * ones(M, dtype=int)
            idx[0] = old_t if D_T[old_t] == 0 else inactive_topics.pop()
            for m in range(1, M):
                idx[m] = inactive_topics.pop()
            active_topics |= set(idx)
            log_dist[idx] = log_aux

            # a reused topic keeps its parameters; all other candidates
            # get fresh draws
            if idx[0] == old_t:
                phi_TV[idx[1:], :] = dirichlet(beta * ones(V) / V, M - 1)
            else:
                phi_TV[idx, :] = dirichlet(beta * ones(V) / V, M)

            for t in active_topics:
                log_dist[t] += (N_DV[d, :] * log(phi_TV[t, :])).sum()

            [t] = log_sample(log_dist)

            D_T[t] += 1
            N_TV[t, :] += N_DV[d, :]

            z_D[d] = t

            # candidates that were not chosen go back to the pool
            unused = set(idx)
            unused.discard(t)
            active_topics -= unused
            inactive_topics |= unused

        if true_z_D is not None:
            print('VI: %f bits (%f bits max.)' % (vi(true_z_D, z_D), log2(D)))

        for t in active_topics:
            # '%s %s' reproduces the space-separated two-item print
            print('%s %s' % (
                D_T[t], (N_TV[t, :] + beta_V) / (N_TV[t, :].sum() + beta)))

        print(len(active_topics))

    return z_D
Esempio n. 8
0
File: crp.py Progetto: aschein/dpmm
        if true_z_D is not None:
            print 'VI: %f bits (%f bits max.)' % (vi(true_z_D, z_D), log2(D))

        for t in active_topics:
            print D_T[t], (N_TV[t, :] + beta / V) / (N_TV[t, :].sum() + beta)

        print len(active_topics)

    return z_D


# Demo script: generate data, print the true topics, then run both samplers
# against the generated data.
if __name__ == '__main__':

    # Arguments for generate_data (defined elsewhere).
    # NOTE(review): presumably V = number of word types, D = number of
    # documents, l = document length -- confirm against generate_data.
    V = 5
    D = 1000
    l = 1000
    alpha = 1.0
    beta = 0.1 * V

    z_D, phi_TV, N_DV = generate_data(V, D, l, alpha, beta)

    # for each true topic, from most to fewest documents, print its
    # document count and its row of phi_TV
    for t in argsort(bincount(z_D))[::-1]:
        idx, = where(z_D[:] == t)
        print len(idx), phi_TV[t, :]

    # the true assignment z_D is passed as true_z_D, so each sweep reports
    # the VI against it
    inf_z_D = inference_algorithm_3(N_DV, alpha, beta, 250, z_D)

    print 'VI: %f bits (%f bits max.)' % (vi(z_D, inf_z_D), log2(D))

    inf_z_D = inference_algorithm_8(N_DV, alpha, beta, 250, z_D)