Example #1
0
    def initialize(self, K):
        """Set up the variational state for a model with K topics.

        Requires self.documents to be loaded and self.D to exceed K.
        Allocates self.alpha, self.beta, self.phi (one array per document,
        each with K columns) and self.gamma, then marks the instance as
        initialized via self.is_initialized.
        """
        assert self.documents is not None

        # strictly more documents than topics, so that it's not singular
        assert self.D > K

        self.K = K
        num_docs = self.D
        vocab_size = self.W

        # symmetric prior: every topic receives the same weight, 3/K,
        # which spreads the topics more evenly (good for small datasets)
        self.alpha = np.ones(K) * (3.0 / K)
        uniform = 1.0 / K

        # starting point for the variational distribution q(beta|lambda)
        self.beta = topiclib.initialize_beta(K, vocab_size)

        # one (words-in-document x K) array per document, filled with 1/K
        words_per_doc = self.num_words_per(self.documents)
        self.phi = [np.ones((words_per_doc[index], K)) * uniform
                    for index in xrange(num_docs)]

        # gamma starts uniform and is then handed to
        # graphlib.initialize_random, which modifies it in place
        self.gamma = np.ones((num_docs, K)) * uniform
        graphlib.initialize_random(self.gamma)

        self.is_initialized = True
Example #2
0
def lm_E_step_for_doc(global_iteration,
                      d, document, comment,
                      alphaD, alphaC,
                      betaD, betaC,
                      gammaD, gammaC,
                      phiD, phiC,
                      y, eta, sigma_squared):
    """Run the E-step for document d and its comment.

    phiD and phiC are first passed to graphlib.initialize_random.  Each
    pass then updates both gammas, updates both phis (each with its own
    slice of eta) and overwrites y[d] with the dot product of eta and
    calculate_EZ_from_small_phis(phiD, phiC).  The local ELBO is
    recomputed on every other pass and fed to
    graphlib.elbo_did_not_converge, which decides when to stop.

    Returns the number of passes performed.
    """
    print("starting E step on doc {0}".format(d))
    graphlib.initialize_random(phiD)
    graphlib.initialize_random(phiC)

    passes = 0
    previous_elbo = graphlib.INITIAL_ELBO - 100
    current_elbo = graphlib.INITIAL_ELBO
    while graphlib.elbo_did_not_converge(current_elbo, previous_elbo, passes,
                                         criterion=0.1, max_iter=20):
        print('will update gamma...')
        lda_update_gamma(alphaD, phiD, gammaD)
        lda_update_gamma(alphaC, phiC, gammaC)

        # the column count of phiD marks where eta is split between
        # the document part and the comment part
        _, split = phiD.shape

        print('will update phis...')
        # each phi update must receive the matching part of eta
        slda_update_phi(document, phiD, gammaD, betaD,
                        y[d], eta[:split], sigma_squared)
        slda_update_phi(comment, phiC, gammaC, betaC,
                        y[d], eta[split:], sigma_squared)

        print('will calculate y...')
        # response variable: y = eta^T E[Z]
        expected_z = calculate_EZ_from_small_phis(phiD, phiC)
        y[d] = np.dot(eta, expected_z)

        # the ELBO is only refreshed on even-numbered passes
        if passes % 2 == 0:
            print('will calculate elbo...')
            previous_elbo, current_elbo = current_elbo, lm_local_elbo(
                document, comment,
                alphaD, alphaC,
                betaD, betaC,
                gammaD, gammaC,
                phiD, phiC,
                y[d], eta, sigma_squared)
        passes += 1

        print("{2}: e-step iteration {0} ELBO: {1}".format(passes, current_elbo, global_iteration))
    print("{2}: done e-step on doc {3}: {0} iterations ELBO: {1}".format(passes, current_elbo, global_iteration, d))
    return passes
Example #3
0
def partial_slda_E_step_for_doc(global_iteration,
                                last_local_iterations,
                                d, document, y,
                                alpha, beta, gamma, phi,
                                eta, sigma_squared):
    """Run the partial-sLDA E-step for document d.

    "Partial" means eta has fewer entries than phi has columns; this is
    checked with ensure().  phi is first passed to
    graphlib.initialize_random, then gamma and phi are updated
    alternately until graphlib.elbo_did_not_converge says to stop.

    The local ELBO is only recomputed when last_local_iterations is 0.
    Otherwise the ELBO values keep their initial settings and
    last_local_iterations is passed as max_iter.

    Returns the number of passes performed.
    """
    graphlib.initialize_random(phi)

    # shape sanity checks
    ensure(phi.shape[1] == beta.shape[0] == len(gamma) == len(alpha))
    ensure(phi.shape[0] == len(document))
    ensure(len(eta) < phi.shape[1])  # is partial

    passes = 0
    min_iter = 20 - global_iteration
    if last_local_iterations > 0:
        # reuse the pass count reported by the previous run
        max_iter = last_local_iterations
    else:
        max_iter = 20

    previous_elbo = graphlib.INITIAL_ELBO - 100
    current_elbo = graphlib.INITIAL_ELBO
    while graphlib.elbo_did_not_converge(current_elbo, previous_elbo, passes,
                                         criterion=0.01,
                                         min_iter=min_iter, max_iter=max_iter):
        lda_update_gamma(alpha, phi, gamma)

        # a full-length eta would make this a full update instead
        ensure(len(eta) < phi.shape[1])
        partial_slda_update_phi(document, phi, gamma, beta, y, eta, sigma_squared)

        # skip the ELBO evaluation whenever a previous pass count is
        # available; the loop is then bounded by max_iter
        if last_local_iterations == 0:
            previous_elbo, current_elbo = current_elbo, partial_slda_local_elbo(
                document, y,
                alpha, beta, gamma, phi,
                eta, sigma_squared)
        passes += 1

    # only report every hundredth document
    if d % 100 == 0:
        print("{2}: done pSLDA e-step on doc {3}: {0} iterations ELBO: {1}".format(passes, current_elbo, global_iteration, d))
    return passes
Example #4
0
def test_initialize_random():
    """Check graphlib.initialize_random and graphlib.initialize_log_random.

    Both are applied to a copy of an all-ones 4x7 array.  The result must
    keep its shape, differ from the input, and have rows that sum to one
    (for the log variant: rows whose logsumexp exponentiates to one).
    """
    shape = (4, 7)

    # plain variant
    baseline = np.ones(shape)
    randomized = baseline.copy()
    graphlib.initialize_random(randomized)
    assert baseline.shape == randomized.shape
    assert not same(randomized, baseline)

    row_totals = np.sum(randomized, axis=1)
    assert same(row_totals, np.ones(randomized.shape[0]))

    # log variant
    baseline = np.ones(shape)
    randomized = baseline.copy()
    graphlib.initialize_log_random(randomized)
    assert baseline.shape == randomized.shape
    assert not same(randomized, baseline)

    log_row_totals = graphlib.logsumexp(randomized, axis=1)
    assert same(np.exp(log_row_totals), np.ones(randomized.shape[0]))
Example #5
0
def slda_E_step_for_doc(global_iteration,
                        last_local_iterations,
                        d, document, y,
                        alpha, beta, gamma, phi,
                        eta, sigma_squared):
    """Run the sLDA E-step for document d.

    phi is first passed to graphlib.initialize_random, then gamma and phi
    are updated alternately until graphlib.elbo_did_not_converge says to
    stop.  y is only read here; it is passed on to the phi update and the
    ELBO calculation.

    The local ELBO is only recomputed when last_local_iterations is 0.
    Otherwise the ELBO values keep their initial settings and
    last_local_iterations is passed as max_iter.

    Returns the number of passes performed.
    """
    print("starting E step on doc {0}".format(d))
    graphlib.initialize_random(phi)

    passes = 0
    if last_local_iterations > 0:
        # reuse the pass count reported by the previous run
        max_iter = last_local_iterations
    else:
        max_iter = 20

    previous_elbo = graphlib.INITIAL_ELBO - 100
    current_elbo = graphlib.INITIAL_ELBO
    while graphlib.elbo_did_not_converge(current_elbo, previous_elbo, passes,
                                         criterion=0.01, max_iter=max_iter):
        lda_update_gamma(alpha, phi, gamma)
        slda_update_phi(document, phi, gamma, beta, y, eta, sigma_squared)

        # skip the ELBO evaluation whenever a previous pass count is
        # available; the loop is then bounded by max_iter
        if last_local_iterations == 0:
            previous_elbo, current_elbo = current_elbo, slda_local_elbo(
                document, y,
                alpha, beta, gamma, phi,
                eta, sigma_squared)
        passes += 1

    print("{2}: done e-step on doc {3}: {0} iterations ELBO: {1}".format(passes, current_elbo, global_iteration, d))
    return passes
Example #6
0
def lda_E_step_for_doc(global_iteration,
                       last_local_iterations,
                       d, document,
                       alpha, beta,
                       gamma, phi):
    """Run the LDA E-step for document d.

    phi is first passed to graphlib.initialize_random, then gamma and phi
    are updated alternately until graphlib.elbo_did_not_converge says to
    stop.

    The local ELBO is only recomputed when last_local_iterations is 0.
    Otherwise the ELBO values keep their initial settings and
    last_local_iterations is passed as max_iter.

    Returns the number of passes performed.
    """
    graphlib.initialize_random(phi)

    # shape sanity checks
    ensure(phi.shape[1] == beta.shape[0] == len(gamma) == len(alpha))
    ensure(phi.shape[0] == len(document))

    passes = 0
    min_iter = 20 - global_iteration
    if last_local_iterations > 0:
        # reuse the pass count reported by the previous run
        max_iter = last_local_iterations
    else:
        max_iter = 20

    previous_elbo = graphlib.INITIAL_ELBO - 100
    current_elbo = graphlib.INITIAL_ELBO
    while graphlib.elbo_did_not_converge(current_elbo, previous_elbo, passes,
                                         criterion=0.1,
                                         min_iter=min_iter, max_iter=max_iter):
        lda_update_gamma(alpha, phi, gamma)
        lda_update_phi(document, phi, gamma, beta)

        # skip the ELBO evaluation whenever a previous pass count is
        # available; the loop is then bounded by max_iter
        if last_local_iterations == 0:
            previous_elbo, current_elbo = current_elbo, lda_local_elbo(
                document, alpha, beta, gamma, phi)
        passes += 1

    # only report every hundredth document
    if d % 100 == 0:
        print("{2}: done LDA e-step on doc {3}: {0} iterations ELBO: {1}".format(passes, current_elbo, global_iteration, d))
    return passes
Example #7
0
    def initialize(self, Ku, Ks, Kb):
        """Set up the variational state from three topic counts.

        Ku, Ks and Kb are stored on the instance together with their total
        K and the combined sizes Kc = Ku + Ks and Kl = Ks + Kb.  Requires
        self.documents to be loaded and self.D to exceed K.

        Allocates beta, the three alpha vectors, per-text phi arrays and
        gamma matrices for the documents, comments, labeled and background
        texts, the word totals and their multipliers, eta and
        sigma_squared, then marks the instance as initialized.
        """
        assert self.documents is not None
        assert Ku is not None
        assert Ks is not None
        assert Kb is not None

        total_topics = Ku + Ks + Kb

        # strictly more documents than topics, so that it's not singular
        assert self.D > total_topics

        self.K = total_topics
        self.Ku = Ku
        self.Ks = Ks
        self.Kb = Kb

        self.Kc = self.Ku + self.Ks
        self.Kl = self.Ks + self.Kb

        vocab_size = self.W

        # starting point for the variational distribution q(beta|lambda)
        self.beta = topiclib.initialize_beta(total_topics, vocab_size)

        # symmetric priors: each group's weights are 1/(group size)
        self.alphaU = np.ones(Ku) * (1.0 / Ku)
        self.alphaS = np.ones(Ks) * (1.0 / Ks)
        self.alphaB = np.ones(Kb) * (1.0 / Kb)

        # todo: not using this yet
        #self.alphaD = ...

        def uniform_phi(Nds, size):
            # one (word count x size) array per entry, filled with 1/size
            return [np.ones((count, size)) * (1.0 / size) for count in Nds]

        document_Nds = self.num_words_per(self.documents)
        self.phiD = uniform_phi(document_Nds, self.Ku)
        comment_Nds = self.num_words_per(self.comments)
        self.phiC = uniform_phi(comment_Nds, self.Kc)
        labeled_Nds = self.num_words_per(self.labeled)
        self.phiL = uniform_phi(labeled_Nds, self.Kl)
        background_Nds = self.num_words_per(self.background)
        self.phiB = uniform_phi(background_Nds, self.Kb)

        self.num_document_words = sum(document_Nds)
        self.num_comment_words = sum(comment_Nds)
        self.num_labeled_words = sum(labeled_Nds)
        self.num_background_words = sum(background_Nds)

        # each multiplier is the largest word total divided by that
        # collection's own word total
        largest_total = float(max(self.num_document_words,
                                  self.num_comment_words,
                                  self.num_labeled_words,
                                  self.num_background_words))
        self.document_multiplier = largest_total / self.num_document_words
        self.comment_multiplier = largest_total / self.num_comment_words
        self.labeled_multiplier = largest_total / self.num_labeled_words
        self.background_multiplier = largest_total / self.num_background_words

        # gammas start uniform and are then handed to
        # graphlib.initialize_random, which modifies them in place
        self.gammaD = np.ones((self.D, self.Ku)) * (1.0 / self.Ku)
        self.gammaC = np.ones((self.D, self.Kc)) * (1.0 / self.Kc)
        self.gammaL = np.ones((self.L, self.Kl)) * (1.0 / self.Kl)
        self.gammaB = np.ones((self.B, self.Kb)) * (1.0 / self.Kb)
        for gamma in (self.gammaD, self.gammaC, self.gammaL, self.gammaB):
            graphlib.initialize_random(gamma)

        # eta has one entry per Ks topic
        self.eta = graphlib.random_normal(0, 2.0, (Ks,))
        self.sigma_squared = 0.5

        print('eta start: {0}'.format(self.eta))

        self.is_initialized = True