Example #1
0
    def test_basics(self):
        """Check that constructor arguments and the ``alpha`` setter round-trip.

        Builds an ``OnlineLDA`` model from scalar hyperparameters, verifies
        that the model reports the same sizes and values back, that assigning
        an ``alpha`` with the wrong number of entries raises ``RuntimeError``,
        and that assigning a valid ``alpha`` is stored unchanged.
        """
        W = 102  # number of words
        D = 1010  # number of documents
        K = 11  # number of topics
        alpha = .27
        eta = 3.1

        model = OnlineLDA(num_words=W,
                          num_topics=K,
                          num_documents=D,
                          alpha=alpha,
                          eta=eta)

        self.assertEqual(K, model.num_topics)
        self.assertEqual(K, model.alpha.size)
        self.assertEqual(D, model.num_documents)
        self.assertEqual(W, model.num_words)

        # The scalar alpha is expected in every one of the K entries. Check
        # all of them instead of a single randomly drawn index so the test is
        # deterministic and no entry is left unverified.
        for value in model.alpha.ravel():
            self.assertEqual(alpha, value)
        self.assertEqual(eta, model.eta)

        # An alpha with K + 1 entries does not match the number of topics.
        with self.assertRaises(RuntimeError):
            model.alpha = random.rand(K + 1)

        # A K x 1 array must be accepted and read back without modification.
        alpha = random.rand(K, 1)
        model.alpha = alpha
        self.assertLess(max(abs(model.alpha.ravel() - alpha.ravel())), 1e-20)
Example #2
0
    def test_lower_bound(self):
        """Compare ``OnlineLDA.lower_bound`` against the reference implementation.

        Both models are given the same ``alpha`` and ``lambda`` parameters,
        documents are sampled from the ``OnlineLDA`` model, and the two
        lower-bound estimates must agree to within 1% relative error.
        """
        W = 100
        K = 22
        D = 30
        N = 60

        # generate random vocabulary
        vocab = [
            ''.join(
                choice(ascii_letters) for _ in range(5 + random.randint(10)))
            for _ in range(W)
        ]
        # NOTE(review): the positional hyperparameters are passed through
        # as-is; their meaning is defined by ReferenceLDA, not visible here.
        model0 = ReferenceLDA(vocab, K, D, 0.1, 0.3, 1024., 0.9)

        # copy the reference model's parameters so both models are identical
        model1 = OnlineLDA(num_words=W, num_topics=K, num_documents=D)
        model1.alpha = model0._alpha
        model1.lambdas = model0._lambda

        # generate D/2 random documents of average length N
        docs1 = model1.sample(D // 2, N)

        # Transpose each document from a sequence of pairs into a pair of
        # sequences for the reference model. On Python 3 ``zip`` returns a
        # one-shot iterator, so materialize it; this matches what Python 2
        # produced and keeps the documents indexable and re-iterable.
        docs0 = [list(zip(*doc)) for doc in docs1]

        # estimate lower bound
        elbo1 = model1.lower_bound(docs1)
        elbo0 = model0.approx_bound(docs0)

        # estimate should deviate less than 1% from reference estimate
        self.assertLess(abs(elbo1 - elbo0) / abs(elbo0), 0.01)
Example #3
0
    def test_empirical_bayes_alpha(self):
        """Check that empirical Bayes updates move ``alpha`` the right way.

        Documents are sampled from a model whose first ``alpha`` entry is
        larger than its second. ``alpha`` is then overwritten with a
        deliberately wrong value and re-estimated with repeated parameter
        updates; afterwards the ordering of the entries must be recovered and
        both entries must have dropped below the wrong starting value.
        """
        # two topics over four words, each topic concentrated on two words
        topic_word_weights = [[100, 100, 1e-16, 1e-16],
                              [1e-16, 1e-16, 100, 100]]
        wrong_alpha = 4.

        model = OnlineLDA(
            num_words=4,
            num_topics=2,
            num_documents=1000,
            alpha=[.2, .01],
            eta=.2,
        )
        model.lambdas = topic_word_weights

        documents = model.sample(100, 10)

        # start the estimation from wrong alpha values
        model.alpha = [wrong_alpha, wrong_alpha]

        # only alpha is updated; lambda is kept fixed
        update_options = dict(rho=.1,
                              max_iter_tr=0,
                              update_lambda=False,
                              update_alpha=True)
        for _ in range(100):
            model.update_parameters(documents, **update_options)

        # empirical Bayes should have moved alpha in the right direction
        self.assertGreater(model.alpha[0], model.alpha[1])
        self.assertLess(model.alpha[0], wrong_alpha)
        self.assertLess(model.alpha[1], wrong_alpha)
Example #4
0
    def test_vi(self):
        """Compare the variational E-step with the reference implementation.

        Both models share the same ``alpha`` and ``lambda`` parameters and the
        same initial ``gamma``. After running the E-step on identical random
        documents, the resulting ``gamma`` values and sufficient statistics of
        the two implementations must be highly correlated (> 0.99).
        """
        W = 100
        K = 20
        D = 10
        N = 100

        # generate random vocabulary
        vocab = [
            ''.join(
                choice(ascii_letters) for _ in range(5 + random.randint(10)))
            for _ in range(W)
        ]
        # NOTE(review): the positional hyperparameters are passed through
        # as-is; their meaning is defined by ReferenceLDA, not visible here.
        model0 = ReferenceLDA(vocab, K, D, 0.1, 0.3, 1024., 0.9)

        # copy the reference model's parameters so both models are identical
        model1 = OnlineLDA(num_words=W, num_topics=K, num_documents=D)
        model1.alpha = model0._alpha
        model1.lambdas = model0._lambda

        # generate D random documents of length up to N; each document is a
        # list of (word index, count) pairs over distinct word indices
        docs1 = []
        for _ in range(D):
            docs1.append([
                (w, random.randint(10))
                for w in random.permutation(W)[:1 + random.randint(N)]
            ])

        # Transpose each document from a sequence of pairs into a pair of
        # sequences for the reference model. On Python 3 ``zip`` returns a
        # one-shot iterator, so materialize it; this matches what Python 2
        # produced and keeps the documents indexable and re-iterable.
        docs0 = [list(zip(*doc)) for doc in docs1]

        # use the same initialization of gamma (K x D here, transposed for
        # the reference model)
        initial_gamma = random.gamma(100., 1. / 100., [K, D])

        gamma0, sstats0 = model0.do_e_step(docs0,
                                           max_steps=50,
                                           gamma=initial_gamma.T)
        gamma1, sstats1 = model1.do_e_step(docs1,
                                           max_iter=50,
                                           latents=initial_gamma)

        # make sure e-Step gives the same results
        self.assertGreater(
            corrcoef(gamma0.T.ravel(), gamma1.ravel())[0, 1], 0.99)
        self.assertGreater(
            corrcoef(sstats0.ravel(), sstats1.ravel())[0, 1], 0.99)