Example #1
0
    def test_speed(self):
        """Check that OnlineLDA's E-step is faster than ReferenceLDA's.

        Both models are built with the same vocabulary size, number of topics,
        number of documents, alpha and eta.  They are then given the same
        random documents and the same initial gamma, and the wall-clock time
        of one ``do_e_step`` call on each model is compared.
        """
        model1 = OnlineLDA(num_words=1000,
                           num_topics=100,
                           num_documents=10000,
                           alpha=.1,
                           eta=.3)

        # random vocabulary: one random string of letters per word id
        vocab = [
            ''.join(
                choice(ascii_letters) for _ in range(5 + random.randint(10)))
            for _ in range(model1.num_words)
        ]

        # the reference model mirrors model1's configuration; model1.alpha is
        # indexed as a 2-D array here, so a single entry is passed on
        model0 = ReferenceLDA(vocab,
                              D=model1.num_documents,
                              K=model1.num_topics,
                              alpha=model1.alpha[0, 0],
                              eta=model1.eta,
                              kappa=.9,
                              tau0=1024)

        # generate D random documents of length up to N
        D = 110
        N = 600
        docs1 = []
        for _ in range(D):
            # distinct word ids: a prefix of a random permutation
            wordids = random.permutation(model1.num_words)[:1 +
                                                           random.randint(N)]
            docs1.append([(w, random.randint(10)) for w in wordids])

        # The reference model receives each document as a pair of sequences
        # (word ids, word counts) instead of a list of (id, count) tuples.
        # zip() returns a one-shot iterator on Python 3, so materialise it to
        # keep every document indexable and re-iterable, as on Python 2.
        docs0 = [list(zip(*doc)) for doc in docs1]

        # shared starting point; the reference model gets the transpose
        initial_gamma = random.gamma(100., 1. / 100., [model1.num_topics, D])

        start = time()
        gamma0, _ = model0.do_e_step(docs0,
                                     max_steps=100,
                                     gamma=initial_gamma.T)
        time0 = time() - start

        start = time()
        gamma1, _ = model1.do_e_step(docs1,
                                     max_iter=100,
                                     latents=initial_gamma)
        time1 = time() - start

        # make sure that C++ implementation is actually faster than Python version
        self.assertLess(
            time1,
            time0,
            msg=
            'Inference step took longer ({0:.2f} s) than reference implementation ({1:.2f})'
            .format(time1, time0))
Example #2
0
    def test_vi(self):
        """Check that OnlineLDA's E-step agrees with ReferenceLDA's.

        The reference model's ``_alpha`` and ``_lambda`` are copied into the
        OnlineLDA model.  Both models then run ``do_e_step`` on the same
        random documents from the same initial gamma.  The test passes when
        both returned values (gamma and ``sstats``) correlate above 0.99
        between the two implementations.
        """
        W = 100  # vocabulary size
        K = 20  # number of topics
        D = 10  # number of documents
        N = 100  # upper bound on distinct words per document

        # generate random vocabulary
        vocab = [
            ''.join(
                choice(ascii_letters) for _ in range(5 + random.randint(10)))
            for _ in range(W)
        ]
        # NOTE(review): positional arguments are presumably
        # (vocab, K, D, alpha, eta, tau0, kappa) -- confirm against ReferenceLDA
        model0 = ReferenceLDA(vocab, K, D, 0.1, 0.3, 1024., 0.9)

        # share the reference model's parameters so both start identically
        model1 = OnlineLDA(num_words=W, num_topics=K, num_documents=D)
        model1.alpha = model0._alpha
        model1.lambdas = model0._lambda

        # generate D random documents of length up to N
        docs1 = []
        for _ in range(D):
            docs1.append([
                (w, random.randint(10))
                for w in random.permutation(W)[:1 + random.randint(N)]
            ])

        # The reference model receives each document as a pair of sequences
        # (word ids, word counts) instead of a list of (id, count) tuples.
        # zip() returns a one-shot iterator on Python 3, so materialise it to
        # keep every document indexable and re-iterable, as on Python 2.
        docs0 = [list(zip(*doc)) for doc in docs1]

        # use the same initialization of gamma
        initial_gamma = random.gamma(100., 1. / 100., [K, D])

        # the reference model takes gamma transposed relative to OnlineLDA
        gamma0, sstats0 = model0.do_e_step(docs0,
                                           max_steps=50,
                                           gamma=initial_gamma.T)
        gamma1, sstats1 = model1.do_e_step(docs1,
                                           max_iter=50,
                                           latents=initial_gamma)

        # make sure e-Step gives the same results
        self.assertGreater(
            corrcoef(gamma0.T.ravel(), gamma1.ravel())[0, 1], 0.99)
        self.assertGreater(
            corrcoef(sstats0.ravel(), sstats1.ravel())[0, 1], 0.99)