Beispiel #1
0
    def test_LdaCgsSeq_SeedTypes(self):
        """Check that save/load keeps the types of the seed and RNG state.

        Test for issue #74: a small model is trained, saved to a temporary
        ``.npz`` file and loaded back; the type of ``seed`` and of each of
        the five ``_mtrand_state`` entries must match between the two models.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle right away avoids
        # leaking it and lets the file be removed afterwards on Windows.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)

            self.assertIs(type(m0.seed), type(m1.seed))
            print("seed types:", type(m0._mtrand_state[0]), type(m1._mtrand_state[0]))
            for i in range(5):
                self.assertIs(type(m0._mtrand_state[i]),
                              type(m1._mtrand_state[i]))
        finally:
            try:
                os.remove(tmp.name)
            except OSError:
                # Best-effort cleanup. OSError is used because WindowsError
                # is undefined on non-Windows platforms (and is a subclass or
                # alias of OSError where it does exist).
                pass
Beispiel #2
0
    def test_LdaCgsSeq_SeedTypes(self):
        """Check that save/load keeps the types of the seed and RNG state.

        Test for issue #74: a small model is trained, saved to a temporary
        ``.npz`` file and loaded back; the type of ``seed`` and of each of
        the five ``_mtrand_state`` entries must match between the two models.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle right away avoids
        # leaking it and lets the file be removed afterwards on Windows.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)

            self.assertIs(type(m0.seed), type(m1.seed))
            print("seed types:", type(m0._mtrand_state[0]),
                  type(m1._mtrand_state[0]))
            for i in range(5):
                self.assertIs(type(m0._mtrand_state[i]),
                              type(m1._mtrand_state[i]))
        finally:
            try:
                os.remove(tmp.name)
            except OSError:
                # Best-effort cleanup. OSError is used because WindowsError
                # is undefined on non-Windows platforms (and is a subclass or
                # alias of OSError where it does exist).
                pass
Beispiel #3
0
    def setUp(self):
        """Create the ``LdaCgsViewer`` fixture stored on ``self.ldav``.

        A random corpus is generated, an ``LdaCgsSeq`` model with ``K=10``
        is trained on it for 50 iterations, and both are handed to the
        viewer.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)

        model = LdaCgsSeq(corpus, 'document', K=10)
        model.train(n_iterations=50, verbose=0)

        self.ldav = LdaCgsViewer(corpus, model)
Beispiel #4
0
    def test_theta_and_phi(self):
        """Check that column-normalised ``word_top`` and ``top_doc`` sum to 1.

        Sums are compared after conversion to ``float16`` so that small
        rounding differences do not fail the equality checks.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)
        model = LdaCgsSeq(corpus, 'document', K=10)
        model.train(n_iterations=50, verbose=0)

        # phi: word_top with every column divided by its own sum.
        phi = model.word_top / model.word_top.sum(0)
        phi_column_sums = phi.sum(axis=0).astype('float16')
        assert (phi_column_sums == 1.0).all()
        assert phi.sum(dtype='float16') == 10.0

        # theta: top_doc with every column divided by its own sum.
        theta = model.top_doc / model.top_doc.sum(0)
        theta_column_sums = theta.sum(axis=0).astype('float16')
        assert (theta_column_sums == 1.0).all()
        assert theta.sum().astype('float16') == theta.shape[1]
Beispiel #5
0
    def test_training_changes_something(self):
        """Check that one extra training iteration alters the model arrays.

        Two models share the same corpus and seed; one is trained for a
        single iteration and the other for two. None of ``Z``, ``word_top``,
        ``inv_top_sums`` or ``top_doc`` may be element-wise identical.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)

        one_step = LdaCgsSeq(corpus, 'document', K=10)
        two_steps = LdaCgsSeq(corpus, 'document', K=10, seed=one_step.seed)

        one_step.train(n_iterations=1, verbose=0)
        two_steps.train(n_iterations=2, verbose=0)

        for attr in ('Z', 'word_top', 'inv_top_sums', 'top_doc'):
            unchanged = (getattr(one_step, attr) == getattr(two_steps, attr))
            assert not unchanged.all()
Beispiel #6
0
    def test_LdaCgsSeq_IO(self):
        """Check that a trained model survives a save/load round trip.

        A model is trained, saved to a temporary ``.npz`` file and loaded
        back. Scalar settings, the corpus, the arrays ``Z``, ``top_doc``,
        ``word_top`` and ``inv_top_sums``, the seed and the RNG state must
        all be equal between the original and the loaded model.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle right away avoids
        # leaking it and lets the file be removed afterwards on Windows.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(m0.context_type == m1.context_type)
            self.assertTrue(m0.K == m1.K)
            self.assertTrue((m0.alpha == m1.alpha).all())
            self.assertTrue((m0.beta == m1.beta).all())
            self.assertTrue(m0.log_probs == m1.log_probs)
            # Compare contents element-wise: `a.all() == b.all()` would only
            # compare two booleans and pass for almost any pair of arrays.
            for i in range(max(len(m0.corpus), len(m1.corpus))):
                self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
            self.assertTrue(m0.V == m1.V)
            self.assertTrue(m0.iteration == m1.iteration)
            for i in range(max(len(m0.Z), len(m1.Z))):
                self.assertTrue((m0.Z[i] == m1.Z[i]).all())
            self.assertTrue((m0.top_doc == m1.top_doc).all())
            self.assertTrue((m0.word_top == m1.word_top).all())
            self.assertTrue((m0.inv_top_sums == m1.inv_top_sums).all())

            self.assertTrue(m0.seed == m1.seed)
            self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
            self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
            for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
                self.assertTrue(s1 == s2)

            # Second round trip: the loaded model must not expose an
            # attribute named 'log_prob'.
            # NOTE(review): the checks above use 'log_probs' (plural) --
            # confirm that the singular name here is intended.
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(not hasattr(m1, 'log_prob'))
        finally:
            try:
                os.remove(tmp.name)
            except OSError:
                # Best-effort cleanup. OSError is used because WindowsError
                # is undefined on non-Windows platforms (and is a subclass or
                # alias of OSError where it does exist).
                pass
Beispiel #7
0
    def test_randomSeed(self):
        """Check that two models built with the same seed train identically.

        The first model picks its own seed; the second is constructed with
        that seed. After 50 iterations each, the seeds must be unchanged and
        the RNG states, settings, corpus and model arrays must be equal.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000,
                          50,
                          0,
                          20,
                          context_type='document',
                          metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        assert m0.seed is not None
        orig_seed = m0.seed

        m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
        assert m0.seed == m1.seed

        m0.train(n_iterations=50, verbose=0)
        m1.train(n_iterations=50, verbose=0)
        # Training must not overwrite the stored seed.
        assert m0.seed == orig_seed
        assert m1.seed == orig_seed

        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        assert m0._mtrand_state[0] == 'MT19937'
        assert m1._mtrand_state[0] == 'MT19937'
        assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
        assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)
        # Compare contents element-wise: `a.all() == b.all()` would only
        # compare two booleans and pass for almost any pair of arrays.
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        for i in range(max(len(m0.Z), len(m1.Z))):
            self.assertTrue((m0.Z[i] == m1.Z[i]).all())
        self.assertTrue((m0.top_doc == m1.top_doc).all())
        self.assertTrue((m0.word_top == m1.word_top).all())
        self.assertTrue((m0.inv_top_sums == m1.inv_top_sums).all())
Beispiel #8
0
    def test_LdaCgsSeq_IO(self):
        """Check that a trained model survives a save/load round trip.

        A model is trained, saved to a temporary ``.npz`` file and loaded
        back. Scalar settings, the corpus, the arrays ``Z``, ``top_doc``,
        ``word_top`` and ``inv_top_sums``, the seed and the RNG state must
        all be equal between the original and the loaded model.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle right away avoids
        # leaking it and lets the file be removed afterwards on Windows.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(m0.context_type == m1.context_type)
            self.assertTrue(m0.K == m1.K)
            self.assertTrue((m0.alpha == m1.alpha).all())
            self.assertTrue((m0.beta == m1.beta).all())
            self.assertTrue(m0.log_probs == m1.log_probs)
            # Compare contents element-wise: `a.all() == b.all()` would only
            # compare two booleans and pass for almost any pair of arrays.
            for i in range(max(len(m0.corpus), len(m1.corpus))):
                self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
            self.assertTrue(m0.V == m1.V)
            self.assertTrue(m0.iteration == m1.iteration)
            for i in range(max(len(m0.Z), len(m1.Z))):
                self.assertTrue((m0.Z[i] == m1.Z[i]).all())
            self.assertTrue((m0.top_doc == m1.top_doc).all())
            self.assertTrue((m0.word_top == m1.word_top).all())
            self.assertTrue((m0.inv_top_sums == m1.inv_top_sums).all())

            self.assertTrue(m0.seed == m1.seed)
            self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
            self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
            for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
                self.assertTrue(s1 == s2)

            # Second round trip: the loaded model must not expose an
            # attribute named 'log_prob'.
            # NOTE(review): the checks above use 'log_probs' (plural) --
            # confirm that the singular name here is intended.
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(not hasattr(m1, 'log_prob'))
        finally:
            try:
                os.remove(tmp.name)
            except OSError:
                # Best-effort cleanup. OSError is used because WindowsError
                # is undefined on non-Windows platforms (and is a subclass or
                # alias of OSError where it does exist).
                pass
Beispiel #9
0
    def test_theta_and_phi(self):
        """Verify the normalised topic matrices have unit column sums.

        ``word_top`` and ``top_doc`` are each divided by their column sums;
        the resulting sums are cast to ``float16`` before being compared so
        that small rounding differences are ignored.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)
        trained = LdaCgsSeq(corpus, 'document', K=10)
        trained.train(n_iterations=50, verbose=0)

        word_top = trained.word_top
        phi = word_top / word_top.sum(0)
        per_column_phi = phi.sum(axis=0).astype('float16')
        assert (per_column_phi == 1.0).all()
        assert phi.sum(dtype='float16') == 10.0

        top_doc = trained.top_doc
        theta = top_doc / top_doc.sum(0)
        per_column_theta = theta.sum(axis=0).astype('float16')
        assert (per_column_theta == 1.0).all()
        assert theta.sum().astype('float16') == theta.shape[1]
Beispiel #10
0
    def test_training_changes_something(self):
        """Verify that an additional training iteration changes the model.

        Both models use the same corpus and seed but are trained for one
        and two iterations respectively; ``Z``, ``word_top``,
        ``inv_top_sums`` and ``top_doc`` must each differ somewhere.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)

        short_run = LdaCgsSeq(corpus, 'document', K=10)
        long_run = LdaCgsSeq(corpus, 'document', K=10, seed=short_run.seed)

        short_run.train(n_iterations=1, verbose=0)
        long_run.train(n_iterations=2, verbose=0)

        for name in ('Z', 'word_top', 'inv_top_sums', 'top_doc'):
            same = getattr(short_run, name) == getattr(long_run, name)
            assert not same.all()
Beispiel #11
0
    def test_randomSeed(self):
        """Check that two models built with the same seed train identically.

        The first model picks its own seed; the second is constructed with
        that seed. After 50 iterations each, the seeds must be unchanged and
        the RNG states, settings, corpus and model arrays must be equal.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                            metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        assert m0.seed is not None
        orig_seed = m0.seed

        m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
        assert m0.seed == m1.seed

        m0.train(n_iterations=50, verbose=0)
        m1.train(n_iterations=50, verbose=0)
        # Training must not overwrite the stored seed.
        assert m0.seed == orig_seed
        assert m1.seed == orig_seed

        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        assert m0._mtrand_state[0] == 'MT19937'
        assert m1._mtrand_state[0] == 'MT19937'
        assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
        assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)
        # `range` replaces the Python-2-only `xrange`. Contents are compared
        # element-wise: `a.all() == b.all()` would only compare two booleans
        # and pass for almost any pair of arrays.
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        for i in range(max(len(m0.Z), len(m1.Z))):
            self.assertTrue((m0.Z[i] == m1.Z[i]).all())
        self.assertTrue((m0.top_doc == m1.top_doc).all())
        self.assertTrue((m0.word_top == m1.word_top).all())
        self.assertTrue((m0.inv_top_sums == m1.inv_top_sums).all())