Ejemplo n.º 1
0
    def test_LdaCgsSeq_SeedTypes(self):
        """ Test for issue #74 issues. """

        from tempfile import NamedTemporaryFile
        import os
    
        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)

            self.assertTrue(type(m0.seed) == type(m1.seed))
            print("seed types:", type(m0._mtrand_state[0]), type(m1._mtrand_state[0]))
            self.assertTrue(type(m0._mtrand_state[0]) == type(m1._mtrand_state[0]))
            self.assertTrue(type(m0._mtrand_state[1]) == type(m1._mtrand_state[1]))
            self.assertTrue(type(m0._mtrand_state[2]) == type(m1._mtrand_state[2]))
            self.assertTrue(type(m0._mtrand_state[3]) == type(m1._mtrand_state[3]))
            self.assertTrue(type(m0._mtrand_state[4]) == type(m1._mtrand_state[4]))
        finally:
            try:
                os.remove(tmp.name)
            except WindowsError:
                pass
Ejemplo n.º 2
0
    def setUp(self):

        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                            metadata=True)

        m = LdaCgsSeq(c, 'document', K=10)
        m.train(n_iterations=50, verbose=0)

        self.ldav = LdaCgsViewer(c, m)
Ejemplo n.º 3
0
    def test_theta_and_phi(self):
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                            metadata=True)
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=50, verbose=0)
        
        phi = m0.word_top / m0.word_top.sum(0)
        assert (phi.sum(axis=0).astype('float16') == 1.0).all()
        assert phi.sum(dtype='float16') == 10.0

        theta = m0.top_doc / m0.top_doc.sum(0)
        assert (theta.sum(axis=0).astype('float16') == 1.0).all()
        assert theta.sum().astype('float16') == theta.shape[1]
Ejemplo n.º 4
0
    def test_continueTraining(self):
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000,
                          50,
                          0,
                          20,
                          context_type='document',
                          metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        assert m0.seed is not None
        orig_seed = m0.seed

        m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
        assert m0.seed == m1.seed

        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=5, verbose=0)
        assert m0.seed == orig_seed
        assert m1.seed == orig_seed
        assert (m0._mtrand_state[1] != m1._mtrand_state[1]).any()
        assert m0._mtrand_state[2:] != m1._mtrand_state[2:]

        m0.train(n_iterations=3, verbose=0)

        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        assert m0._mtrand_state[0] == 'MT19937'
        assert m1._mtrand_state[0] == 'MT19937'
        assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
        assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            self.assertTrue(m0.corpus[i].all() == m1.corpus[i].all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        for i in range(max(len(m0.Z), len(m1.Z))):
            self.assertTrue(m0.Z[i].all() == m1.Z[i].all())
        self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
        self.assertTrue(m0.word_top.all() == m1.word_top.all())
        self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())
Ejemplo n.º 5
0
    def test_LdaCgsQuerySampler_init(self):

        old_corp = Corpus([], remove_empty=False)
        old_corp.corpus = np.array([ 0, 1, 1, 0, 0, 1 ], dtype='i')
        old_corp.context_data = [ np.array([(3, ), (3, )], dtype=[('idx', 'i')]) ]
        old_corp.context_types = [ 'document' ]
        old_corp.words = np.array([ '0', '1' ], dtype='i')
        old_corp.words_int = { '0': 0, '1': 1 }

        new_corp = Corpus([], remove_empty=False)
        new_corp.corpus = np.array([ 0, 0 ], dtype='i')
        new_corp.context_data = [ np.array([(2, )], dtype=[('idx', 'i')]) ]
        new_corp.context_types = [ 'document' ]
        new_corp.words = np.array([ '0', '1' ], dtype='i')
        new_corp.words_int = { '0': 0, '1': 1 }

        m = LdaCgsSeq(corpus=old_corp, context_type='document', K=2, V=2)
        m.Z[:] = np.array([0, 0, 0, 1, 1, 1], dtype='i')
        m.word_top[:] = np.array([[ 1.01, 2.01 ],
                                  [ 2.01, 1.01 ]], dtype='d')
        m.top_doc[:] = np.array([[ 3.01, 0.01 ], 
                                 [ 0.01, 3.01 ]], dtype='d')
        m.inv_top_sums[:] = 1. / m.word_top.sum(0)

        q = LdaCgsQuerySampler(m, new_corpus=new_corp, old_corpus=old_corp)
        self.assertTrue(q.V==2)
        self.assertTrue(q.K==2)
        self.assertTrue(len(q.corpus)==2)
        self.assertTrue((q.corpus==new_corp.corpus).all())
        self.assertTrue(len(q.indices)==1)
        self.assertTrue((q.indices==
                         new_corp.view_metadata('document')['idx']).all())
        self.assertTrue(q.word_top.shape==(2, 2))
        self.assertTrue((q.word_top==m.word_top).all())
        self.assertTrue(q.top_doc.shape==(2, 1))
        self.assertTrue((q.top_doc==np.array([[ 0.01 ],
                                              [ 0.01 ]],
                                              dtype=q.top_doc.dtype)).all())
        self.assertTrue(q.inv_top_sums.shape==(2, ))
        self.assertTrue((q.inv_top_sums==m.inv_top_sums).all())
        self.assertTrue(q.alpha.shape==(2, 1))
        self.assertTrue((q.alpha==m.alpha).all())
        self.assertTrue(q.beta.shape==(2, 1))
        self.assertTrue((q.beta==m.beta).all())
Ejemplo n.º 6
0
    def test_LdaCgsSeq_IO(self):

        from tempfile import NamedTemporaryFile
        import os
    
        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(m0.context_type == m1.context_type)
            self.assertTrue(m0.K == m1.K)
            self.assertTrue((m0.alpha == m1.alpha).all())
            self.assertTrue((m0.beta == m1.beta).all())
            self.assertTrue(m0.log_probs == m1.log_probs)
            for i in range(max(len(m0.corpus), len(m1.corpus))):
                self.assertTrue(m0.corpus[i].all() == m1.corpus[i].all())
            self.assertTrue(m0.V == m1.V)
            self.assertTrue(m0.iteration == m1.iteration)
            for i in range(max(len(m0.Z), len(m1.Z))):
                self.assertTrue(m0.Z[i].all() == m1.Z[i].all())
            self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
            self.assertTrue(m0.word_top.all() == m1.word_top.all())
            self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())

            self.assertTrue(m0.seed == m1.seed)
            self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
            self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
            for s1,s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
                self.assertTrue(s1 == s2)
            

            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(not hasattr(m1, 'log_prob'))
        finally:
            try:
                os.remove(tmp.name)
            except WindowsError:
                pass
Ejemplo n.º 7
0
    def test_theta_and_phi(self):
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000,
                          50,
                          0,
                          20,
                          context_type='document',
                          metadata=True)
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=50, verbose=0)

        phi = m0.word_top / m0.word_top.sum(0)
        assert (phi.sum(axis=0).astype('float16') == 1.0).all()
        assert phi.sum(dtype='float16') == 10.0

        theta = m0.top_doc / m0.top_doc.sum(0)
        assert (theta.sum(axis=0).astype('float16') == 1.0).all()
        assert theta.sum().astype('float16') == theta.shape[1]
Ejemplo n.º 8
0
    def test_LdaCgsSeq_SeedTypes(self):
        """ Test for issue #74 issues. """

        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)

            self.assertTrue(type(m0.seed) == type(m1.seed))
            print("seed types:", type(m0._mtrand_state[0]),
                  type(m1._mtrand_state[0]))
            self.assertTrue(
                type(m0._mtrand_state[0]) == type(m1._mtrand_state[0]))
            self.assertTrue(
                type(m0._mtrand_state[1]) == type(m1._mtrand_state[1]))
            self.assertTrue(
                type(m0._mtrand_state[2]) == type(m1._mtrand_state[2]))
            self.assertTrue(
                type(m0._mtrand_state[3]) == type(m1._mtrand_state[3]))
            self.assertTrue(
                type(m0._mtrand_state[4]) == type(m1._mtrand_state[4]))
        finally:
            try:
                os.remove(tmp.name)
            except WindowsError:
                pass
Ejemplo n.º 9
0
    def test_IndexedSymmArray(self):

        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq
        from vsm.viewer.ldacgsviewer import LdaCgsViewer

        c = random_corpus(50000, 1000, 0, 50)
        m = LdaCgsSeq(c, 'document', K=20)
        viewer = LdaCgsViewer(c, m)

        li = [0, 1, 10]
        isa = viewer.dismat_top(li)

        self.assertEqual(isa.shape[0], len(li))
Ejemplo n.º 10
0
    def test_continueTraining(self):
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                            metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        assert m0.seed is not None
        orig_seed = m0.seed

        m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
        assert m0.seed == m1.seed

        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=5, verbose=0)
        assert m0.seed == orig_seed
        assert m1.seed == orig_seed
        assert (m0._mtrand_state[1] != m1._mtrand_state[1]).any()
        assert m0._mtrand_state[2:] != m1._mtrand_state[2:]

        m0.train(n_iterations=3, verbose=0)
        
        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        assert m0._mtrand_state[0] == 'MT19937'
        assert m1._mtrand_state[0] == 'MT19937'
        assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
        assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)
        for i in xrange(max(len(m0.corpus), len(m1.corpus))):
            self.assertTrue(m0.corpus[i].all() == m1.corpus[i].all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        for i in xrange(max(len(m0.Z), len(m1.Z))):
            self.assertTrue(m0.Z[i].all() == m1.Z[i].all())
        self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
        self.assertTrue(m0.word_top.all() == m1.word_top.all())
        self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())
Ejemplo n.º 11
0
    def test_training_changes_something(self):
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000,
                          50,
                          0,
                          20,
                          context_type='document',
                          metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        m1 = LdaCgsSeq(c, 'document', K=10, seed=m0.seed)

        m0.train(n_iterations=1, verbose=0)
        m1.train(n_iterations=2, verbose=0)

        assert not (m0.Z == m1.Z).all()
        assert not (m0.word_top == m1.word_top).all()
        assert not (m0.inv_top_sums == m1.inv_top_sums).all()
        assert not (m0.top_doc == m1.top_doc).all()
Ejemplo n.º 12
0
    def test_training_changes_something(self):
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                            metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        m1 = LdaCgsSeq(c, 'document', K=10, seed=m0.seed)

        m0.train(n_iterations=1, verbose=0)
        m1.train(n_iterations=2, verbose=0)

        assert not (m0.Z == m1.Z).all()
        assert not (m0.word_top == m1.word_top).all()
        assert not (m0.inv_top_sums == m1.inv_top_sums).all()
        assert not (m0.top_doc == m1.top_doc).all()
Ejemplo n.º 13
0
    def test_LdaCgsQuerySampler_init(self):

        old_corp = Corpus([], remove_empty=False)
        old_corp.corpus = np.array([0, 1, 1, 0, 0, 1], dtype='i')
        old_corp.context_data = [
            np.array([(3, ), (3, )], dtype=[('idx', 'i')])
        ]
        old_corp.context_types = ['document']
        old_corp.words = np.array(['0', '1'], dtype='i')
        old_corp.words_int = {'0': 0, '1': 1}

        new_corp = Corpus([], remove_empty=False)
        new_corp.corpus = np.array([0, 0], dtype='i')
        new_corp.context_data = [np.array([(2, )], dtype=[('idx', 'i')])]
        new_corp.context_types = ['document']
        new_corp.words = np.array(['0', '1'], dtype='i')
        new_corp.words_int = {'0': 0, '1': 1}

        m = LdaCgsSeq(corpus=old_corp, context_type='document', K=2, V=2)
        m.Z[:] = np.array([0, 0, 0, 1, 1, 1], dtype='i')
        m.word_top[:] = np.array([[1.01, 2.01], [2.01, 1.01]], dtype='d')
        m.top_doc[:] = np.array([[3.01, 0.01], [0.01, 3.01]], dtype='d')
        m.inv_top_sums[:] = 1. / m.word_top.sum(0)

        q = LdaCgsQuerySampler(m, new_corpus=new_corp, old_corpus=old_corp)
        self.assertTrue(q.V == 2)
        self.assertTrue(q.K == 2)
        self.assertTrue(len(q.corpus) == 2)
        self.assertTrue((q.corpus == new_corp.corpus).all())
        self.assertTrue(len(q.indices) == 1)
        self.assertTrue(
            (q.indices == new_corp.view_metadata('document')['idx']).all())
        self.assertTrue(q.word_top.shape == (2, 2))
        self.assertTrue((q.word_top == m.word_top).all())
        self.assertTrue(q.top_doc.shape == (2, 1))
        self.assertTrue((q.top_doc == np.array([[0.01], [0.01]],
                                               dtype=q.top_doc.dtype)).all())
        self.assertTrue(q.inv_top_sums.shape == (2, ))
        self.assertTrue((q.inv_top_sums == m.inv_top_sums).all())
        self.assertTrue(q.alpha.shape == (2, 1))
        self.assertTrue((q.alpha == m.alpha).all())
        self.assertTrue(q.beta.shape == (2, 1))
        self.assertTrue((q.beta == m.beta).all())
Ejemplo n.º 14
0
    def test_LdaCgsSeq_IO(self):

        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(m0.context_type == m1.context_type)
            self.assertTrue(m0.K == m1.K)
            self.assertTrue((m0.alpha == m1.alpha).all())
            self.assertTrue((m0.beta == m1.beta).all())
            self.assertTrue(m0.log_probs == m1.log_probs)
            for i in range(max(len(m0.corpus), len(m1.corpus))):
                self.assertTrue(m0.corpus[i].all() == m1.corpus[i].all())
            self.assertTrue(m0.V == m1.V)
            self.assertTrue(m0.iteration == m1.iteration)
            for i in range(max(len(m0.Z), len(m1.Z))):
                self.assertTrue(m0.Z[i].all() == m1.Z[i].all())
            self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
            self.assertTrue(m0.word_top.all() == m1.word_top.all())
            self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())

            self.assertTrue(m0.seed == m1.seed)
            self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
            self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
            for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
                self.assertTrue(s1 == s2)

            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(not hasattr(m1, 'log_prob'))
        finally:
            try:
                os.remove(tmp.name)
            except WindowsError:
                pass