Example #1
0
    def setUp(self):
        """Train the BEAGLE models on a random corpus and build their viewers.

        An environment, a context, an order and a composite model are
        trained in that order; each one is then wrapped in a
        ``BeagleViewer`` stored on the test case (``venv``, ``vctx``,
        ``vord``, ``vcom``).
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beagleenvironment import BeagleEnvironment
        from vsm.model.beaglecontext import BeagleContextSeq
        from vsm.model.beagleorder import BeagleOrderSeq
        from vsm.model.beaglecomposite import BeagleComposite

        env_corpus = random_corpus(1000, 50, 0, 20, context_type='sentence')
        # Stoplist holds the labels '0', '7', ..., '49'.
        stop_labels = list(map(str, range(0, 50, 7)))
        ctx_corpus = env_corpus.apply_stoplist(stoplist=stop_labels)

        env_model = BeagleEnvironment(env_corpus, n_cols=5)
        env_model.train()
        env_matrix = env_model.matrix

        ctx_model = BeagleContextSeq(ctx_corpus, env_corpus, env_matrix)
        ctx_model.train()

        ord_model = BeagleOrderSeq(env_corpus, env_matrix)
        ord_model.train()

        composite = BeagleComposite(
            ctx_corpus, ctx_model.matrix, env_corpus, ord_model.matrix)
        composite.train()

        self.venv = BeagleViewer(env_corpus, env_model)
        self.vctx = BeagleViewer(ctx_corpus, ctx_model)
        self.vord = BeagleViewer(env_corpus, ord_model)
        self.vcom = BeagleViewer(ctx_corpus, composite)
Example #2
0
    def setUp(self):
        """Prepare one ``BeagleViewer`` per BEAGLE model kind.

        Builds a random sentence corpus and a stop-listed copy of it,
        trains environment, context, order and composite models, and
        stores a viewer for each on ``self``.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beagleenvironment import BeagleEnvironment
        from vsm.model.beaglecontext import BeagleContextSeq
        from vsm.model.beagleorder import BeagleOrderSeq
        from vsm.model.beaglecomposite import BeagleComposite

        full_corpus = random_corpus(1000, 50, 0, 20, context_type='sentence')
        # Every seventh label between '0' and '49' is stop-listed.
        stopped_corpus = full_corpus.apply_stoplist(
            stoplist=list(map(str, range(0, 50, 7))))

        environment = BeagleEnvironment(full_corpus, n_cols=5)
        environment.train()

        context = BeagleContextSeq(
            stopped_corpus, full_corpus, environment.matrix)
        context.train()

        order = BeagleOrderSeq(full_corpus, environment.matrix)
        order.train()

        combined = BeagleComposite(
            stopped_corpus, context.matrix, full_corpus, order.matrix)
        combined.train()

        self.venv = BeagleViewer(full_corpus, environment)
        self.vctx = BeagleViewer(stopped_corpus, context)
        self.vord = BeagleViewer(full_corpus, order)
        self.vcom = BeagleViewer(stopped_corpus, combined)
Example #3
0
 def test_LdaCgsMulti_eq_LdaCgsSeq_multi(self):
     """Train LdaCgsMulti (one process) and LdaCgsSeq in lockstep and compare.

     Both models are built from the same seeded random corpus with the
     same seed. After every two-iteration training step their settings,
     corpora, topic assignments, count matrices, RNG states and log
     probabilities are compared.
     """
     c = random_corpus(1000, 50, 6, 100, seed=2)
     m0 = LdaCgsMulti(c, 'document', K=10, n_proc=1, seeds=[2])
     m1 = LdaCgsSeq(c, 'document', K=10, seed=2)
     for iteration in range(20):
         m0.train(n_iterations=2, verbose=0)
         m1.train(n_iterations=2, verbose=0)

         assert m0.context_type == m1.context_type
         assert m0.K == m1.K
         assert (m0.alpha == m1.alpha).all()
         assert (m0.beta == m1.beta).all()

         # Compare the corpora value by value; comparing the two
         # ``.all()`` results would only compare two booleans.
         assert len(m0.corpus) == len(m1.corpus)
         for i in range(len(m0.corpus)):
             assert (m0.corpus[i] == m1.corpus[i]).all()
         assert m0.V == m1.V
         assert m0.iteration == m1.iteration

         # Check every topic assignment, not just a single index.
         assert len(m0.Z) == len(m1.Z)
         for i in range(len(m0.Z)):
             assert (m0.Z[i] == m1.Z[i]).all()
         assert (m0.top_doc == m1.top_doc).all()
         assert (m0.word_top == m1.word_top).all()

         # The single worker of the multi model must carry the same RNG
         # state tuple as the sequential model.
         assert m0.seeds[0] == m1.seed
         assert m0._mtrand_states[0][0] == m1._mtrand_state[0]
         for s0, s1 in zip(m0._mtrand_states[0][1], m1._mtrand_state[1]):
             assert s0 == s1
         assert m0._mtrand_states[0][2] == m1._mtrand_state[2]
         assert m0._mtrand_states[0][3] == m1._mtrand_state[3]
         assert m0._mtrand_states[0][4] == m1._mtrand_state[4]
         print((iteration, m0.log_probs[-1], m1.log_probs[-1]))
         for i in range(iteration):
             assert np.isclose(m0.log_probs[i][1], m1.log_probs[i][1])
         assert (np.isclose(m0.inv_top_sums, m1.inv_top_sums)).all()
Example #4
0
    def test_LdaCgsSeq_SeedTypes(self):
        """Test for issue #74: seed and RNG-state types survive save/load.

        A trained LdaCgsSeq is saved to a temporary ``.npz`` file and
        loaded back; the type of the seed and of each of the five entries
        of ``_mtrand_state`` must match between the two models.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle now lets the file be
        # rewritten and removed on platforms that lock open files.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)

            self.assertIs(type(m0.seed), type(m1.seed))
            print("seed types:", type(m0._mtrand_state[0]),
                  type(m1._mtrand_state[0]))
            for i in range(5):
                self.assertIs(type(m0._mtrand_state[i]),
                              type(m1._mtrand_state[i]))
        finally:
            # Best-effort cleanup. OSError exists on every platform,
            # whereas WindowsError is only defined on Windows.
            try:
                os.remove(tmp.name)
            except OSError:
                pass
Example #5
0
 def test_LdaCgsMulti_eq_LdaCgsSeq_multi(self):
     """Train LdaCgsMulti (one process) and LdaCgsSeq in lockstep and compare.

     Both models are built from the same seeded random corpus with the
     same seed. After every two-iteration training step their settings,
     corpora, topic assignments, count matrices, RNG states and log
     probabilities are compared.
     """
     c = random_corpus(1000, 50, 6, 100, seed=2)
     m0 = LdaCgsMulti(c, 'document', K=10, n_proc=1, seeds=[2])
     m1 = LdaCgsSeq(c, 'document', K=10, seed=2)
     for iteration in range(20):
         m0.train(n_iterations=2, verbose=0)
         m1.train(n_iterations=2, verbose=0)

         assert m0.context_type == m1.context_type
         assert m0.K == m1.K
         assert (m0.alpha == m1.alpha).all()
         assert (m0.beta == m1.beta).all()

         # ``range`` works on Python 2 and 3. Compare the corpora value
         # by value; comparing the two ``.all()`` results would only
         # compare two booleans.
         assert len(m0.corpus) == len(m1.corpus)
         for i in range(len(m0.corpus)):
             assert (m0.corpus[i] == m1.corpus[i]).all()
         assert m0.V == m1.V
         assert m0.iteration == m1.iteration

         # Check every topic assignment, not just a single index.
         assert len(m0.Z) == len(m1.Z)
         for i in range(len(m0.Z)):
             assert (m0.Z[i] == m1.Z[i]).all()
         assert (m0.top_doc == m1.top_doc).all()
         assert (m0.word_top == m1.word_top).all()

         # The single worker of the multi model must carry the same RNG
         # state tuple as the sequential model.
         assert m0.seeds[0] == m1.seed
         assert m0._mtrand_states[0][0] == m1._mtrand_state[0]
         for s0, s1 in zip(m0._mtrand_states[0][1], m1._mtrand_state[1]):
             assert s0 == s1
         assert m0._mtrand_states[0][2] == m1._mtrand_state[2]
         assert m0._mtrand_states[0][3] == m1._mtrand_state[3]
         assert m0._mtrand_states[0][4] == m1._mtrand_state[4]
         # Parenthesised tuple form is valid as a Python 2 print
         # statement and as a Python 3 function call.
         print((iteration, m0.log_probs[-1], m1.log_probs[-1]))
         for i in range(iteration):
             assert np.isclose(m0.log_probs[i][1], m1.log_probs[i][1])
         assert (np.isclose(m0.inv_top_sums, m1.inv_top_sums)).all()
Example #6
0
    def test_apply_stoplist(self):
        """Exercise ``apply_stoplist`` on a random and a hand-written corpus.

        The random corpus is filtered with the extra stop words '0' and
        '1'; the hand-written one with ``nltk_stop=True`` and ``freq=1``.
        Both calls pass ``in_place=False``.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist

        c = random_corpus(1000, 50, 0, 20,
                          context_type='sentence', metadata=True)
        new_c = apply_stoplist(c, nltk_stop=False, add_stop=['0', '1'],
                               freq=0, in_place=False)

        sentences = [
            [],
            ['he', 'said'],
            ['he', 'said', 'bar'],
            ['bar', 'ate'],
            ['I', 'foo'],
        ]
        wc = corpus_fromlist(sentences, context_type='sentence')
        new_wc = apply_stoplist(wc, nltk_stop=True, freq=1, in_place=False)

        # The source corpus keeps the stop words, the filtered one does not.
        for label in ('0', '1'):
            self.assertTrue(label in c.words)
        for label in ('0', '1'):
            self.assertFalse(label in new_c.words)

        for kept in ('said', 'bar'):
            self.assertTrue(kept in new_wc.words)
        for dropped in ('he', 'foo', 'ate'):
            self.assertFalse(dropped in new_wc.words)
Example #7
0
    def test_LdaCgsSeq_SeedTypes(self):
        """Test for issue #74: seed and RNG-state types survive save/load.

        A trained LdaCgsSeq is saved to a temporary ``.npz`` file and
        loaded back; the type of the seed and of each of the five entries
        of ``_mtrand_state`` must match between the two models.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle now lets the file be
        # rewritten and removed on platforms that lock open files.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)

            self.assertIs(type(m0.seed), type(m1.seed))
            print("seed types:", type(m0._mtrand_state[0]), type(m1._mtrand_state[0]))
            for i in range(5):
                self.assertIs(type(m0._mtrand_state[i]),
                              type(m1._mtrand_state[i]))
        finally:
            # Best-effort cleanup. OSError exists on every platform,
            # whereas WindowsError is only defined on Windows.
            try:
                os.remove(tmp.name)
            except OSError:
                pass
Example #8
0
    def test_LdaCgsMulti_phi(self):
        """Column-normalised ``word_top`` of a trained LdaCgsMulti sums to one.

        Each column of ``phi`` must sum to 1 and the whole matrix to 10.0
        (the test uses K=10; presumably one column per topic).
        """
        import numpy as np

        c = random_corpus(1000, 50, 6, 100)

        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        phi = m0.word_top / m0.word_top.sum(0)
        # The normalisation is floating-point arithmetic, so compare with
        # a tolerance instead of exact equality.
        assert np.allclose(phi.sum(axis=0), 1.0)
        assert np.isclose(phi.sum(), 10.0)
Example #9
0
    def test_LdaCgsMulti_theta(self):
        """Column-normalised ``top_doc`` of a trained LdaCgsMulti sums to one.

        Each column of ``theta`` must sum to 1, so the whole matrix sums
        to its number of columns.
        """
        import numpy as np

        c = random_corpus(1000, 50, 6, 100)

        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        # Use the model trained in this test rather than ``self.model``,
        # which this method never sets up.
        theta = m0.top_doc / m0.top_doc.sum(0)
        # The normalisation is floating-point arithmetic, so compare with
        # a tolerance instead of exact equality.
        assert np.allclose(theta.sum(axis=0), 1.0)
        assert np.isclose(theta.sum(), theta.shape[1])
Example #10
0
    def test_LdaCgsMulti_phi(self):
        """Column-normalised ``word_top`` of a trained LdaCgsMulti sums to one.

        Each column of ``phi`` must sum to 1 and the whole matrix to 10.0
        (the test uses K=10; presumably one column per topic).
        """
        import numpy as np

        c = random_corpus(1000, 50, 6, 100)

        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        phi = m0.word_top / m0.word_top.sum(0)
        # The normalisation is floating-point arithmetic, so compare with
        # a tolerance instead of exact equality.
        assert np.allclose(phi.sum(axis=0), 1.0)
        assert np.isclose(phi.sum(), 10.0)
Example #11
0
    def test_LdaCgsMulti_theta(self):
        """Column-normalised ``top_doc`` of a trained LdaCgsMulti sums to one.

        Each column of ``theta`` must sum to 1, so the whole matrix sums
        to its number of columns.
        """
        import numpy as np

        c = random_corpus(1000, 50, 6, 100)

        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        # Use the model trained in this test rather than ``self.model``,
        # which this method never sets up.
        theta = m0.top_doc / m0.top_doc.sum(0)
        # The normalisation is floating-point arithmetic, so compare with
        # a tolerance instead of exact equality.
        assert np.allclose(theta.sum(axis=0), 1.0)
        assert np.isclose(theta.sum(), theta.shape[1])
Example #12
0
    def test_LdaCgsMulti_remove_Seq_props(self):
        """A fresh LdaCgsMulti has no usable ``seed`` or ``_mtrand_state``.

        Each attribute must be either missing or ``None``.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)
        model = LdaCgsMulti(corpus, 'document', K=10)

        for attr_name in ('seed', '_mtrand_state'):
            assert getattr(model, attr_name, None) is None
Example #13
0
    def test_LdaCgsMulti_remove_Seq_props(self):
        """Check that LdaCgsMulti does not carry ``seed`` / ``_mtrand_state``.

        Both attributes are looked up with a ``None`` default, so the
        test passes when they are absent or explicitly ``None``.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus

        docs = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)
        multi = LdaCgsMulti(docs, 'document', K=10)

        seed_value = getattr(multi, 'seed', None)
        assert seed_value is None
        state_value = getattr(multi, '_mtrand_state', None)
        assert state_value is None
Example #14
0
    def test_add_metadata(self):
        """``add_metadata`` stores a new per-sentence field that reads back intact."""
        from vsm.corpus.util.corpusbuilders import random_corpus

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='sentence', metadata=True)
        n_sentences = corpus.view_metadata('sentence').size
        # One label per sentence: 'm_0', 'm_1', ...
        labels = list(map('m_{0}'.format, range(n_sentences)))

        updated = add_metadata(corpus, 'sentence', 'new_meta', labels)
        stored = updated.view_metadata('sentence')['new_meta'].tolist()

        self.assertEqual(stored, labels)
Example #15
0
    def setUp(self):
        """Train a ``TfSeq`` model on a random corpus and wrap it in a ``TfViewer``."""
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.tf import TfSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)

        tf_model = TfSeq(corpus, 'document')
        tf_model.train()

        self.v = TfViewer(corpus, tf_model)
Example #16
0
    def setUp(self):
        """Train a 10-topic ``LdaCgsSeq`` for 50 iterations and build its viewer."""
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)

        lda = LdaCgsSeq(corpus, 'document', K=10)
        lda.train(n_iterations=50, verbose=0)

        self.ldav = LdaCgsViewer(corpus, lda)
Example #17
0
    def setUp(self):
        """Build the fixtures for the BEAGLE order-model tests.

        Stores a random sentence corpus (``self.c``), a trained
        ``BeagleEnvironment`` (``self.e``) and a trained ``BeagleOrderSeq``
        (``self.ms``) on the test case.
        """

        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beagleenvironment import BeagleEnvironment

        self.c = random_corpus(1000, 50, 0, 10, context_type='sentence')

        # The environment matrix is the input to the order model below.
        self.e = BeagleEnvironment(self.c, n_cols=100)
        self.e.train()

        # NOTE(review): BeagleOrderSeq is not imported here; presumably it
        # is imported at module level -- confirm.
        self.ms = BeagleOrderSeq(self.c, self.e.matrix)
        self.ms.train()
        '''
Example #18
0
    def setUp(self):
        """Build the fixtures for the BEAGLE order-model tests.

        Stores a random sentence corpus (``self.c``), a trained
        ``BeagleEnvironment`` (``self.e``) and a trained ``BeagleOrderSeq``
        (``self.ms``) on the test case.
        """

        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beagleenvironment import BeagleEnvironment

        self.c = random_corpus(1000, 50, 0, 10, context_type='sentence')

        # The environment matrix is the input to the order model below.
        self.e = BeagleEnvironment(self.c, n_cols=100)
        self.e.train()

        # NOTE(review): BeagleOrderSeq is not imported here; presumably it
        # is imported at module level -- confirm.
        self.ms = BeagleOrderSeq(self.c, self.e.matrix)
        self.ms.train()
        '''
Example #19
0
    def test_IndexedSymmArray(self):
        """``dismat_top`` returns one row per requested topic index.

        The model is constructed but not trained before the viewer is
        queried.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq
        from vsm.viewer.ldacgsviewer import LdaCgsViewer

        corpus = random_corpus(50000, 1000, 0, 50)
        model = LdaCgsSeq(corpus, 'document', K=20)
        lda_viewer = LdaCgsViewer(corpus, model)

        requested = [0, 1, 10]
        dist_matrix = lda_viewer.dismat_top(requested)

        n_rows = dist_matrix.shape[0]
        self.assertEqual(n_rows, len(requested))
Example #20
0
    def test_IndexedSymmArray(self):
        """``dismat_top`` returns one row per requested label.

        The labels are passed as the strings '0', '1' and '10'; the model
        is constructed but not trained before the viewer is queried.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq
        from vsm.viewer.ldacgsviewer import LdaCgsViewer

        corpus = random_corpus(50000, 1000, 0, 50)
        model = LdaCgsSeq(corpus, 'document', K=20)
        lda_viewer = LdaCgsViewer(corpus, model)

        requested = ['0', '1', '10']
        dist_matrix = lda_viewer.dismat_top(requested)

        n_rows = dist_matrix.shape[0]
        self.assertEqual(n_rows, len(requested))
Example #21
0
    def setUp(self):
        """Build the fixtures for the BEAGLE context-model tests.

        Stores a random sentence corpus (``self.ec``), a stop-listed copy
        (``self.cc``), a trained ``BeagleEnvironment`` (``self.e``) and a
        trained ``BeagleContextSeq`` (``self.ms``) on the test case.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beaglecontext import BeagleContextSeq, BeagleContextMulti
        from vsm.model.beagleenvironment import BeagleEnvironment

        self.ec = random_corpus(1000, 50, 0, 5, context_type='sentence')
        # ``range`` instead of ``xrange`` so this also runs on Python 3.
        # The stoplist holds the labels '0', '7', ..., '49'.
        self.cc = self.ec.apply_stoplist(
            stoplist=[str(i) for i in range(0, 50, 7)])

        self.e = BeagleEnvironment(self.ec, n_cols=5)
        self.e.train()

        self.ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix)
        self.ms.train()
        '''
Example #22
0
    def test_BaseModel_IO(self):
        """A ``BaseModel`` saved to ``.npz`` loads back with the same data.

        The context type and the matrix of the loaded model are compared
        with those of the original.
        """
        from tempfile import NamedTemporaryFile as NTF
        import os

        c = random_corpus(1000, 50, 6, 100)
        # Reserve a path and close the handle straight away, so that the
        # save/load below does not run against a file that is still open
        # (which fails on platforms that lock open files).
        with NTF(delete=False, suffix='.npz') as tmp:
            path = tmp.name
        try:
            m0 = BaseModel(c.corpus, 'context')
            m0.save(path)
            m1 = BaseModel.load(path)

            self.assertEqual(m0.context_type, m1.context_type)
            self.assertTrue((m0.matrix == m1.matrix).all())
        finally:
            # Remove the file even when an assertion above fails.
            os.remove(path)
Example #23
0
    def setUp(self):
        """Derive a trained ``TfIdf`` model from a ``TfSeq`` and build its viewer."""
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.tf import TfSeq
        from vsm.model.tfidf import TfIdf

        corpus = random_corpus(
            1000, 50, 0, 10, context_type='document', metadata=True)

        # The tf-idf model is created from an already trained tf model.
        term_freq = TfSeq(corpus, 'document')
        term_freq.train()

        tfidf = TfIdf.from_tf(term_freq)
        tfidf.train()

        self.v = TfIdfViewer(corpus, tfidf)
Example #24
0
    def setUp(self):
        """Build the fixtures for the BEAGLE context-model tests.

        Stores a random sentence corpus (``self.ec``), a stop-listed copy
        (``self.cc``), a trained ``BeagleEnvironment`` (``self.e``) and a
        trained ``BeagleContextSeq`` (``self.ms``) on the test case.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beaglecontext import BeagleContextSeq, BeagleContextMulti
        from vsm.model.beagleenvironment import BeagleEnvironment

        self.ec = random_corpus(1000, 50, 0, 5, context_type='sentence')
        # The stoplist holds the labels '0', '7', ..., '49'.
        self.cc = self.ec.apply_stoplist(
            stoplist=[str(i) for i in range(0, 50, 7)])

        # The environment matrix is the input to the context model below.
        self.e = BeagleEnvironment(self.ec, n_cols=5)
        self.e.train()

        self.ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix)
        self.ms.train()
        '''
Example #25
0
    def test_BaseModel_IO(self):
        """A ``BaseModel`` saved to ``.npz`` loads back with the same data.

        The context type and the matrix of the loaded model are compared
        with those of the original.
        """
        from tempfile import NamedTemporaryFile as NTF
        import os

        c = random_corpus(1000, 50, 6, 100)
        # Reserve a path and close the handle straight away, so that the
        # save/load below does not run against a file that is still open
        # (which fails on platforms that lock open files).
        with NTF(delete=False, suffix='.npz') as tmp:
            path = tmp.name
        try:
            m0 = BaseModel(c.corpus, 'context')
            m0.save(path)
            m1 = BaseModel.load(path)

            self.assertEqual(m0.context_type, m1.context_type)
            self.assertTrue((m0.matrix == m1.matrix).all())
        finally:
            # Remove the file even when an assertion above fails.
            os.remove(path)
Example #26
0
    def test_theta_and_phi(self):
        """Normalised ``word_top`` and ``top_doc`` of a trained model sum to one.

        ``phi`` and ``theta`` are the column-normalised count matrices.
        Every column must sum to 1, so ``phi`` sums to 10.0 (the test
        uses K=10) and ``theta`` to its number of columns.
        """
        import numpy as np
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                          metadata=True)
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=50, verbose=0)

        # Compare with an explicit relative tolerance (about the
        # resolution of float16) instead of casting to float16, whose
        # rounding and low-precision accumulation are fragile.
        tol = 1e-3

        phi = m0.word_top / m0.word_top.sum(0)
        assert np.allclose(phi.sum(axis=0), 1.0, rtol=tol, atol=0.0)
        assert np.isclose(phi.sum(), 10.0, rtol=tol, atol=0.0)

        theta = m0.top_doc / m0.top_doc.sum(0)
        assert np.allclose(theta.sum(axis=0), 1.0, rtol=tol, atol=0.0)
        assert np.isclose(theta.sum(), theta.shape[1], rtol=tol, atol=0.0)
Example #27
0
    def setUp(self):
        """Create the ``TfViewer`` under test from a trained ``TfSeq`` model."""
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.tf import TfSeq

        corpus_kwargs = {'context_type': 'document', 'metadata': True}
        corpus = random_corpus(1000, 50, 0, 20, **corpus_kwargs)

        counts = TfSeq(corpus, 'document')
        counts.train()

        self.v = TfViewer(corpus, counts)
Example #28
0
    def test_add_metadata(self):
        """A metadata column added with ``add_metadata`` reads back unchanged."""
        from vsm.corpus.util.corpusbuilders import random_corpus

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='sentence', metadata=True)
        count = corpus.view_metadata('sentence').size

        # One label per sentence: 'm_0', 'm_1', ...
        expected = []
        for position in range(count):
            expected.append('m_{0}'.format(position))

        extended = add_metadata(corpus, 'sentence', 'new_meta', expected)
        sentence_meta = extended.view_metadata('sentence')

        self.assertEqual(sentence_meta['new_meta'].tolist(), expected)
Example #29
0
    def test_LdaCgsMulti_continue_training(self):
        """Training 2 + 3 iterations must equal training 5 iterations at once.

        Two LdaCgsMulti models share the same seeds. After 2 versus 5
        iterations their RNG states must differ; once the first model has
        trained 3 more iterations, RNG states and all model state must
        agree.
        """
        import numpy as np
        from vsm.corpus.util.corpusbuilders import random_corpus

        c = random_corpus(1000,
                          50,
                          0,
                          20,
                          context_type='document',
                          metadata=True)

        m0 = LdaCgsMulti(c, 'document', K=10)
        assert m0.seeds is not None
        orig_seeds = m0.seeds

        m1 = LdaCgsMulti(c, 'document', K=10, seeds=orig_seeds)
        assert m0.seeds == m1.seeds

        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=5, verbose=0)
        assert m0.seeds == orig_seeds
        assert m1.seeds == orig_seeds
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            assert (s0[1] != s1[1]).any()
            assert s0[2:] != s1[2:]

        m0.train(n_iterations=3, verbose=0)
        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            assert s0[0] == 'MT19937'
            assert s1[0] == 'MT19937'
            assert (s0[1] == s1[1]).all()
            assert s0[2:] == s1[2:]

        assert m0.context_type == m1.context_type
        assert m0.K == m1.K
        assert (m0.alpha == m1.alpha).all()
        assert (m0.beta == m1.beta).all()
        assert m0.log_probs == m1.log_probs

        # Compare values element by element. ``a.all() == b.all()`` only
        # compares two booleans and passes for unrelated arrays.
        assert len(m0.corpus) == len(m1.corpus)
        for i in range(len(m0.corpus)):
            assert (m0.corpus[i] == m1.corpus[i]).all()
        assert m0.V == m1.V
        assert m0.iteration == m1.iteration
        assert len(m0.Z) == len(m1.Z)
        for i in range(len(m0.Z)):
            assert (m0.Z[i] == m1.Z[i]).all()
        assert (m0.top_doc == m1.top_doc).all()
        assert (m0.word_top == m1.word_top).all()
        # Derived floating-point values: compare with a tolerance.
        assert np.isclose(m0.inv_top_sums, m1.inv_top_sums).all()
Example #30
0
    def test_continueTraining(self):
        """Training 2 + 3 iterations must equal training 5 iterations at once.

        Two LdaCgsSeq models share the same seed. After 2 versus 5
        iterations their RNG states must differ; once the first model has
        trained 3 more iterations, RNG state and all model state must
        agree.
        """
        import numpy as np
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000,
                          50,
                          0,
                          20,
                          context_type='document',
                          metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        assert m0.seed is not None
        orig_seed = m0.seed

        m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
        assert m0.seed == m1.seed

        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=5, verbose=0)
        assert m0.seed == orig_seed
        assert m1.seed == orig_seed
        assert (m0._mtrand_state[1] != m1._mtrand_state[1]).any()
        assert m0._mtrand_state[2:] != m1._mtrand_state[2:]

        m0.train(n_iterations=3, verbose=0)

        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        assert m0._mtrand_state[0] == 'MT19937'
        assert m1._mtrand_state[0] == 'MT19937'
        assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
        assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)

        # Compare values element by element. ``a.all() == b.all()`` only
        # compares two booleans and passes for unrelated arrays.
        self.assertEqual(len(m0.corpus), len(m1.corpus))
        for i in range(len(m0.corpus)):
            self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        self.assertEqual(len(m0.Z), len(m1.Z))
        for i in range(len(m0.Z)):
            self.assertTrue((m0.Z[i] == m1.Z[i]).all())
        self.assertTrue((m0.top_doc == m1.top_doc).all())
        self.assertTrue((m0.word_top == m1.word_top).all())
        # Derived floating-point values: compare with a tolerance.
        self.assertTrue(
            np.isclose(m0.inv_top_sums, m1.inv_top_sums).all())
Example #31
0
    def test_training_changes_something(self):
        """One extra training iteration must change the sampler state.

        Two LdaCgsSeq models with the same seed are trained for 1 and 2
        iterations; ``Z``, ``word_top``, ``inv_top_sums`` and ``top_doc``
        must not be entirely equal between them.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(
            1000, 50, 0, 20, context_type='document', metadata=True)

        one_step = LdaCgsSeq(corpus, 'document', K=10)
        two_steps = LdaCgsSeq(corpus, 'document', K=10, seed=one_step.seed)

        one_step.train(n_iterations=1, verbose=0)
        two_steps.train(n_iterations=2, verbose=0)

        for attr_name in ('Z', 'word_top', 'inv_top_sums', 'top_doc'):
            unchanged = (getattr(one_step, attr_name)
                         == getattr(two_steps, attr_name))
            assert not unchanged.all()
Example #32
0
    def test_LdaCgsSeq_IO(self):
        """A trained LdaCgsSeq round-trips through ``save`` / ``load``.

        All model state, the seed and the RNG state of the loaded model
        are compared with the original. A second save/load then checks
        that the loaded model has no ``log_prob`` attribute.
        """
        from tempfile import NamedTemporaryFile
        import os
        import numpy as np

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle now lets the file be
        # rewritten and removed on platforms that lock open files.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(m0.context_type == m1.context_type)
            self.assertTrue(m0.K == m1.K)
            self.assertTrue((m0.alpha == m1.alpha).all())
            self.assertTrue((m0.beta == m1.beta).all())
            self.assertTrue(m0.log_probs == m1.log_probs)

            # Compare values element by element. ``a.all() == b.all()``
            # only compares two booleans and passes for unrelated arrays.
            self.assertEqual(len(m0.corpus), len(m1.corpus))
            for i in range(len(m0.corpus)):
                self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
            self.assertTrue(m0.V == m1.V)
            self.assertTrue(m0.iteration == m1.iteration)
            self.assertEqual(len(m0.Z), len(m1.Z))
            for i in range(len(m0.Z)):
                self.assertTrue((m0.Z[i] == m1.Z[i]).all())
            self.assertTrue((m0.top_doc == m1.top_doc).all())
            self.assertTrue((m0.word_top == m1.word_top).all())
            # Derived floating-point values: compare with a tolerance.
            self.assertTrue(
                np.isclose(m0.inv_top_sums, m1.inv_top_sums).all())

            self.assertTrue(m0.seed == m1.seed)
            self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
            self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
            for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
                self.assertTrue(s1 == s2)

            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertFalse(hasattr(m1, 'log_prob'))
        finally:
            # Best-effort cleanup. OSError exists on every platform,
            # whereas WindowsError is only defined on Windows.
            try:
                os.remove(tmp.name)
            except OSError:
                pass
Example #33
0
    def test_LdaCgsMulti_continue_training(self):
        """Training 2 + 3 iterations must equal training 5 iterations at once.

        Two LdaCgsMulti models share the same seeds. After 2 versus 5
        iterations their RNG states must differ; once the first model has
        trained 3 more iterations, RNG states and all model state must
        agree.
        """
        import numpy as np
        from vsm.corpus.util.corpusbuilders import random_corpus

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                          metadata=True)

        m0 = LdaCgsMulti(c, 'document', K=10)
        assert m0.seeds is not None
        orig_seeds = m0.seeds

        m1 = LdaCgsMulti(c, 'document', K=10, seeds=orig_seeds)
        assert m0.seeds == m1.seeds

        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=5, verbose=0)
        assert m0.seeds == orig_seeds
        assert m1.seeds == orig_seeds
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            assert (s0[1] != s1[1]).any()
            assert s0[2:] != s1[2:]

        m0.train(n_iterations=3, verbose=0)
        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            assert s0[0] == 'MT19937'
            assert s1[0] == 'MT19937'
            assert (s0[1] == s1[1]).all()
            assert s0[2:] == s1[2:]

        assert m0.context_type == m1.context_type
        assert m0.K == m1.K
        assert (m0.alpha == m1.alpha).all()
        assert (m0.beta == m1.beta).all()
        assert m0.log_probs == m1.log_probs

        # Compare values element by element. ``a.all() == b.all()`` only
        # compares two booleans and passes for unrelated arrays.
        assert len(m0.corpus) == len(m1.corpus)
        for i in range(len(m0.corpus)):
            assert (m0.corpus[i] == m1.corpus[i]).all()
        assert m0.V == m1.V
        assert m0.iteration == m1.iteration
        assert len(m0.Z) == len(m1.Z)
        for i in range(len(m0.Z)):
            assert (m0.Z[i] == m1.Z[i]).all()
        assert (m0.top_doc == m1.top_doc).all()
        assert (m0.word_top == m1.word_top).all()
        # Derived floating-point values: compare with a tolerance.
        assert np.isclose(m0.inv_top_sums, m1.inv_top_sums).all()
Example #34
0
    def test_continueTraining(self):
        """Training 2 + 3 iterations must equal training 5 iterations at once.

        Two LdaCgsSeq models share the same seed. After 2 versus 5
        iterations their RNG states must differ; once the first model has
        trained 3 more iterations, RNG state and all model state must
        agree.
        """
        import numpy as np
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000, 50, 0, 20, context_type='document',
                          metadata=True)

        m0 = LdaCgsSeq(c, 'document', K=10)
        assert m0.seed is not None
        orig_seed = m0.seed

        m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
        assert m0.seed == m1.seed

        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=5, verbose=0)
        assert m0.seed == orig_seed
        assert m1.seed == orig_seed
        assert (m0._mtrand_state[1] != m1._mtrand_state[1]).any()
        assert m0._mtrand_state[2:] != m1._mtrand_state[2:]

        m0.train(n_iterations=3, verbose=0)

        # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
        assert m0._mtrand_state[0] == 'MT19937'
        assert m1._mtrand_state[0] == 'MT19937'
        assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
        assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)

        # ``range`` instead of ``xrange`` so this also runs on Python 3.
        # Compare values element by element: ``a.all() == b.all()`` only
        # compares two booleans and passes for unrelated arrays.
        self.assertEqual(len(m0.corpus), len(m1.corpus))
        for i in range(len(m0.corpus)):
            self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        self.assertEqual(len(m0.Z), len(m1.Z))
        for i in range(len(m0.Z)):
            self.assertTrue((m0.Z[i] == m1.Z[i]).all())
        self.assertTrue((m0.top_doc == m1.top_doc).all())
        self.assertTrue((m0.word_top == m1.word_top).all())
        # Derived floating-point values: compare with a tolerance.
        self.assertTrue(
            np.isclose(m0.inv_top_sums, m1.inv_top_sums).all())
Example #35
0
    def test_LdaCgsSeq_IO(self):
        """A trained LdaCgsSeq round-trips through ``save`` / ``load``.

        All model state, the seed and the RNG state of the loaded model
        are compared with the original. A second save/load then checks
        that the loaded model has no ``log_prob`` attribute.
        """
        from tempfile import NamedTemporaryFile
        import os
        import numpy as np

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        # Only the path is needed. Closing the handle now lets the file be
        # rewritten and removed on platforms that lock open files.
        tmp.close()
        try:
            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertTrue(m0.context_type == m1.context_type)
            self.assertTrue(m0.K == m1.K)
            self.assertTrue((m0.alpha == m1.alpha).all())
            self.assertTrue((m0.beta == m1.beta).all())
            self.assertTrue(m0.log_probs == m1.log_probs)

            # Compare values element by element. ``a.all() == b.all()``
            # only compares two booleans and passes for unrelated arrays.
            self.assertEqual(len(m0.corpus), len(m1.corpus))
            for i in range(len(m0.corpus)):
                self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
            self.assertTrue(m0.V == m1.V)
            self.assertTrue(m0.iteration == m1.iteration)
            self.assertEqual(len(m0.Z), len(m1.Z))
            for i in range(len(m0.Z)):
                self.assertTrue((m0.Z[i] == m1.Z[i]).all())
            self.assertTrue((m0.top_doc == m1.top_doc).all())
            self.assertTrue((m0.word_top == m1.word_top).all())
            # Derived floating-point values: compare with a tolerance.
            self.assertTrue(
                np.isclose(m0.inv_top_sums, m1.inv_top_sums).all())

            self.assertTrue(m0.seed == m1.seed)
            self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
            self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
            for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
                self.assertTrue(s1 == s2)

            m0 = LdaCgsSeq(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsSeq.load(tmp.name)
            self.assertFalse(hasattr(m1, 'log_prob'))
        finally:
            # Best-effort cleanup. OSError exists on every platform,
            # whereas WindowsError is only defined on Windows.
            try:
                os.remove(tmp.name)
            except OSError:
                pass
Example #36
0
    def test_theta_and_phi(self):
        """Normalised ``word_top`` and ``top_doc`` of a trained model sum to one.

        ``phi`` and ``theta`` are the column-normalised count matrices.
        Every column must sum to 1, so ``phi`` sums to 10.0 (the test
        uses K=10) and ``theta`` to its number of columns.
        """
        import numpy as np
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        c = random_corpus(1000,
                          50,
                          0,
                          20,
                          context_type='document',
                          metadata=True)
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=50, verbose=0)

        # Compare with an explicit relative tolerance (about the
        # resolution of float16) instead of casting to float16, whose
        # rounding and low-precision accumulation are fragile.
        tol = 1e-3

        phi = m0.word_top / m0.word_top.sum(0)
        assert np.allclose(phi.sum(axis=0), 1.0, rtol=tol, atol=0.0)
        assert np.isclose(phi.sum(), 10.0, rtol=tol, atol=0.0)

        theta = m0.top_doc / m0.top_doc.sum(0)
        assert np.allclose(theta.sum(axis=0), 1.0, rtol=tol, atol=0.0)
        assert np.isclose(theta.sum(), theta.shape[1], rtol=tol, atol=0.0)
Example #37
0
    def setUp(self):
        """Build the fixtures for the BEAGLE composite-model tests.

        Stores on the test case a random sentence corpus (``self.ec``), a
        stop-listed copy (``self.cc``) and trained environment
        (``self.e``), context (``self.cm``), order (``self.om``) and
        composite (``self.m``) models.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beaglecomposite import BeagleComposite
        from vsm.model.beagleenvironment import BeagleEnvironment
        from vsm.model.beaglecontext import BeagleContextSeq
        from vsm.model.beagleorder import BeagleOrderSeq

        self.ec = random_corpus(1000, 50, 0, 20, context_type='sentence')
        # ``range`` instead of ``xrange`` so this also runs on Python 3.
        # The stoplist holds the labels '0', '7', ..., '49'.
        self.cc = self.ec.apply_stoplist(
            stoplist=[str(i) for i in range(0, 50, 7)])

        self.e = BeagleEnvironment(self.ec, n_cols=5)
        self.e.train()

        self.cm = BeagleContextSeq(self.cc, self.ec, self.e.matrix)
        self.cm.train()

        self.om = BeagleOrderSeq(self.ec, self.e.matrix)
        self.om.train()

        # The composite model is built from the context and order matrices.
        self.m = BeagleComposite(self.cc, self.cm.matrix,
                                 self.ec, self.om.matrix)
        self.m.train()
Example #38
0
    def test_apply_stoplist(self):
        """Exercise ``apply_stoplist`` on a random and a hand-written corpus.

        The random corpus is filtered with the extra stop words '0' and
        '1'; the hand-written one with ``nltk_stop=True`` and ``freq=1``.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist

        c = random_corpus(
            1000, 50, 0, 20, context_type='sentence', metadata=True)
        new_c = apply_stoplist(
            c, nltk_stop=False, add_stop=['0', '1'], freq=0)

        sentences = [
            [],
            ['he', 'said'],
            ['he', 'said', 'bar'],
            ['bar', 'ate'],
            ['I', 'foo'],
        ]
        wc = corpus_fromlist(sentences, context_type='sentence')
        new_wc = apply_stoplist(wc, nltk_stop=True, freq=1)

        # The source corpus keeps the stop words, the filtered one does not.
        for label in ('0', '1'):
            self.assertTrue(label in c.words)
        for label in ('0', '1'):
            self.assertFalse(label in new_c.words)

        for kept in ('said', 'bar'):
            self.assertTrue(kept in new_wc.words)
        for dropped in ('he', 'foo', 'ate'):
            self.assertFalse(dropped in new_wc.words)
Example #39
0
    def test_LdaCgsMulti_SeedTypes(self):
        """Test for issue #74: seed and RNG-state types survive save/load.

        Trains a model, saves it to a temporary ``.npz`` file, reloads it
        and checks that every seed, and each of the first five entries of
        every ``_mtrand_states`` item, has the same type before and after.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            # Only the file name is needed.  Close the handle right away so
            # it is not leaked and so the file can be rewritten and removed
            # on platforms that lock open files (e.g. Windows).
            tmp.close()

            m0 = LdaCgsMulti(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsMulti.load(tmp.name)

            for s0, s1 in zip(m0.seeds, m1.seeds):
                assert type(s0) is type(s1)
            for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
                # Indexed on purpose: a state with fewer than five entries
                # raises instead of being silently truncated.
                for i in range(5):
                    assert type(s0[i]) is type(s1[i])
        finally:
            os.remove(tmp.name)
Example #40
0
    def test_LdaCgsMulti_SeedTypes(self):
        """Test for issue #74: seed and RNG-state types survive save/load.

        A trained model is written to a temporary ``.npz`` file and read
        back; the types of the seeds and of the first five entries of each
        ``_mtrand_states`` item must match between the two models.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            # The open handle is not used, only the path.  Closing it here
            # avoids leaking it and lets save/remove work where open files
            # are locked (e.g. Windows).
            tmp.close()

            m0 = LdaCgsMulti(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsMulti.load(tmp.name)

            for s0, s1 in zip(m0.seeds, m1.seeds):
                assert type(s0) is type(s1)
            for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
                # Keep explicit indexing so a short state tuple fails loudly.
                for i in range(5):
                    assert type(s0[i]) is type(s1[i])
        finally:
            os.remove(tmp.name)
Example #41
0
    def test_training_changes_something(self):
        """One vs. two training iterations must yield different state.

        Two ``LdaCgsSeq`` models are built on the same corpus with the same
        seed; one is trained for a single iteration and the other for two.
        None of the compared arrays may be element-wise identical.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.ldacgsseq import LdaCgsSeq

        corpus = random_corpus(1000, 50, 0, 20,
                               context_type='document', metadata=True)

        short_run = LdaCgsSeq(corpus, 'document', K=10)
        # Reuse the first model's seed so both start from the same RNG seed.
        long_run = LdaCgsSeq(corpus, 'document', K=10, seed=short_run.seed)

        short_run.train(n_iterations=1, verbose=0)
        long_run.train(n_iterations=2, verbose=0)

        for attr in ('Z', 'word_top', 'inv_top_sums', 'top_doc'):
            identical = (getattr(short_run, attr) == getattr(long_run, attr))
            assert not identical.all()
Example #42
0
    def test_LdaCgsMulti_IO(self):
        """Round-trip a trained ``LdaCgsMulti`` through ``save``/``load``.

        The reloaded model must match the original in its scalar settings,
        corpus, topic assignments, count matrices, seeds and RNG states.

        The array checks use ``(a == b).all()``.  The earlier form
        ``a.all() == b.all()`` only compared two truthiness flags and so
        passed for almost any pair of arrays.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            # Only the path is used.  Close the handle so it is not leaked
            # and the file can be rewritten/removed where open files are
            # locked (e.g. Windows).
            tmp.close()

            m0 = LdaCgsMulti(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsMulti.load(tmp.name)
            assert m0.context_type == m1.context_type
            assert m0.K == m1.K
            assert (m0.alpha == m1.alpha).all()
            assert (m0.beta == m1.beta).all()
            assert m0.log_probs == m1.log_probs
            assert len(m0.corpus) == len(m1.corpus)
            for doc0, doc1 in zip(m0.corpus, m1.corpus):
                assert (doc0 == doc1).all()
            assert m0.V == m1.V
            assert m0.iteration == m1.iteration
            assert len(m0.Z) == len(m1.Z)
            for z0, z1 in zip(m0.Z, m1.Z):
                assert (z0 == z1).all()
            assert (m0.top_doc == m1.top_doc).all()
            assert (m0.word_top == m1.word_top).all()
            assert (m0.inv_top_sums == m1.inv_top_sums).all()

            assert m0.seeds == m1.seeds
            for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
                assert s0[0] == s1[0]
                # The second entry is compared element-wise; the rest as-is.
                assert (s0[1] == s1[1]).all()
                assert s0[2:] == s1[2:]

            m0 = LdaCgsMulti(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsMulti.load(tmp.name)
            # NOTE(review): this checks 'log_prob' (singular) while the
            # attribute compared above is 'log_probs' -- confirm the intent.
            assert not hasattr(m1, 'log_prob')
        finally:
            os.remove(tmp.name)
Example #43
0
    def test_LdaCgsMulti_IO(self):
        """Check that ``LdaCgsMulti.save``/``load`` preserves model state.

        Compares scalar settings, corpus, topic assignments, count
        matrices, seeds and RNG states between the trained model and its
        reloaded copy.

        Array contents are compared with ``(a == b).all()``; comparing
        ``a.all() == b.all()`` would only check that both arrays have the
        same overall truthiness.
        """
        from tempfile import NamedTemporaryFile
        import os

        c = random_corpus(1000, 50, 6, 100)
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            # The handle itself is unused.  Closing it prevents a leak and
            # allows the path to be rewritten and removed on platforms that
            # lock open files (e.g. Windows).
            tmp.close()

            m0 = LdaCgsMulti(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsMulti.load(tmp.name)
            assert m0.context_type == m1.context_type
            assert m0.K == m1.K
            assert (m0.alpha == m1.alpha).all()
            assert (m0.beta == m1.beta).all()
            assert m0.log_probs == m1.log_probs
            assert len(m0.corpus) == len(m1.corpus)
            for doc0, doc1 in zip(m0.corpus, m1.corpus):
                assert (doc0 == doc1).all()
            assert m0.V == m1.V
            assert m0.iteration == m1.iteration
            assert len(m0.Z) == len(m1.Z)
            for z0, z1 in zip(m0.Z, m1.Z):
                assert (z0 == z1).all()
            assert (m0.top_doc == m1.top_doc).all()
            assert (m0.word_top == m1.word_top).all()
            assert (m0.inv_top_sums == m1.inv_top_sums).all()

            assert m0.seeds == m1.seeds
            for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
                assert s0[0] == s1[0]
                # Second entry compared element-wise; remaining ones directly.
                assert (s0[1] == s1[1]).all()
                assert s0[2:] == s1[2:]

            m0 = LdaCgsMulti(c, 'document', K=10)
            m0.train(n_iterations=20)
            m0.save(tmp.name)
            m1 = LdaCgsMulti.load(tmp.name)
            # NOTE(review): 'log_prob' (singular) differs from the
            # 'log_probs' attribute compared above -- confirm the intent.
            assert not hasattr(m1, 'log_prob')
        finally:
            os.remove(tmp.name)
Example #44
0
    def setUp(self):
        """Build and train the BEAGLE fixture models.

        Stores on the test case: the random sentence corpus (``ec``), its
        stop-listed copy (``cc``), and the trained environment (``e``),
        context (``cm``), order (``om``) and composite (``m``) models.
        """
        from vsm.corpus.util.corpusbuilders import random_corpus
        from vsm.model.beaglecomposite import BeagleComposite
        from vsm.model.beagleenvironment import BeagleEnvironment
        from vsm.model.beaglecontext import BeagleContextSeq
        from vsm.model.beagleorder import BeagleOrderSeq

        # Stop words are the strings '0', '7', ..., '49'.
        stop_words = list(map(str, range(0, 50, 7)))

        self.ec = env_corpus = random_corpus(
            1000, 50, 0, 20, context_type='sentence')
        self.cc = ctx_corpus = env_corpus.apply_stoplist(stoplist=stop_words)

        self.e = environment = BeagleEnvironment(env_corpus, n_cols=5)
        environment.train()

        self.cm = context = BeagleContextSeq(
            ctx_corpus, env_corpus, environment.matrix)
        context.train()

        self.om = order = BeagleOrderSeq(env_corpus, environment.matrix)
        order.train()

        # The composite model is built from the context and order matrices.
        self.m = composite = BeagleComposite(
            ctx_corpus, context.matrix, env_corpus, order.matrix)
        composite.train()
Example #45
0
 def setUp(self):
     """Create a random corpus and a ``BaseModel`` over it as fixtures."""
     corpus = random_corpus(1000, 50, 6, 100)
     self.c = corpus
     self.m = BaseModel(corpus, 'context')
Example #46
0
 def setUp(self):
     """Prepare ``self.c`` (random corpus) and ``self.m`` (``BaseModel``)."""
     fixture_corpus = random_corpus(1000, 50, 6, 100)
     self.c = fixture_corpus
     self.m = BaseModel(fixture_corpus, 'context')