def test_LdaCgsSeq_SeedTypes(self):
    """
    Test for issue #74: seed and RNG-state types survive a save/load cycle.

    Trains a model, saves it to a temporary ``.npz`` file, loads it back
    and checks that ``seed`` and each of the first five entries of
    ``_mtrand_state`` have the same type on the original and the reloaded
    model.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Release the handle right away so the file
    # can be rewritten and removed on platforms that lock open files.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertEqual(type(m0.seed), type(m1.seed))
        for i in range(5):
            expected = type(m0._mtrand_state[i])
            actual = type(m1._mtrand_state[i])
            # The message replaces the former debug print and pinpoints
            # which state entry changed type.
            self.assertEqual(
                expected, actual,
                "_mtrand_state[%d] type changed: %r != %r"
                % (i, expected, actual))
    finally:
        # Best-effort cleanup. OSError is available on every platform,
        # whereas WindowsError is only defined on Windows.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
def test_LdaCgsSeq_SeedTypes(self):
    """
    Test for issue #74: seed and RNG-state types survive a save/load cycle.

    Trains a model, saves it to a temporary ``.npz`` file, loads it back
    and checks that ``seed`` and each of the first five entries of
    ``_mtrand_state`` have the same type on the original and the reloaded
    model.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Release the handle right away so the file
    # can be rewritten and removed on platforms that lock open files.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertEqual(type(m0.seed), type(m1.seed))
        for i in range(5):
            expected = type(m0._mtrand_state[i])
            actual = type(m1._mtrand_state[i])
            # The message replaces the former debug print and pinpoints
            # which state entry changed type.
            self.assertEqual(
                expected, actual,
                "_mtrand_state[%d] type changed: %r != %r"
                % (i, expected, actual))
    finally:
        # Best-effort cleanup. OSError is available on every platform,
        # whereas WindowsError is only defined on Windows.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
def setUp(self):
    """
    Prepare the viewer fixture used by the tests.

    Generates a random corpus, trains an ``LdaCgsSeq`` model with K=10 on
    its 'document' contexts for 50 iterations, and stores an
    ``LdaCgsViewer`` over that corpus and model in ``self.ldav``.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    corpus = random_corpus(
        1000, 50, 0, 20,
        context_type='document',
        metadata=True,
    )

    model = LdaCgsSeq(corpus, 'document', K=10)
    model.train(n_iterations=50, verbose=0)

    self.ldav = LdaCgsViewer(corpus, model)
def test_theta_and_phi(self):
    """
    Check that the normalised ``word_top`` and ``top_doc`` matrices have
    columns summing to one.

    Comparisons are made after casting to float16, which acts as a coarse
    tolerance on the floating-point sums.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    corpus = random_corpus(
        1000, 50, 0, 20,
        context_type='document',
        metadata=True,
    )
    model = LdaCgsSeq(corpus, 'document', K=10)
    model.train(n_iterations=50, verbose=0)

    # phi: word_top divided by its column totals.
    phi = model.word_top / model.word_top.sum(0)
    phi_column_sums = phi.sum(axis=0).astype('float16')
    assert (phi_column_sums == 1.0).all()
    # Ten columns (K=10), each summing to one.
    phi_total = phi.sum(dtype='float16')
    assert phi_total == 10.0

    # theta: top_doc divided by its column totals.
    theta = model.top_doc / model.top_doc.sum(0)
    theta_column_sums = theta.sum(axis=0).astype('float16')
    assert (theta_column_sums == 1.0).all()
    theta_total = theta.sum().astype('float16')
    assert theta_total == theta.shape[1]
def test_training_changes_something(self):
    """
    Check that an extra training iteration alters the model state.

    Two models share the same corpus and seed; one is trained for a single
    iteration and the other for two. ``Z``, ``word_top``, ``inv_top_sums``
    and ``top_doc`` must each differ somewhere between the two models.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    corpus = random_corpus(
        1000, 50, 0, 20,
        context_type='document',
        metadata=True,
    )

    one_step = LdaCgsSeq(corpus, 'document', K=10)
    two_steps = LdaCgsSeq(corpus, 'document', K=10, seed=one_step.seed)
    one_step.train(n_iterations=1, verbose=0)
    two_steps.train(n_iterations=2, verbose=0)

    # Same attributes, in the same order, as the original explicit asserts.
    for attr in ('Z', 'word_top', 'inv_top_sums', 'top_doc'):
        assert not (getattr(one_step, attr) == getattr(two_steps, attr)).all()
def test_LdaCgsSeq_IO(self):
    """
    Check that a trained model round-trips through ``save``/``load``.

    The model is trained, written to a temporary ``.npz`` file and read
    back; scalar settings, the corpus, topic assignments, count matrices,
    the seed and the saved RNG state are compared between the original
    and the reloaded model.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Release the handle right away so the file
    # can be rewritten and removed on platforms that lock open files.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)

        # Compare values element by element. The previous form,
        # ``a.all() == b.all()``, only compared two booleans and passed
        # for almost any pair of arrays.
        self.assertEqual(len(m0.corpus), len(m1.corpus))
        for i in range(len(m0.corpus)):
            self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        self.assertEqual(len(m0.Z), len(m1.Z))
        for i in range(len(m0.Z)):
            self.assertTrue((m0.Z[i] == m1.Z[i]).all())
        for name in ('top_doc', 'word_top', 'inv_top_sums'):
            saved, loaded = getattr(m0, name), getattr(m1, name)
            self.assertEqual(saved.shape, loaded.shape, name)
            self.assertTrue((saved == loaded).all(), name)

        self.assertTrue(m0.seed == m1.seed)
        self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
        self.assertTrue(
            (m0._mtrand_state[1] == m1._mtrand_state[1]).all())
        for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
            self.assertTrue(s1 == s2)

        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)
        # NOTE(review): the attribute compared above is ``log_probs``;
        # confirm that 'log_prob' (singular) is the name intended here.
        self.assertTrue(not hasattr(m1, 'log_prob'))
    finally:
        # Best-effort cleanup. OSError is available on every platform,
        # whereas WindowsError is only defined on Windows.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
def test_randomSeed(self):
    """
    Check that two models built from the same seed train identically.

    A first model is created without an explicit seed; a second one reuses
    the first model's seed. After both are trained for 50 iterations, the
    seed must be unchanged and the saved RNG state, settings, corpus,
    topic assignments and count matrices must match between the two.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)

    m0 = LdaCgsSeq(c, 'document', K=10)
    assert m0.seed is not None
    orig_seed = m0.seed

    m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
    assert m0.seed == m1.seed

    m0.train(n_iterations=50, verbose=0)
    m1.train(n_iterations=50, verbose=0)
    assert m0.seed == orig_seed
    assert m1.seed == orig_seed

    # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
    assert m0._mtrand_state[0] == 'MT19937'
    assert m1._mtrand_state[0] == 'MT19937'
    assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
    assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

    self.assertTrue(m0.context_type == m1.context_type)
    self.assertTrue(m0.K == m1.K)
    self.assertTrue((m0.alpha == m1.alpha).all())
    self.assertTrue((m0.beta == m1.beta).all())
    self.assertTrue(m0.log_probs == m1.log_probs)

    # Compare values element by element. The previous form,
    # ``a.all() == b.all()``, only compared two booleans and passed for
    # almost any pair of arrays.
    self.assertEqual(len(m0.corpus), len(m1.corpus))
    for i in range(len(m0.corpus)):
        self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
    self.assertTrue(m0.V == m1.V)
    self.assertTrue(m0.iteration == m1.iteration)
    self.assertEqual(len(m0.Z), len(m1.Z))
    for i in range(len(m0.Z)):
        self.assertTrue((m0.Z[i] == m1.Z[i]).all())
    for name in ('top_doc', 'word_top', 'inv_top_sums'):
        first, second = getattr(m0, name), getattr(m1, name)
        self.assertEqual(first.shape, second.shape, name)
        self.assertTrue((first == second).all(), name)
def test_LdaCgsSeq_IO(self):
    """
    Check that a trained model round-trips through ``save``/``load``.

    The model is trained, written to a temporary ``.npz`` file and read
    back; scalar settings, the corpus, topic assignments, count matrices,
    the seed and the saved RNG state are compared between the original
    and the reloaded model.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Release the handle right away so the file
    # can be rewritten and removed on platforms that lock open files.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)

        # Compare values element by element. The previous form,
        # ``a.all() == b.all()``, only compared two booleans and passed
        # for almost any pair of arrays.
        self.assertEqual(len(m0.corpus), len(m1.corpus))
        for i in range(len(m0.corpus)):
            self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        self.assertEqual(len(m0.Z), len(m1.Z))
        for i in range(len(m0.Z)):
            self.assertTrue((m0.Z[i] == m1.Z[i]).all())
        for name in ('top_doc', 'word_top', 'inv_top_sums'):
            saved, loaded = getattr(m0, name), getattr(m1, name)
            self.assertEqual(saved.shape, loaded.shape, name)
            self.assertTrue((saved == loaded).all(), name)

        self.assertTrue(m0.seed == m1.seed)
        self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
        self.assertTrue(
            (m0._mtrand_state[1] == m1._mtrand_state[1]).all())
        for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
            self.assertTrue(s1 == s2)

        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)
        # NOTE(review): the attribute compared above is ``log_probs``;
        # confirm that 'log_prob' (singular) is the name intended here.
        self.assertTrue(not hasattr(m1, 'log_prob'))
    finally:
        # Best-effort cleanup. OSError is available on every platform,
        # whereas WindowsError is only defined on Windows.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
def test_randomSeed(self):
    """
    Check that two models built from the same seed train identically.

    A first model is created without an explicit seed; a second one reuses
    the first model's seed. After both are trained for 50 iterations, the
    seed must be unchanged and the saved RNG state, settings, corpus,
    topic assignments and count matrices must match between the two.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)

    m0 = LdaCgsSeq(c, 'document', K=10)
    assert m0.seed is not None
    orig_seed = m0.seed

    m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
    assert m0.seed == m1.seed

    m0.train(n_iterations=50, verbose=0)
    m1.train(n_iterations=50, verbose=0)
    assert m0.seed == orig_seed
    assert m1.seed == orig_seed

    # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
    assert m0._mtrand_state[0] == 'MT19937'
    assert m1._mtrand_state[0] == 'MT19937'
    assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
    assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

    self.assertTrue(m0.context_type == m1.context_type)
    self.assertTrue(m0.K == m1.K)
    self.assertTrue((m0.alpha == m1.alpha).all())
    self.assertTrue((m0.beta == m1.beta).all())
    self.assertTrue(m0.log_probs == m1.log_probs)

    # ``range`` replaces the Python-2-only ``xrange`` so the test also
    # runs on Python 3. Values are compared element by element; the
    # previous form, ``a.all() == b.all()``, only compared two booleans
    # and passed for almost any pair of arrays.
    self.assertEqual(len(m0.corpus), len(m1.corpus))
    for i in range(len(m0.corpus)):
        self.assertTrue((m0.corpus[i] == m1.corpus[i]).all())
    self.assertTrue(m0.V == m1.V)
    self.assertTrue(m0.iteration == m1.iteration)
    self.assertEqual(len(m0.Z), len(m1.Z))
    for i in range(len(m0.Z)):
        self.assertTrue((m0.Z[i] == m1.Z[i]).all())
    for name in ('top_doc', 'word_top', 'inv_top_sums'):
        first, second = getattr(m0, name), getattr(m1, name)
        self.assertEqual(first.shape, second.shape, name)
        self.assertTrue((first == second).all(), name)