def setUp(self):
    """Train the BEAGLE models on a random corpus and build one viewer each.

    Populates ``self.venv``, ``self.vctx``, ``self.vord`` and ``self.vcom``
    with ``BeagleViewer`` objects for the environment, context, order and
    composite models.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.beagleenvironment import BeagleEnvironment
    from vsm.model.beaglecontext import BeagleContextSeq
    from vsm.model.beagleorder import BeagleOrderSeq
    from vsm.model.beaglecomposite import BeagleComposite

    env_corpus = random_corpus(1000, 50, 0, 20, context_type='sentence')
    # The stoplist is every seventh label: '0', '7', ..., '49'.
    stop_labels = [str(label) for label in range(0, 50, 7)]
    ctx_corpus = env_corpus.apply_stoplist(stoplist=stop_labels)

    env_model = BeagleEnvironment(env_corpus, n_cols=5)
    env_model.train()

    ctx_model = BeagleContextSeq(ctx_corpus, env_corpus, env_model.matrix)
    ctx_model.train()

    ord_model = BeagleOrderSeq(env_corpus, env_model.matrix)
    ord_model.train()

    composite = BeagleComposite(ctx_corpus, ctx_model.matrix,
                                env_corpus, ord_model.matrix)
    composite.train()

    self.venv = BeagleViewer(env_corpus, env_model)
    self.vctx = BeagleViewer(ctx_corpus, ctx_model)
    self.vord = BeagleViewer(env_corpus, ord_model)
    self.vcom = BeagleViewer(ctx_corpus, composite)
def setUp(self):
    """Build BeagleViewer fixtures for all four BEAGLE model variants.

    A random sentence corpus and a stoplisted copy of it are used to train
    the environment, context, order and composite models; the resulting
    viewers are stored as ``self.venv``, ``self.vctx``, ``self.vord`` and
    ``self.vcom``.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.beagleenvironment import BeagleEnvironment
    from vsm.model.beaglecontext import BeagleContextSeq
    from vsm.model.beagleorder import BeagleOrderSeq
    from vsm.model.beaglecomposite import BeagleComposite

    full_corpus = random_corpus(1000, 50, 0, 20, context_type='sentence')
    # Labels '0', '7', ..., '49' are stoplisted.
    stopped = [str(n) for n in range(0, 50, 7)]
    reduced_corpus = full_corpus.apply_stoplist(stoplist=stopped)

    environment = BeagleEnvironment(full_corpus, n_cols=5)
    environment.train()

    context = BeagleContextSeq(reduced_corpus, full_corpus,
                               environment.matrix)
    context.train()

    order = BeagleOrderSeq(full_corpus, environment.matrix)
    order.train()

    combined = BeagleComposite(reduced_corpus, context.matrix,
                               full_corpus, order.matrix)
    combined.train()

    self.venv = BeagleViewer(full_corpus, environment)
    self.vctx = BeagleViewer(reduced_corpus, context)
    self.vord = BeagleViewer(full_corpus, order)
    self.vcom = BeagleViewer(reduced_corpus, combined)
def test_LdaCgsMulti_eq_LdaCgsSeq_multi(self):
    """Check that single-process LdaCgsMulti matches LdaCgsSeq round by round.

    Both models are built on the same seeded corpus with the same seed and
    trained two iterations at a time. After every round their parameters,
    assignments, count matrices, RNG state and log probabilities are
    compared.
    """
    # The previous version created a NamedTemporaryFile(delete=False) that
    # was never used, closed or removed, leaking one file per run. It has
    # been dropped together with the imports it needed.
    c = random_corpus(1000, 50, 6, 100, seed=2)
    m0 = LdaCgsMulti(c, 'document', K=10, n_proc=1, seeds=[2])
    m1 = LdaCgsSeq(c, 'document', K=10, seed=2)

    for iteration in range(20):
        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=2, verbose=0)

        assert m0.context_type == m1.context_type
        assert m0.K == m1.K
        assert (m0.alpha == m1.alpha).all()
        assert (m0.beta == m1.beta).all()
        # NOTE(review): the original nesting was lost in the one-line
        # source; Z is checked per index next to the corpus here. Confirm
        # this matches the intended structure.
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            assert m0.corpus[i].all() == m1.corpus[i].all()
            assert m0.V == m1.V
            assert m0.iteration == m1.iteration
            assert (m0.Z[i] == m1.Z[i]).all()
        assert (m0.top_doc == m1.top_doc).all()
        assert (m0.word_top == m1.word_top).all()

        # The only worker of the multi model must share the sequential
        # model's seed and full generator state.
        assert m0.seeds[0] == m1.seed
        assert m0._mtrand_states[0][0] == m1._mtrand_state[0]
        for s0, s1 in zip(m0._mtrand_states[0][1], m1._mtrand_state[1]):
            assert s0 == s1
        assert m0._mtrand_states[0][2] == m1._mtrand_state[2]
        assert m0._mtrand_states[0][3] == m1._mtrand_state[3]
        assert m0._mtrand_states[0][4] == m1._mtrand_state[4]

        print((iteration, m0.log_probs[-1], m1.log_probs[-1]))
        for i in range(iteration):
            assert np.isclose(m0.log_probs[i][1], m1.log_probs[i][1])
        assert (np.isclose(m0.inv_top_sums, m1.inv_top_sums)).all()
def test_LdaCgsSeq_SeedTypes(self):
    """Test for issue #74: seed and RNG state types survive a save/load.

    The type of ``seed`` and of each of the five ``_mtrand_state`` entries
    must be the same on the trained model and on the reloaded one.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Closing the handle now avoids leaking it and
    # lets the file be reopened and removed on platforms that forbid that
    # while it is still open.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertTrue(type(m0.seed) == type(m1.seed))
        print("seed types:", type(m0._mtrand_state[0]),
              type(m1._mtrand_state[0]))
        for k in range(5):
            self.assertTrue(
                type(m0._mtrand_state[k]) == type(m1._mtrand_state[k]))
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best-effort cleanup. WindowsError is only defined on Windows
            # (where it is an OSError), so naming it here would raise
            # NameError elsewhere on Python 3.
            pass
def test_LdaCgsMulti_eq_LdaCgsSeq_multi(self):
    """Check that single-process LdaCgsMulti matches LdaCgsSeq round by round.

    Both models share the corpus and the seed and are trained two
    iterations at a time; after each round their parameters, assignments,
    count matrices, RNG state and log probabilities are compared.
    """
    # Changes from the previous version:
    #  * ``xrange`` and the ``print`` statement were Python 2 only and are
    #    replaced by ``range`` and the ``print`` function.
    #  * The NamedTemporaryFile(delete=False) that was created but never
    #    used or removed (a leaked file per run) has been dropped.
    c = random_corpus(1000, 50, 6, 100, seed=2)
    m0 = LdaCgsMulti(c, 'document', K=10, n_proc=1, seeds=[2])
    m1 = LdaCgsSeq(c, 'document', K=10, seed=2)

    for iteration in range(20):
        m0.train(n_iterations=2, verbose=0)
        m1.train(n_iterations=2, verbose=0)

        assert m0.context_type == m1.context_type
        assert m0.K == m1.K
        assert (m0.alpha == m1.alpha).all()
        assert (m0.beta == m1.beta).all()
        # NOTE(review): the original nesting was lost in the one-line
        # source; Z is checked per index next to the corpus here. Confirm
        # this matches the intended structure.
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            assert m0.corpus[i].all() == m1.corpus[i].all()
            assert m0.V == m1.V
            assert m0.iteration == m1.iteration
            assert (m0.Z[i] == m1.Z[i]).all()
        assert (m0.top_doc == m1.top_doc).all()
        assert (m0.word_top == m1.word_top).all()

        # The only worker of the multi model must share the sequential
        # model's seed and full generator state.
        assert m0.seeds[0] == m1.seed
        assert m0._mtrand_states[0][0] == m1._mtrand_state[0]
        for s0, s1 in zip(m0._mtrand_states[0][1], m1._mtrand_state[1]):
            assert s0 == s1
        assert m0._mtrand_states[0][2] == m1._mtrand_state[2]
        assert m0._mtrand_states[0][3] == m1._mtrand_state[3]
        assert m0._mtrand_states[0][4] == m1._mtrand_state[4]

        print(iteration, m0.log_probs[-1], m1.log_probs[-1])
        for i in range(iteration):
            assert np.isclose(m0.log_probs[i][1], m1.log_probs[i][1])
        assert (np.isclose(m0.inv_top_sums, m1.inv_top_sums)).all()
def test_apply_stoplist(self):
    """Check which words remain after ``apply_stoplist`` (not in place).

    A random corpus has '0' and '1' removed through ``add_stop``; a small
    hand-written corpus is filtered with ``nltk_stop=True`` and ``freq=1``.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist

    c = random_corpus(1000, 50, 0, 20, context_type='sentence',
                      metadata=True)
    new_c = apply_stoplist(c, nltk_stop=False, add_stop=['0', '1'],
                           freq=0, in_place=False)

    sentences = [[],
                 ['he', 'said'],
                 ['he', 'said', 'bar'],
                 ['bar', 'ate'],
                 ['I', 'foo']]
    wc = corpus_fromlist(sentences, context_type='sentence')
    new_wc = apply_stoplist(wc, nltk_stop=True, freq=1, in_place=False)

    # The source corpus keeps the stopped words; the filtered copy does not.
    for word in ('0', '1'):
        self.assertTrue(word in c.words)
    for word in ('0', '1'):
        self.assertFalse(word in new_c.words)

    for word in ('said', 'bar'):
        self.assertTrue(word in new_wc.words)
    for word in ('he', 'foo', 'ate'):
        self.assertFalse(word in new_wc.words)
def test_LdaCgsSeq_SeedTypes(self):
    """Test for issue #74: seed and RNG state types survive a save/load.

    Compares the type of ``seed`` and of each of the five ``_mtrand_state``
    entries between the trained model and its reloaded copy.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is used below. Closing the handle here prevents a
    # leaked descriptor and allows the file to be reopened and deleted on
    # platforms that lock open files.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertTrue(type(m0.seed) == type(m1.seed))
        print("seed types:", type(m0._mtrand_state[0]),
              type(m1._mtrand_state[0]))
        for k in range(5):
            self.assertTrue(
                type(m0._mtrand_state[k]) == type(m1._mtrand_state[k]))
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best-effort cleanup. OSError also covers WindowsError, which
            # does not exist as a name outside Windows on Python 3.
            pass
def test_LdaCgsMulti_phi(self):
    """Column-normalised ``word_top`` of a trained model sums to 1 per column."""
    corpus = random_corpus(1000, 50, 6, 100)
    model = LdaCgsMulti(corpus, 'document', K=10)
    model.train(n_iterations=20)

    # Normalise each column of word_top by its own total.
    column_totals = model.word_top.sum(0)
    phi = model.word_top / column_totals

    per_column = phi.sum(axis=0)
    assert (per_column == 1.0).all()
    # K=10 columns, each summing to one.
    assert phi.sum() == 10.0
def test_LdaCgsMulti_theta(self):
    """Column-normalised ``top_doc`` of a trained model sums to 1 per column."""
    c = random_corpus(1000, 50, 6, 100)
    m0 = LdaCgsMulti(c, 'document', K=10)
    m0.train(n_iterations=20)

    # The previous version read ``self.model.top_doc`` and so never
    # exercised the model trained above; ``m0`` is the model under test.
    theta = m0.top_doc / m0.top_doc.sum(0)
    # NOTE(review): these are exact float comparisons, as in the sibling
    # phi test; confirm they are stable for the dtype of top_doc.
    assert (theta.sum(axis=0) == 1.0).all()
    assert theta.sum() == theta.shape[1]
def test_LdaCgsMulti_remove_Seq_props(self):
    """A fresh LdaCgsMulti has no (or a ``None``) ``seed`` and ``_mtrand_state``."""
    from vsm.corpus.util.corpusbuilders import random_corpus

    corpus = random_corpus(1000, 50, 0, 20, context_type='document',
                           metadata=True)
    model = LdaCgsMulti(corpus, 'document', K=10)

    # A missing attribute and an attribute set to None both pass.
    for seq_only in ('seed', '_mtrand_state'):
        assert getattr(model, seq_only, None) is None
def test_add_metadata(self):
    """``add_metadata`` stores a new per-sentence field that reads back intact."""
    from vsm.corpus.util.corpusbuilders import random_corpus

    corpus = random_corpus(1000, 50, 0, 20, context_type='sentence',
                           metadata=True)
    # One label per existing 'sentence' metadata entry.
    n_entries = corpus.view_metadata('sentence').size
    labels = ['m_{0}'.format(idx) for idx in range(n_entries)]

    updated = add_metadata(corpus, 'sentence', 'new_meta', labels)
    stored = updated.view_metadata('sentence')['new_meta'].tolist()
    self.assertEqual(stored, labels)
def setUp(self):
    """Train a ``TfSeq`` model on a random corpus and expose it as ``self.v``."""
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.tf import TfSeq

    corpus = random_corpus(1000, 50, 0, 20, context_type='document',
                           metadata=True)
    model = TfSeq(corpus, 'document')
    model.train()

    self.v = TfViewer(corpus, model)
def setUp(self):
    """Train a 10-topic ``LdaCgsSeq`` model and expose its viewer as ``self.ldav``."""
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    corpus = random_corpus(1000, 50, 0, 20, context_type='document',
                           metadata=True)
    model = LdaCgsSeq(corpus, 'document', K=10)
    model.train(n_iterations=50, verbose=0)

    self.ldav = LdaCgsViewer(corpus, model)
def setUp(self): from vsm.corpus.util.corpusbuilders import random_corpus from vsm.model.beagleenvironment import BeagleEnvironment self.c = random_corpus(1000, 50, 0, 10, context_type='sentence') self.e = BeagleEnvironment(self.c, n_cols=100) self.e.train() self.ms = BeagleOrderSeq(self.c, self.e.matrix) self.ms.train() '''
def test_IndexedSymmArray(self):
    """``dismat_top`` returns one row per requested entry (integer labels)."""
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq
    from vsm.viewer.ldacgsviewer import LdaCgsViewer

    corpus = random_corpus(50000, 1000, 0, 50)
    # The model is deliberately left as constructed; train() is not called.
    model = LdaCgsSeq(corpus, 'document', K=20)
    viewer = LdaCgsViewer(corpus, model)

    requested = [0, 1, 10]
    result = viewer.dismat_top(requested)

    self.assertEqual(result.shape[0], len(requested))
def test_IndexedSymmArray(self):
    """``dismat_top`` returns one row per requested entry (string labels)."""
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq
    from vsm.viewer.ldacgsviewer import LdaCgsViewer

    corpus = random_corpus(50000, 1000, 0, 50)
    # The model is deliberately left as constructed; train() is not called.
    model = LdaCgsSeq(corpus, 'document', K=20)
    viewer = LdaCgsViewer(corpus, model)

    requested = ['0', '1', '10']
    result = viewer.dismat_top(requested)

    self.assertEqual(result.shape[0], len(requested))
def setUp(self): from vsm.corpus.util.corpusbuilders import random_corpus from vsm.model.beaglecontext import BeagleContextSeq, BeagleContextMulti from vsm.model.beagleenvironment import BeagleEnvironment self.ec = random_corpus(1000, 50, 0, 5, context_type='sentence') self.cc = self.ec.apply_stoplist(stoplist=[str(i) for i in xrange(0,50,7)]) self.e = BeagleEnvironment(self.ec, n_cols=5) self.e.train() self.ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix) self.ms.train() '''
def test_BaseModel_IO(self):
    """Round-trip a ``BaseModel`` through save/load and compare the copies.

    The reloaded model must have the same ``context_type`` and an
    element-wise equal ``matrix``.
    """
    from tempfile import NamedTemporaryFile as NTF
    import os

    c = random_corpus(1000, 50, 6, 100)
    # The context manager only closes the handle (delete=False keeps the
    # file), so the path is free to be reopened by save/load below.
    with NTF(delete=False, suffix='.npz') as tmp:
        path = tmp.name
    try:
        m0 = BaseModel(c.corpus, 'context')
        m0.save(path)
        m1 = BaseModel.load(path)
        self.assertEqual(m0.context_type, m1.context_type)
        self.assertTrue((m0.matrix == m1.matrix).all())
    finally:
        # Remove the file even when an assertion above fails; previously a
        # failing assertion left it behind.
        os.remove(path)
def setUp(self):
    """Derive a ``TfIdf`` model from a trained ``TfSeq`` and expose it as ``self.v``."""
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.tf import TfSeq
    from vsm.model.tfidf import TfIdf

    corpus = random_corpus(1000, 50, 0, 10, context_type='document',
                           metadata=True)

    # The tf-idf model is built from an already trained tf model.
    tf_model = TfSeq(corpus, 'document')
    tf_model.train()
    tfidf_model = TfIdf.from_tf(tf_model)
    tfidf_model.train()

    self.v = TfIdfViewer(corpus, tfidf_model)
def setUp(self): from vsm.corpus.util.corpusbuilders import random_corpus from vsm.model.beaglecontext import BeagleContextSeq, BeagleContextMulti from vsm.model.beagleenvironment import BeagleEnvironment self.ec = random_corpus(1000, 50, 0, 5, context_type='sentence') self.cc = self.ec.apply_stoplist( stoplist=[str(i) for i in range(0, 50, 7)]) self.e = BeagleEnvironment(self.ec, n_cols=5) self.e.train() self.ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix) self.ms.train() '''
def test_theta_and_phi(self):
    """Column-normalised ``word_top`` and ``top_doc`` sum to 1 per column.

    All sums are compared as float16 values.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    corpus = random_corpus(1000, 50, 0, 20, context_type='document',
                           metadata=True)
    model = LdaCgsSeq(corpus, 'document', K=10)
    model.train(n_iterations=50, verbose=0)

    # phi: word_top normalised by its column totals.
    phi = model.word_top / model.word_top.sum(0)
    phi_columns = phi.sum(axis=0).astype('float16')
    assert (phi_columns == 1.0).all()
    assert phi.sum(dtype='float16') == 10.0

    # theta: top_doc normalised by its column totals.
    theta = model.top_doc / model.top_doc.sum(0)
    theta_columns = theta.sum(axis=0).astype('float16')
    assert (theta_columns == 1.0).all()
    assert theta.sum().astype('float16') == theta.shape[1]
def test_add_metadata(self):
    """A field added with ``add_metadata`` reads back unchanged."""
    from vsm.corpus.util.corpusbuilders import random_corpus

    source = random_corpus(1000, 50, 0, 20, context_type='sentence',
                           metadata=True)
    # Generate one 'm_<index>' label per 'sentence' metadata entry.
    count = source.view_metadata('sentence').size
    expected = ['m_{0}'.format(k) for k in range(count)]

    result = add_metadata(source, 'sentence', 'new_meta', expected)
    actual = result.view_metadata('sentence')['new_meta'].tolist()
    self.assertEqual(actual, expected)
def test_LdaCgsMulti_continue_training(self):
    """Training 2 + 3 iterations must match training 5 at once with equal seeds."""
    from vsm.corpus.util.corpusbuilders import random_corpus

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)

    m0 = LdaCgsMulti(c, 'document', K=10)
    assert m0.seeds is not None
    orig_seeds = m0.seeds

    m1 = LdaCgsMulti(c, 'document', K=10, seeds=orig_seeds)
    assert m0.seeds == m1.seeds

    m0.train(n_iterations=2, verbose=0)
    m1.train(n_iterations=5, verbose=0)
    assert m0.seeds == orig_seeds
    assert m1.seeds == orig_seeds

    # After unequal amounts of training the generator states must differ.
    for state0, state1 in zip(m0._mtrand_states, m1._mtrand_states):
        assert (state0[1] != state1[1]).any()
        assert state0[2:] != state1[2:]

    # Bring m0 up to the same total of five iterations.
    m0.train(n_iterations=3, verbose=0)

    # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
    for state0, state1 in zip(m0._mtrand_states, m1._mtrand_states):
        assert state0[0] == 'MT19937'
        assert state1[0] == 'MT19937'
        assert (state0[1] == state1[1]).all()
        assert state0[2:] == state1[2:]

    assert m0.context_type == m1.context_type
    assert m0.K == m1.K
    assert (m0.alpha == m1.alpha).all()
    assert (m0.beta == m1.beta).all()
    assert m0.log_probs == m1.log_probs

    corpus_len = max(len(m0.corpus), len(m1.corpus))
    for idx in range(corpus_len):
        assert m0.corpus[idx].all() == m1.corpus[idx].all()
    assert m0.V == m1.V
    assert m0.iteration == m1.iteration

    z_len = max(len(m0.Z), len(m1.Z))
    for idx in range(z_len):
        assert m0.Z[idx].all() == m1.Z[idx].all()

    assert m0.top_doc.all() == m1.top_doc.all()
    assert m0.word_top.all() == m1.word_top.all()
    assert m0.inv_top_sums.all() == m1.inv_top_sums.all()
def test_continueTraining(self):
    """Training 2 + 3 iterations must match training 5 at once with one seed."""
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)

    m0 = LdaCgsSeq(c, 'document', K=10)
    assert m0.seed is not None
    orig_seed = m0.seed

    m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
    assert m0.seed == m1.seed

    m0.train(n_iterations=2, verbose=0)
    m1.train(n_iterations=5, verbose=0)
    assert m0.seed == orig_seed
    assert m1.seed == orig_seed

    # After unequal amounts of training the generator states must differ.
    assert (m0._mtrand_state[1] != m1._mtrand_state[1]).any()
    assert m0._mtrand_state[2:] != m1._mtrand_state[2:]

    # Bring m0 up to the same total of five iterations.
    m0.train(n_iterations=3, verbose=0)

    # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
    assert m0._mtrand_state[0] == 'MT19937'
    assert m1._mtrand_state[0] == 'MT19937'
    assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
    assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

    self.assertTrue(m0.context_type == m1.context_type)
    self.assertTrue(m0.K == m1.K)
    self.assertTrue((m0.alpha == m1.alpha).all())
    self.assertTrue((m0.beta == m1.beta).all())
    self.assertTrue(m0.log_probs == m1.log_probs)

    corpus_len = max(len(m0.corpus), len(m1.corpus))
    for idx in range(corpus_len):
        self.assertTrue(m0.corpus[idx].all() == m1.corpus[idx].all())
    self.assertTrue(m0.V == m1.V)
    self.assertTrue(m0.iteration == m1.iteration)

    z_len = max(len(m0.Z), len(m1.Z))
    for idx in range(z_len):
        self.assertTrue(m0.Z[idx].all() == m1.Z[idx].all())

    self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
    self.assertTrue(m0.word_top.all() == m1.word_top.all())
    self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())
def test_training_changes_something(self):
    """One extra training iteration must change the model's arrays.

    Two models share a seed; one is trained for one iteration and the
    other for two. ``Z``, ``word_top``, ``inv_top_sums`` and ``top_doc``
    must then not be entirely equal.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    corpus = random_corpus(1000, 50, 0, 20, context_type='document',
                           metadata=True)
    shorter = LdaCgsSeq(corpus, 'document', K=10)
    longer = LdaCgsSeq(corpus, 'document', K=10, seed=shorter.seed)

    shorter.train(n_iterations=1, verbose=0)
    longer.train(n_iterations=2, verbose=0)

    for name in ('Z', 'word_top', 'inv_top_sums', 'top_doc'):
        assert not (getattr(shorter, name) == getattr(longer, name)).all()
def test_LdaCgsSeq_IO(self):
    """Round-trip a trained ``LdaCgsSeq`` through save/load and compare.

    Parameters, corpus, assignments, count matrices, seed and RNG state of
    the reloaded model are compared with the original. A second round trip
    checks that the loaded model has no ``log_prob`` attribute.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Closing the handle now avoids leaking it and
    # lets the file be reopened and removed on platforms that lock open
    # files.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            self.assertTrue(m0.corpus[i].all() == m1.corpus[i].all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        for i in range(max(len(m0.Z), len(m1.Z))):
            self.assertTrue(m0.Z[i].all() == m1.Z[i].all())
        self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
        self.assertTrue(m0.word_top.all() == m1.word_top.all())
        self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())

        self.assertTrue(m0.seed == m1.seed)
        self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
        self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
        for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
            self.assertTrue(s1 == s2)

        # Second round trip: the loaded model must not carry 'log_prob'.
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)
        self.assertTrue(not hasattr(m1, 'log_prob'))
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best-effort cleanup. WindowsError is only defined on Windows
            # (where it is an OSError), so naming it here would raise
            # NameError elsewhere on Python 3.
            pass
def test_LdaCgsMulti_continue_training(self):
    """Split training (2 + 3) must reproduce a single 5-iteration run."""
    from vsm.corpus.util.corpusbuilders import random_corpus

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)

    m0 = LdaCgsMulti(c, 'document', K=10)
    assert m0.seeds is not None
    orig_seeds = m0.seeds

    m1 = LdaCgsMulti(c, 'document', K=10, seeds=orig_seeds)
    assert m0.seeds == m1.seeds

    m0.train(n_iterations=2, verbose=0)
    m1.train(n_iterations=5, verbose=0)
    assert m0.seeds == orig_seeds
    assert m1.seeds == orig_seeds

    # With different iteration counts the generator states must differ.
    for rng0, rng1 in zip(m0._mtrand_states, m1._mtrand_states):
        assert (rng0[1] != rng1[1]).any()
        assert rng0[2:] != rng1[2:]

    # Catch m0 up to five iterations in total.
    m0.train(n_iterations=3, verbose=0)

    # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
    for rng0, rng1 in zip(m0._mtrand_states, m1._mtrand_states):
        assert rng0[0] == 'MT19937'
        assert rng1[0] == 'MT19937'
        assert (rng0[1] == rng1[1]).all()
        assert rng0[2:] == rng1[2:]

    assert m0.context_type == m1.context_type
    assert m0.K == m1.K
    assert (m0.alpha == m1.alpha).all()
    assert (m0.beta == m1.beta).all()
    assert m0.log_probs == m1.log_probs

    n_corpus = max(len(m0.corpus), len(m1.corpus))
    for pos in range(n_corpus):
        assert m0.corpus[pos].all() == m1.corpus[pos].all()
    assert m0.V == m1.V
    assert m0.iteration == m1.iteration

    n_z = max(len(m0.Z), len(m1.Z))
    for pos in range(n_z):
        assert m0.Z[pos].all() == m1.Z[pos].all()

    assert m0.top_doc.all() == m1.top_doc.all()
    assert m0.word_top.all() == m1.word_top.all()
    assert m0.inv_top_sums.all() == m1.inv_top_sums.all()
def test_continueTraining(self):
    """Training 2 + 3 iterations must match training 5 at once with one seed.

    The generator states must differ while the iteration counts differ and
    must agree, together with all compared model attributes, once both
    models have been trained for five iterations.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.ldacgsseq import LdaCgsSeq

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)

    m0 = LdaCgsSeq(c, 'document', K=10)
    assert m0.seed is not None
    orig_seed = m0.seed

    m1 = LdaCgsSeq(c, 'document', K=10, seed=orig_seed)
    assert m0.seed == m1.seed

    m0.train(n_iterations=2, verbose=0)
    m1.train(n_iterations=5, verbose=0)
    assert m0.seed == orig_seed
    assert m1.seed == orig_seed
    assert (m0._mtrand_state[1] != m1._mtrand_state[1]).any()
    assert m0._mtrand_state[2:] != m1._mtrand_state[2:]

    m0.train(n_iterations=3, verbose=0)

    # ref:http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.get_state.html
    assert m0._mtrand_state[0] == 'MT19937'
    assert m1._mtrand_state[0] == 'MT19937'
    assert (m0._mtrand_state[1] == m1._mtrand_state[1]).all()
    assert m0._mtrand_state[2:] == m1._mtrand_state[2:]

    self.assertTrue(m0.context_type == m1.context_type)
    self.assertTrue(m0.K == m1.K)
    self.assertTrue((m0.alpha == m1.alpha).all())
    self.assertTrue((m0.beta == m1.beta).all())
    self.assertTrue(m0.log_probs == m1.log_probs)
    # ``range`` replaces the Python 2-only ``xrange`` in both loops.
    for i in range(max(len(m0.corpus), len(m1.corpus))):
        self.assertTrue(m0.corpus[i].all() == m1.corpus[i].all())
    self.assertTrue(m0.V == m1.V)
    self.assertTrue(m0.iteration == m1.iteration)
    for i in range(max(len(m0.Z), len(m1.Z))):
        self.assertTrue(m0.Z[i].all() == m1.Z[i].all())
    self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
    self.assertTrue(m0.word_top.all() == m1.word_top.all())
    self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())
def test_LdaCgsSeq_IO(self):
    """Save a trained ``LdaCgsSeq``, load it back and compare both models.

    Covers parameters, corpus, assignments, count matrices, seed and RNG
    state. A second save/load checks that the loaded model has no
    ``log_prob`` attribute.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is used below. Closing the handle here prevents a
    # leaked descriptor and allows the file to be reopened and deleted on
    # platforms that lock open files.
    tmp.close()
    try:
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)

        self.assertTrue(m0.context_type == m1.context_type)
        self.assertTrue(m0.K == m1.K)
        self.assertTrue((m0.alpha == m1.alpha).all())
        self.assertTrue((m0.beta == m1.beta).all())
        self.assertTrue(m0.log_probs == m1.log_probs)
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            self.assertTrue(m0.corpus[i].all() == m1.corpus[i].all())
        self.assertTrue(m0.V == m1.V)
        self.assertTrue(m0.iteration == m1.iteration)
        for i in range(max(len(m0.Z), len(m1.Z))):
            self.assertTrue(m0.Z[i].all() == m1.Z[i].all())
        self.assertTrue(m0.top_doc.all() == m1.top_doc.all())
        self.assertTrue(m0.word_top.all() == m1.word_top.all())
        self.assertTrue(m0.inv_top_sums.all() == m1.inv_top_sums.all())

        self.assertTrue(m0.seed == m1.seed)
        self.assertTrue(m0._mtrand_state[0] == m1._mtrand_state[0])
        self.assertTrue((m0._mtrand_state[1] == m1._mtrand_state[1]).all())
        for s1, s2 in zip(m0._mtrand_state[2:], m1._mtrand_state[2:]):
            self.assertTrue(s1 == s2)

        # Second round trip: the loaded model must not carry 'log_prob'.
        m0 = LdaCgsSeq(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsSeq.load(tmp.name)
        self.assertTrue(not hasattr(m1, 'log_prob'))
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best-effort cleanup. OSError also covers WindowsError, which
            # does not exist as a name outside Windows on Python 3.
            pass
def setUp(self):
    """Train the environment, context, order and composite BEAGLE models.

    Stores the full corpus as ``self.ec``, its stoplisted copy as
    ``self.cc`` and the trained models as ``self.e``, ``self.cm``,
    ``self.om`` and ``self.m``.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.beaglecomposite import BeagleComposite
    from vsm.model.beagleenvironment import BeagleEnvironment
    from vsm.model.beaglecontext import BeagleContextSeq
    from vsm.model.beagleorder import BeagleOrderSeq

    self.ec = random_corpus(1000, 50, 0, 20, context_type='sentence')
    # ``range`` replaces the Python 2-only ``xrange``; the stoplist is
    # still '0', '7', ..., '49'.
    self.cc = self.ec.apply_stoplist(
        stoplist=[str(i) for i in range(0, 50, 7)])

    self.e = BeagleEnvironment(self.ec, n_cols=5)
    self.e.train()

    self.cm = BeagleContextSeq(self.cc, self.ec, self.e.matrix)
    self.cm.train()

    self.om = BeagleOrderSeq(self.ec, self.e.matrix)
    self.om.train()

    self.m = BeagleComposite(self.cc, self.cm.matrix,
                             self.ec, self.om.matrix)
    self.m.train()
def test_apply_stoplist(self):
    """Check which words remain after ``apply_stoplist``.

    A random corpus has '0' and '1' removed through ``add_stop``; a small
    hand-written corpus is filtered with ``nltk_stop=True`` and ``freq=1``.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist

    c = random_corpus(1000, 50, 0, 20, context_type='sentence',
                      metadata=True)
    new_c = apply_stoplist(c, nltk_stop=False, add_stop=['0', '1'], freq=0)

    sentences = [[],
                 ['he', 'said'],
                 ['he', 'said', 'bar'],
                 ['bar', 'ate'],
                 ['I', 'foo']]
    wc = corpus_fromlist(sentences, context_type='sentence')
    new_wc = apply_stoplist(wc, nltk_stop=True, freq=1)

    # The source corpus is expected to keep the stopped words.
    for word in ('0', '1'):
        self.assertTrue(word in c.words)
    for word in ('0', '1'):
        self.assertFalse(word in new_c.words)

    for word in ('said', 'bar'):
        self.assertTrue(word in new_wc.words)
    for word in ('he', 'foo', 'ate'):
        self.assertFalse(word in new_wc.words)
def test_LdaCgsMulti_SeedTypes(self):
    """Test for issue #74: seed and RNG state types survive a save/load.

    Every seed and each of the five entries of every ``_mtrand_states``
    item must have the same type before and after the round trip.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Closing the handle now avoids leaking it and
    # keeps save/load and the final os.remove from operating on a file
    # that is still open.
    tmp.close()
    try:
        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsMulti.load(tmp.name)

        for s0, s1 in zip(m0.seeds, m1.seeds):
            assert type(s0) == type(s1)
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            for i in range(5):
                assert type(s0[i]) == type(s1[i])
    finally:
        os.remove(tmp.name)
def test_LdaCgsMulti_SeedTypes(self):
    """Test for issue #74: seed and RNG state types survive a save/load.

    Compares the type of every seed and of each of the five entries of
    every ``_mtrand_states`` item between the saved and reloaded models.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is used below. Closing the handle here prevents a
    # leaked descriptor and keeps save/load and os.remove from touching a
    # file that is still open.
    tmp.close()
    try:
        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsMulti.load(tmp.name)

        for s0, s1 in zip(m0.seeds, m1.seeds):
            assert type(s0) == type(s1)
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            for i in range(5):
                assert type(s0[i]) == type(s1[i])
    finally:
        os.remove(tmp.name)
def test_LdaCgsMulti_IO(self):
    """Round-trip a trained ``LdaCgsMulti`` through save/load and compare.

    Parameters, corpus, assignments, count matrices, seeds and RNG states
    of the reloaded model are compared with the original. A second round
    trip checks that the loaded model has no ``log_prob`` attribute.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is needed. Closing the handle now avoids leaking it and
    # keeps save/load and the final os.remove from operating on a file
    # that is still open.
    tmp.close()
    try:
        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsMulti.load(tmp.name)

        assert m0.context_type == m1.context_type
        assert m0.K == m1.K
        assert (m0.alpha == m1.alpha).all()
        assert (m0.beta == m1.beta).all()
        assert m0.log_probs == m1.log_probs
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            assert m0.corpus[i].all() == m1.corpus[i].all()
        assert m0.V == m1.V
        assert m0.iteration == m1.iteration
        for i in range(max(len(m0.Z), len(m1.Z))):
            assert m0.Z[i].all() == m1.Z[i].all()
        assert m0.top_doc.all() == m1.top_doc.all()
        assert m0.word_top.all() == m1.word_top.all()
        assert m0.inv_top_sums.all() == m1.inv_top_sums.all()

        assert m0.seeds == m1.seeds
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            assert s0[0] == s1[0]
            assert (s0[1] == s1[1]).all()
            assert s0[2:] == s1[2:]

        # Second round trip: the loaded model must not carry 'log_prob'.
        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsMulti.load(tmp.name)
        assert not hasattr(m1, 'log_prob')
    finally:
        os.remove(tmp.name)
def test_LdaCgsMulti_IO(self):
    """Save a trained ``LdaCgsMulti``, load it back and compare both models.

    Covers parameters, corpus, assignments, count matrices, seeds and RNG
    states. A second save/load checks that the loaded model has no
    ``log_prob`` attribute.
    """
    from tempfile import NamedTemporaryFile
    import os

    c = random_corpus(1000, 50, 6, 100)
    tmp = NamedTemporaryFile(delete=False, suffix='.npz')
    # Only the path is used below. Closing the handle here prevents a
    # leaked descriptor and keeps save/load and os.remove from touching a
    # file that is still open.
    tmp.close()
    try:
        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsMulti.load(tmp.name)

        assert m0.context_type == m1.context_type
        assert m0.K == m1.K
        assert (m0.alpha == m1.alpha).all()
        assert (m0.beta == m1.beta).all()
        assert m0.log_probs == m1.log_probs
        for i in range(max(len(m0.corpus), len(m1.corpus))):
            assert m0.corpus[i].all() == m1.corpus[i].all()
        assert m0.V == m1.V
        assert m0.iteration == m1.iteration
        for i in range(max(len(m0.Z), len(m1.Z))):
            assert m0.Z[i].all() == m1.Z[i].all()
        assert m0.top_doc.all() == m1.top_doc.all()
        assert m0.word_top.all() == m1.word_top.all()
        assert m0.inv_top_sums.all() == m1.inv_top_sums.all()

        assert m0.seeds == m1.seeds
        for s0, s1 in zip(m0._mtrand_states, m1._mtrand_states):
            assert s0[0] == s1[0]
            assert (s0[1] == s1[1]).all()
            assert s0[2:] == s1[2:]

        # Second round trip: the loaded model must not carry 'log_prob'.
        m0 = LdaCgsMulti(c, 'document', K=10)
        m0.train(n_iterations=20)
        m0.save(tmp.name)
        m1 = LdaCgsMulti.load(tmp.name)
        assert not hasattr(m1, 'log_prob')
    finally:
        os.remove(tmp.name)
def setUp(self):
    """Train the environment, context, order and composite BEAGLE models.

    Stores the full corpus as ``self.ec``, its stoplisted copy as
    ``self.cc`` and the trained models as ``self.e``, ``self.cm``,
    ``self.om`` and ``self.m``.
    """
    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.beaglecomposite import BeagleComposite
    from vsm.model.beagleenvironment import BeagleEnvironment
    from vsm.model.beaglecontext import BeagleContextSeq
    from vsm.model.beagleorder import BeagleOrderSeq

    self.ec = random_corpus(1000, 50, 0, 20, context_type='sentence')
    # Labels '0', '7', ..., '49' are stoplisted.
    stop_labels = [str(label) for label in range(0, 50, 7)]
    self.cc = self.ec.apply_stoplist(stoplist=stop_labels)

    environment = BeagleEnvironment(self.ec, n_cols=5)
    self.e = environment
    environment.train()

    context_model = BeagleContextSeq(self.cc, self.ec, environment.matrix)
    self.cm = context_model
    context_model.train()

    order_model = BeagleOrderSeq(self.ec, environment.matrix)
    self.om = order_model
    order_model.train()

    composite = BeagleComposite(self.cc, context_model.matrix,
                                self.ec, order_model.matrix)
    self.m = composite
    composite.train()
def setUp(self):
    """Create a random corpus (``self.c``) and a ``BaseModel`` on it (``self.m``)."""
    corpus = random_corpus(1000, 50, 6, 100)
    self.c = corpus
    self.m = BaseModel(corpus, 'context')