Esempio n. 1
0
 def testPersistence(self):
     """Check that an RpModel saved to disk and loaded back matches the original."""
     original = rpmodel.RpModel(self.corpus, num_topics=2)
     original.save(testfile())
     restored = rpmodel.RpModel.load(testfile())

     # the restored model must keep both its dimensionality and its projection
     self.assertEqual(original.num_topics, restored.num_topics)
     same_projection = numpy.allclose(original.projection, restored.projection)
     self.assertTrue(same_projection)

     # projecting an empty vector must give the same result from both models
     empty_doc = []
     same_output = numpy.allclose(original[empty_doc], restored[empty_doc])
     self.assertTrue(same_output)
Esempio n. 2
0
 def testPersistenceCompressed(self):
     """Check the save/load round trip when the target file name ends in '.gz'."""
     gz_path = testfile() + '.gz'
     original = rpmodel.RpModel(self.corpus, num_topics=2)
     original.save(gz_path)
     restored = rpmodel.RpModel.load(gz_path, mmap=None)

     # the restored model must keep both its dimensionality and its projection
     self.assertEqual(original.num_topics, restored.num_topics)
     same_projection = numpy.allclose(original.projection, restored.projection)
     self.assertTrue(same_projection)

     # projecting an empty vector must give the same result from both models
     empty_doc = []
     same_output = numpy.allclose(original[empty_doc], restored[empty_doc])
     self.assertTrue(same_output)
Esempio n. 3
0
 def test_persistence_compressed(self):
     """Check the save/load round trip using a '.gz' temporary file."""
     gz_path = get_tmpfile('gensim_models.tst.gz')
     original = rpmodel.RpModel(self.corpus, num_topics=2)
     original.save(gz_path)
     restored = rpmodel.RpModel.load(gz_path, mmap=None)

     # the restored model must keep both its dimensionality and its projection
     self.assertEqual(original.num_topics, restored.num_topics)
     same_projection = np.allclose(original.projection, restored.projection)
     self.assertTrue(same_projection)

     # projecting an empty vector must give the same result from both models
     empty_doc = []
     same_output = np.allclose(original[empty_doc], restored[empty_doc])
     self.assertTrue(same_output)
Esempio n. 4
0
    def testTransform(self):
        """Project the first corpus document and compare with the known result."""
        # HACK: pin the seed before building the model so the random matrix is
        # reproducible and can be checked against hard-coded values
        numpy.random.seed(13)
        rp = rpmodel.RpModel(self.corpus, num_topics=2)

        # run the first document of the corpus through the model
        documents = list(self.corpus)
        first_doc = documents[0]
        projected = rp[first_doc]

        # a dense vector is easier to compare than the transformed output
        dense = matutils.sparse2full(projected, 2)

        # values expected for the fixed seed above; compared directly
        reference = numpy.array([-0.70710677, 0.70710677])
        self.assertTrue(numpy.allclose(dense, reference))
    # NOTE(review): the enclosing function header is not visible in this fragment;
    # `method`, `program`, `config` and the DIM_* constants are defined elsewhere.

    # load the word-id mapping from the 'wordids.txt' result file
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))
    
    # read the corpus from the 'bow.mm' result file
    corpus = MmCorpus(config.resultFile('bow.mm'))

    # build the model selected by `method` and save it to its own result file
    if method == 'tfidf':
        model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        model.save(config.resultFile('model_tfidf.pkl'))
    elif method == 'lda':
        model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
        model.save(config.resultFile('model_lda.pkl'))
    elif method == 'lsi':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI)
        model.save(config.resultFile('model_lsi.pkl'))
    elif method == 'rp':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP)
        model.save(config.resultFile('model_rp.pkl'))
    else:
        # any other value of `method` is rejected
        raise ValueError('unknown topic extraction method: %s' % repr(method))
    
    # apply the model to the corpus and store the result as 'corpus_<method>.mm'
    # NOTE(review): for 'lsi' and 'rp' the model was built on tfidf[corpus], but the
    # raw `corpus` is what gets transformed here -- confirm this is intended
    MmCorpus.saveCorpus(config.resultFile('corpus_%s.mm' % method), model[corpus])
            
    logging.info("finished running %s" % program)

Esempio n. 6
0
 def testPersistence(self):
     """Save an RpModel, load it back and compare numTopics and projection."""
     model = rpmodel.RpModel(self.corpus, numTopics=2)
     # save and load both go through the path returned by testfile()
     model.save(testfile())
     model2 = rpmodel.RpModel.load(testfile())
     # the loaded model must report the same numTopics and a numerically equal projection
     self.assertEqual(model.numTopics, model2.numTopics)
     self.assertTrue(numpy.allclose(model.projection, model2.projection))