def test_parallel(self):
    """Test doc2vec parallel training.

    Trains with 2 and then 4 worker threads over a repeated Lee corpus and
    runs the shared sanity checks on each resulting model.
    """
    # Skip when only the plain-NumPy implementation is available: it is too
    # slow for a parallelism test (same guard as the other tests in this file).
    if doc2vec.FAST_VERSION < 0:
        return

    corpus = utils.RepeatCorpus(DocsLeeCorpus(), 10000)

    for workers in [2, 4]:
        model = doc2vec.Doc2Vec(corpus, workers=workers)
        self.model_sanity(model)
Example #2
0
    def test_parallel(self):
        """Test doc2vec parallel training with more than the default 3 threads.

        Uses 6 workers and a small batch size to maximize thread contention,
        then runs the shared model sanity checks.
        """
        # don't test the plain NumPy version for parallelism (too slow) —
        # same guard as the other parallel tests in this file
        if doc2vec.FAST_VERSION < 0:
            return

        # repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words)
        corpus = utils.RepeatCorpus(DocsLeeCorpus(), 6000)

        # use smaller batches-to-workers for more contention
        model = doc2vec.Doc2Vec(corpus, workers=6, batch_words=5000)
        self.model_sanity(model)
Example #3
0
    def testParallel(self):
        """Test word2vec parallel training.

        Trains with 2 and then 4 worker threads and checks that a known
        near-neighbour of 'israeli' survives the thread-order nondeterminism.
        """
        if word2vec.FAST_VERSION < 0:  # don't test the plain NumPy version for parallelism (too slow)
            return

        corpus = utils.RepeatCorpus(LeeCorpus(), 10000)

        for workers in [2, 4]:
            model = word2vec.Word2Vec(corpus, workers=workers)
            sims = model.most_similar('israeli')
            # the exact vectors and therefore similarities may differ, due to
            # different thread collisions/randomization, so test only the top-3
            # (previously `sims` was computed but never asserted on)
            self.assertTrue('palestinian' in [sims[i][0] for i in range(3)])
Example #4
0
    def test_parallel(self):
        """Doc2vec training should stay sane when split across worker threads."""
        # Only the optimized (compiled) code path is fast enough to exercise
        # real thread contention; bail out on the pure-NumPy fallback.
        if doc2vec.FAST_VERSION < 0:
            return

        corpus = utils.RepeatCorpus(DocsLeeCorpus(), 10000)

        for thread_count in [2, 4]:
            trained = doc2vec.Doc2Vec(corpus, workers=thread_count)
            self.model_sanity(trained)
def prepare_model(subject):
    """Train and persist LDA models (bag-of-words and tf-idf variants) for *subject*.

    Reads the module-level ``texts`` and ``documents``, writes all artefacts
    (dictionary, corpora, models, similarity indexes) under ``models/LDA/<subject>/``.
    """
    print('Preparing model')

    # Build the vocabulary and bag-of-words corpus, persisting both to disk.
    vocab = corpora.Dictionary(texts)
    vocab.save('models/LDA/%s/dictionary.dict' % (subject, ))  # store the dictionary, for future reference
    bow_corpus = [vocab.doc2bow(doc) for doc in texts]
    corpora.MmCorpus.serialize('models/LDA/%s/corpus.mm' % (subject, ), bow_corpus)  # store to disk, for later use

    # Round-trip through disk so training streams from the serialized files.
    id2word = corpora.Dictionary.load('models/LDA/%s/dictionary.dict' % (subject, ))
    mm = corpora.MmCorpus("models/LDA/%s/corpus.mm" % (subject, ))

    # Layer tf-idf weighting on top of the bag-of-words corpus and persist it too.
    tfidf = models.TfidfModel(mm)  # step 1 -- initialize a model
    tfidf.save('models/LDA/%s/tfidf_model' % (subject, ))
    weighted_corpus = tfidf[mm]
    corpora.MmCorpus.serialize('models/LDA/%s/corpus_tfidf.mm' % (subject, ), weighted_corpus)  # store to disk, for later use

    # One LDA model over the tf-idf corpus ...
    lda_tfidf = models.LdaModel(
        corpus=utils.RepeatCorpus(weighted_corpus, 10000),
        id2word=id2word,
        num_topics=len(documents),
        update_every=1,
        chunksize=1000,
        passes=2,
        iterations=1000,
    )
    print('Model Prepared for tfidf')

    # ... and a second one over the raw bag-of-words corpus.
    lda = models.LdaModel(
        corpus=utils.RepeatCorpus(mm, 10000),
        id2word=id2word,
        update_every=1,
        num_topics=len(documents),
        chunksize=1000,
        passes=2,
        iterations=1000,
    )
    print('Model prepared for bow')

    lda.save('models/LDA/%s/lda.model' % (subject, ))
    lda_tfidf.save('models/LDA/%s/lda_tfidf.model' % (subject, ))
    print('Prepare and save index similarities for Bow and Tfidf')

    # Reload the tf-idf corpus from disk, then build and save a similarity
    # index for each model's topic-space projection of its corpus.
    reloaded_tfidf = corpora.MmCorpus("models/LDA/%s/corpus_tfidf.mm" % (subject, ))
    bow_index = similarities.MatrixSimilarity(lda[mm])
    tfidf_index = similarities.MatrixSimilarity(lda_tfidf[reloaded_tfidf], num_features=reloaded_tfidf.num_terms)
    bow_index.save("models/LDA/%s/simIndex.index" % (subject, ))
    tfidf_index.save("models/LDA/%s/simIndex_tfidf.index" % (subject, ))
Example #6
0
    def testParallel(self):
        """Word2vec training should remain sane when spread across threads."""
        # The pure-NumPy code path is far too slow to test parallelism with.
        if word2vec.FAST_VERSION < 0:
            return

        sentences = utils.RepeatCorpus(LeeCorpus(), 10000)

        for num_threads in [2, 4]:
            trained = word2vec.Word2Vec(sentences, workers=num_threads)
            neighbours = trained.most_similar('israeli')
            # Thread collisions/randomization perturb the exact vectors and
            # similarity scores, so only require the expected neighbour to
            # appear somewhere in the top three.
            top_three = [neighbours[rank][0] for rank in range(3)]
            self.assertTrue('palestinian' in top_three)
    # NOTE(review): the enclosing `def` lies outside this chunk; `texts` and
    # `subject` presumably come from that scope — confirm against the full file.
    # Builds the LSA dictionary and bag-of-words corpus and persists both.
    dictionary = corpora.Dictionary(texts)
    dictionary.save('models/LSA/%s/dictionary.dict' %subject)  # store the dictionary, for future reference
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('models/LSA/%s/corpus.mm' %subject, corpus)  # store to disk, for later use

# Reload the dictionary and bag-of-words corpus from the files written above.
id2word = corpora.Dictionary.load('models/LSA/%s/dictionary.dict' %subject)
mm = corpora.MmCorpus("models/LSA/%s/corpus.mm" %subject)

# Tf-idf weighting over the bag-of-words corpus.
tfidf = models.TfidfModel(mm, normalize=True)

# creates a lazy evaluating wrapper around corpus
corpus_tfidf = tfidf[mm]

# Train only when TRAIN is set (defined elsewhere in this file); otherwise a
# previously saved model is reused via the load below.
if TRAIN:
    # NOTE(review): onepass=False with extra_samples looks like the multi-pass
    # stochastic LSI variant — confirm against the gensim LsiModel docs.
    lsi_tfidf = models.LsiModel(utils.RepeatCorpus(corpus_tfidf, 10000), 
                                id2word=id2word, 
                                onepass=False,
                                extra_samples=300,
                                num_topics=len(documents))
    lsi_tfidf.save('models/LSA/%s/lsi.model' %subject)
    print('Done training')

# Always reload from disk so both branches (fresh training or reuse) leave
# `lsi_tfidf` in the same state.
lsi_tfidf = models.LsiModel.load('models/LSA/%s/lsi.model' %subject)
corpus_lsi_tfidf = lsi_tfidf[corpus_tfidf]
# Similarity index over the LSI-projected corpus, persisted for query time.
index = similarities.MatrixSimilarity(corpus_lsi_tfidf, num_features=lsi_tfidf.num_topics)
index.save('models/LSA/%s/lsi.index' %subject)

'''
queries = ["How would you arrange 1000 numbers such that each number is smaller than the one to its right?",