def test_parallel(self): """Test doc2vec parallel training.""" corpus = utils.RepeatCorpus(DocsLeeCorpus(), 10000) for workers in [2, 4]: model = doc2vec.Doc2Vec(corpus, workers=workers) self.model_sanity(model)
def test_parallel(self): """Test doc2vec parallel training with more than default 3 threads.""" # repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words) corpus = utils.RepeatCorpus(DocsLeeCorpus(), 6000) # use smaller batches-to-workers for more contention model = doc2vec.Doc2Vec(corpus, workers=6, batch_words=5000) self.model_sanity(model)
def testParallel(self): """Test word2vec parallel training.""" if word2vec.FAST_VERSION < 0: # don't test the plain NumPy version for parallelism (too slow) return corpus = utils.RepeatCorpus(LeeCorpus(), 10000) for workers in [2, 4]: model = word2vec.Word2Vec(corpus, workers=workers) sims = model.most_similar('israeli')
def test_parallel(self): """Test doc2vec parallel training.""" if doc2vec.FAST_VERSION < 0: # don't test the plain NumPy version for parallelism (too slow) return corpus = utils.RepeatCorpus(DocsLeeCorpus(), 10000) for workers in [2, 4]: model = doc2vec.Doc2Vec(corpus, workers=workers) self.model_sanity(model)
def prepare_model(subject):
    print('Preparing model')
    dictionary = corpora.Dictionary(texts)
    dictionary.save('models/LDA/%s/dictionary.dict' % (subject, ))  # store the dictionary, for future reference
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('models/LDA/%s/corpus.mm' % (subject, ), corpus)  # store to disk, for later use
    id2word = corpora.Dictionary.load('models/LDA/%s/dictionary.dict' % (subject, ))
    mm = corpora.MmCorpus("models/LDA/%s/corpus.mm" % (subject, ))

    tfidf = models.TfidfModel(mm)  # step 1 -- initialize a model
    tfidf.save('models/LDA/%s/tfidf_model' % (subject, ))
    corpus_tfidf = tfidf[mm]
    corpora.MmCorpus.serialize('models/LDA/%s/corpus_tfidf.mm' % (subject, ), corpus_tfidf)  # store to disk, for later use

    lda_tfidf = models.LdaModel(
        corpus=utils.RepeatCorpus(corpus_tfidf, 10000), id2word=id2word, num_topics=len(documents),
        update_every=1, chunksize=1000, passes=2, iterations=1000)
    print('Model prepared for tfidf')

    lda = models.LdaModel(
        corpus=utils.RepeatCorpus(mm, 10000), id2word=id2word, update_every=1, num_topics=len(documents),
        chunksize=1000, passes=2, iterations=1000)
    print('Model prepared for bow')

    lda.save('models/LDA/%s/lda.model' % (subject, ))
    lda_tfidf.save('models/LDA/%s/lda_tfidf.model' % (subject, ))

    print('Prepare and save index similarities for Bow and Tfidf')
    corpus_tfidf = corpora.MmCorpus("models/LDA/%s/corpus_tfidf.mm" % (subject, ))
    index = similarities.MatrixSimilarity(lda[mm])
    index_tfidf = similarities.MatrixSimilarity(lda_tfidf[corpus_tfidf], num_features=corpus_tfidf.num_terms)
    index.save("models/LDA/%s/simIndex.index" % (subject, ))
    index_tfidf.save("models/LDA/%s/simIndex_tfidf.index" % (subject, ))
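As a usage sketch (not part of the original snippet): once prepare_model has run, the saved artifacts can be loaded back and queried. The subject value and query string below are made-up examples, and the paths simply mirror the ones above.

from gensim import corpora, models, similarities

subject = 'algorithms'  # hypothetical example value
dictionary = corpora.Dictionary.load('models/LDA/%s/dictionary.dict' % (subject, ))
lda = models.LdaModel.load('models/LDA/%s/lda.model' % (subject, ))
index = similarities.MatrixSimilarity.load('models/LDA/%s/simIndex.index' % (subject, ))

query_bow = dictionary.doc2bow('sort a list of numbers'.lower().split())
sims = index[lda[query_bow]]  # similarity of the query against every training document
print(sorted(enumerate(sims), key=lambda item: -item[1])[:5])  # top-5 most similar docs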
def testParallel(self): """Test word2vec parallel training.""" if word2vec.FAST_VERSION < 0: # don't test the plain NumPy version for parallelism (too slow) return corpus = utils.RepeatCorpus(LeeCorpus(), 10000) for workers in [2, 4]: model = word2vec.Word2Vec(corpus, workers=workers) sims = model.most_similar('israeli') # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization # so let's test only for top3 self.assertTrue('palestinian' in [sims[i][0] for i in range(3)])
dictionary = corpora.Dictionary(texts)
dictionary.save('models/LSA/%s/dictionary.dict' % subject)  # store the dictionary, for future reference
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('models/LSA/%s/corpus.mm' % subject, corpus)  # store to disk, for later use
id2word = corpora.Dictionary.load('models/LSA/%s/dictionary.dict' % subject)
mm = corpora.MmCorpus("models/LSA/%s/corpus.mm" % subject)

tfidf = models.TfidfModel(mm, normalize=True)  # creates a lazy evaluating wrapper around corpus
corpus_tfidf = tfidf[mm]

if TRAIN:
    lsi_tfidf = models.LsiModel(
        utils.RepeatCorpus(corpus_tfidf, 10000), id2word=id2word, onepass=False,
        extra_samples=300, num_topics=len(documents))
    lsi_tfidf.save('models/LSA/%s/lsi.model' % subject)
    print('Done training')

lsi_tfidf = models.LsiModel.load('models/LSA/%s/lsi.model' % subject)
corpus_lsi_tfidf = lsi_tfidf[corpus_tfidf]
index = similarities.MatrixSimilarity(corpus_lsi_tfidf, num_features=lsi_tfidf.num_topics)
index.save('models/LSA/%s/lsi.index' % subject)

'''
queries = ["How would you arrange 1000 numbers such that each number is smaller than the one to its right?",