def test_constructor_with_file_wikicorpus(self):
    """Build an EsaModel from the wiki test dump and round-trip it via disk.

    Loads the tf-idf corpus, the dictionary and the article titles from the
    test dump directory, trains an EsaModel with 15 clusters, prints it,
    saves it, loads it back and prints the reloaded model.

    The test makes no assertions; it passes as long as no step raises.
    """
    # Input: tf-idf corpus of the test dump.
    corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

    # The LDA corpus is intentionally not loaded here:
    # lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

    # Input: dictionary, used below only for its length.
    dictionary = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

    # Input: article titles.
    titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

    # Train the ESA model; the feature count is the dictionary's length.
    feature_count = len(dictionary)
    trained = EsaModel(
        corpus,
        num_clusters=15,
        document_titles=titles,
        num_features=feature_count
    )
    # Parenthesised single-argument print behaves the same as the
    # Python 2 print statement.
    print(trained)

    # Save the model, load it back from the same path and print the copy.
    model_path = '/media/sdc1/test_dump/result/wiki_esa.model'
    trained.save(model_path)
    reloaded = EsaModel.load(model_path)
    print(reloaded)
"""LDA Model creation"""
# Train the LDA model on the tf-idf corpus with NUM_TOPICS topics.
lda = models.LdaModel(
    corpus=mm_tfidf,
    id2word=id2token,
    num_topics=NUM_TOPICS,
    update_every=1,
    chunksize=10000,
    passes=2
)

# Persist the trained model under the configured output prefix.
lda.save(options.prefix + '_lda.model')

# Write the tf-idf corpus, transformed by the LDA model, in Matrix Market
# format, then open a reader on the file that was just written.
lda_corpus_path = options.prefix + '_lda_corpus.mm'
corpora.MmCorpus.serialize(lda_corpus_path, lda[mm_tfidf], progress_cnt=10000)
mm_lda = corpora.MmCorpus(lda_corpus_path)

"""ESA Model creation"""
# Article titles are read from the prefixed titles file.
article_titles = DocumentTitles.load(options.prefix + "_articles.txt")

# Build the ESA model on the LDA corpus. num_features is NUM_TOPICS, the
# same value passed as num_topics to the LDA model above.
esa = EsaModel(
    mm_lda,
    num_clusters=10000,
    document_titles=article_titles,
    num_features=NUM_TOPICS
)
esa.save(options.prefix + "_esa_on_lda.model")

logger.info("finished transforming")