def test_constructor_with_file_wikicorpus(self):
    """Build an ESA model from the dumped wiki TF-IDF corpus and reload it.

    Reads the TF-IDF corpus, the word-id dictionary and the article titles
    from fixed paths under ``/media/sdc1/test_dump/result/``, constructs an
    ``EsaModel`` with 15 clusters, prints it, saves it, loads the saved file
    again and prints the loaded instance.

    The method makes no assertions; the two printed objects are its only
    visible output.
    """
    # Input files.
    tfidf_path = '/media/sdc1/test_dump/result/test_tfidf_corpus.mm'
    dictionary_path = "/media/sdc1/test_dump/result/test_wordids.dict"
    titles_path = "/media/sdc1/test_dump/result/test_articles.txt"
    # File the trained model is written to and then read back from.
    model_path = '/media/sdc1/test_dump/result/wiki_esa.model'

    tfidf_corpus = MmCorpus(tfidf_path)
    # Loading of the LDA corpus is currently disabled:
    # lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')
    id2token = Dictionary.load(dictionary_path)
    document_titles = DocumentTitles.load(titles_path)

    # The feature count passed to the model is the dictionary length.
    vocabulary_size = len(id2token)
    trained_model = EsaModel(
        tfidf_corpus,
        num_clusters=15,
        document_titles=document_titles,
        num_features=vocabulary_size)
    # Single-argument parenthesised print: same output as `print x` in Python 2.
    print(trained_model)

    # Save, then load the same file again and show the loaded model.
    trained_model.save(model_path)
    reloaded_model = EsaModel.load(model_path)
    print(reloaded_model)
"""LDA Model creation""" #build lda model lda = models.LdaModel(corpus=mm_tfidf, id2word=id2token, num_topics=NUM_TOPICS, update_every=1, chunksize=10000, passes=2) #save trained model lda.save(options.prefix + '_lda.model') #save corpus as lda vectors in matrix market format corpora.MmCorpus.serialize(options.prefix + '_lda_corpus.mm', lda[mm_tfidf], progress_cnt=10000) #init lda-corpus reader mm_lda = corpora.MmCorpus(options.prefix + '_lda_corpus.mm') """ESA Model creation""" #document titles article_titles = DocumentTitles.load(options.prefix + "_articles.txt") #build esa model esa = EsaModel(mm_lda, num_clusters=10000, document_titles=article_titles, num_features=NUM_TOPICS) esa.save(options.prefix + "_esa_on_lda.model") logger.info("finished transforming")
MM_BOW, id2word=CORPUS.dictionary, normalize=True) TF_IDF.save(TF_IDF_PATH) else: TF_IDF = models.TfidfModel.load(TF_IDF_PATH) TF_IDF_CORPUS_PATH = os.path.join( OPTIONS.prefix, language + "_tfidf_corpus.mm") if not os.path.exists(TF_IDF_CORPUS_PATH): corpora.MmCorpus.serialize( TF_IDF_CORPUS_PATH, TF_IDF[MM_BOW], progress_cnt=10000) MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH) LOGGER.info("Finished %s-TF-IDF Model Generation", language) ESA_PATH = os.path.join( OPTIONS.prefix, language + "_esa_on_tfidf.model") if not os.path.exists(ESA_PATH): ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH) ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES) ESA.save(ESA_PATH) LOGGER.info("Finished %s-ESA Model Generation", language) if language == 'en': SMALL_EN_ESA_PATH = os.path.join( OPTIONS.prefix, "small_en_esa_on_tfidf.model") if not os.path.exists(SMALL_EN_ESA_PATH): ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES, num_concepts=NUM_TOPICS) ESA.save(SMALL_EN_ESA_PATH) LOGGER.info("Finished small en-ESA Model Generation") LOGGER.info("Finished ALL Transforming Activity")