def test_constructor_with_file_wikicorpus(self):
    """Train an ESA model on the dumped tf-idf corpus and save/load it.

    Reads the tf-idf corpus, the dictionary and the article titles from
    the fixed ``/media/sdc1/test_dump/result`` location, builds an
    ``EsaModel`` with 15 clusters, prints it, saves it, loads it back and
    prints the loaded model. The test makes no assertions; it only fails
    if one of these steps raises.
    """
    # load tf-idf corpus
    tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

    # load lda corpus (currently disabled)
    # lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

    # load dictionary; only its size is used, as the number of features
    id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

    # load article titles
    document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

    # train esa model
    esa_model = EsaModel(tfidf_corpus,
                         num_clusters=15,
                         document_titles=document_titles,
                         num_features=len(id2token))

    # print(x) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(esa_model)

    # round-trip the trained model through save/load
    esa_model.save('/media/sdc1/test_dump/result/wiki_esa.model')
    tmp_esa = EsaModel.load('/media/sdc1/test_dump/result/wiki_esa.model')
    print(tmp_esa)
def __init__(self, prefix):
    """
    Restore the dictionary and the tf-idf, LDA and ESA models from disk.

    prefix is the common path prefix of the saved files. The suffixes
    "_wordids.dict", "_tfidf.model", "_lda.model" and "_esa_on_lda.model"
    are appended to it to form the individual file names.
    """
    logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s" % prefix)

    # token <-> id mapping
    wordids_path = prefix + "_wordids.dict"
    self.dictionary = corpora.Dictionary.load(wordids_path)

    # tf-idf model
    tfidf_path = prefix + "_tfidf.model"
    self.tfidf_model = models.TfidfModel.load(tfidf_path)

    # lda model
    lda_path = prefix + "_lda.model"
    self.lda_model = models.LdaModel.load(lda_path)

    # esa model (the one stored under the "_esa_on_lda" suffix)
    esa_path = prefix + "_esa_on_lda.model"
    self.esa_model = EsaModel.load(esa_path)
"""LDA Model creation""" #build lda model lda = models.LdaModel(corpus=mm_tfidf, id2word=id2token, num_topics=NUM_TOPICS, update_every=1, chunksize=10000, passes=2) #save trained model lda.save(options.prefix + '_lda.model') #save corpus as lda vectors in matrix market format corpora.MmCorpus.serialize(options.prefix + '_lda_corpus.mm', lda[mm_tfidf], progress_cnt=10000) #init lda-corpus reader mm_lda = corpora.MmCorpus(options.prefix + '_lda_corpus.mm') """ESA Model creation""" #document titles article_titles = DocumentTitles.load(options.prefix + "_articles.txt") #build esa model esa = EsaModel(mm_lda, num_clusters=10000, document_titles=article_titles, num_features=NUM_TOPICS) esa.save(options.prefix + "_esa_on_lda.model") logger.info("finished transforming")
# Read the whole input document into one string (lines joined by a space).
logger.info("Load text file %s", options.text)
try:
    # Named `text_file` because `file` would shadow the Python 2 builtin.
    with open(options.text, "r") as text_file:
        doc = " ".join(text_file.readlines())
except Exception:
    # logger.exception logs at ERROR level and appends the traceback, so the
    # actual cause of the failure is no longer silently discarded.
    logger.exception("Could not load document from %s", options.text)
    sys.exit(1)

# load dictionary, tfidf model, lda model, esa model
logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s",
            options.prefix)
dictionary = corpora.Dictionary.load(options.prefix + "_wordids.dict")
tfidf_model = models.TfidfModel.load(options.prefix + "_tfidf.model")
lda_model = models.LdaModel.load(options.prefix + "_lda.model")
esa_model = EsaModel.load(options.prefix + "_esa_on_lda.model")

# create list of tokens from doc
logger.info("Lemmatize document.")
tokens = utils.lemmatize(doc)

# create bow of doc from token list
logger.info("Create bag-of-words representation from document.")
doc_bow = dictionary.doc2bow(tokens)

# create tfidf representation from bag-of-words
logger.info("Transform to tfidf.")
doc_tfidf = tfidf_model[doc_bow]

# create lda representation from tfidf
logger.info("Transform to lda")
MM_BOW, id2word=CORPUS.dictionary, normalize=True) TF_IDF.save(TF_IDF_PATH) else: TF_IDF = models.TfidfModel.load(TF_IDF_PATH) TF_IDF_CORPUS_PATH = os.path.join( OPTIONS.prefix, language + "_tfidf_corpus.mm") if not os.path.exists(TF_IDF_CORPUS_PATH): corpora.MmCorpus.serialize( TF_IDF_CORPUS_PATH, TF_IDF[MM_BOW], progress_cnt=10000) MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH) LOGGER.info("Finished %s-TF-IDF Model Generation", language) ESA_PATH = os.path.join( OPTIONS.prefix, language + "_esa_on_tfidf.model") if not os.path.exists(ESA_PATH): ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH) ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES) ESA.save(ESA_PATH) LOGGER.info("Finished %s-ESA Model Generation", language) if language == 'en': SMALL_EN_ESA_PATH = os.path.join( OPTIONS.prefix, "small_en_esa_on_tfidf.model") if not os.path.exists(SMALL_EN_ESA_PATH): ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES, num_concepts=NUM_TOPICS) ESA.save(SMALL_EN_ESA_PATH) LOGGER.info("Finished small en-ESA Model Generation") LOGGER.info("Finished ALL Transforming Activity")