def __init__(self, prefix):
    """Initialise the extractor and load its persisted models.

    ``prefix`` is concatenated verbatim with each file-name suffix used
    below, so it has to carry any directory part and base name itself.

    Sets ``self.dictionary``, ``self.tfidf_model`` and ``self.esa_model``.
    """
    Extractor.__init__(self)

    # NOTE(review): the message still mentions the lda model although its
    # load is disabled below -- confirm whether the wording should change.
    message = "Load dictionary, tfidf model, lda model and esa model with prefix %s" % prefix
    logger.info(message)

    dictionary_path = prefix + "_wordids.dict"
    self.dictionary = corpora.Dictionary.load(dictionary_path)

    tfidf_path = prefix + "_tfidf.model"
    self.tfidf_model = models.TfidfModel.load(tfidf_path)

    # LDA loading is currently disabled:
    # self.lda_model = models.LdaModel.load(prefix + "_lda.model")

    esa_path = prefix + "_esa1000_on_tfidf.model"
    self.esa_model = EsaModel.load(esa_path)
def load_esa_models(self):
    """Load the enabled ESA models into ``self.esas``.

    ``self.esas`` becomes a dict keyed by a short language string. Only the
    'es' entry is active; the other entries are kept below as disabled
    alternatives. Each file name is ``self.prefix`` joined directly with the
    suffix, without a path separator.
    """
    logger.info("Loading ESA models")

    spanish_model_path = self.prefix + "es_esa_on_tfidf.model"

    loaded_models = {}
    loaded_models['es'] = EsaModel.load(spanish_model_path)

    # Disabled entries, preserved for reference:
    # 'en': EsaModel.load(self.prefix + self.target_prefix + "en_esa_on_tfidf.model")
    # 'de': EsaModel.load(self.prefix + "de_esa_on_tfidf.model")
    # 'nl': EsaModel.load(self.prefix + "nl_esa_on_tfidf.model")

    # Assign only after loading succeeded, as in a single dict-literal assignment.
    self.esas = loaded_models
def test_constructor_with_big_file_wikicorpus(self):
    """Train an ESA model on the wiki tf-idf corpus and save/reload it.

    Reads its inputs from fixed files under ``/vagrant/data`` and writes the
    trained model back there. The test makes no assertions: it prints the
    freshly trained model and the reloaded one.
    """
    model_path = '/vagrant/data/wiki_cesa.model'

    # Inputs: tf-idf corpus, word-id dictionary and article titles.
    # (An lda corpus load used to sit here and is disabled.)
    corpus = MmCorpus('/vagrant/data/wiki_tfidf_corpus.mm')
    vocabulary = Dictionary.load("/vagrant/data/wiki_wordids.dict")
    titles = DocumentTitles.load("/vagrant/data/wiki_articles.txt")

    # Train the model; the feature count is the dictionary size.
    vocabulary_size = len(vocabulary)
    trained = EsaModel(
        corpus,
        num_clusters=15,
        document_titles=titles,
        num_features=vocabulary_size,
    )
    print(trained)

    # Round-trip through disk and show what comes back.
    trained.save(model_path)
    reloaded = EsaModel.load(model_path)
    print(reloaded)
# Script: build an ESA model on top of a stored tf-idf corpus, save it, load it
# back, then regenerate the wikiextract mappings. Everything here runs at
# import time.
#
# Import order is intentionally left as it was: `config` is imported first and
# may perform set-up the later imports rely on -- TODO confirm before sorting.
from config import CONFIG
import os
from gensim import models
from gensim import corpora
from nyan.shared_modules.feature_extractor.esa.esamodel import EsaModel
from nyan.shared_modules.feature_extractor.esa.document_titles import DocumentTitles
from wikiextract.mappings import generate_mappings

# Language string used for every input AND output file name and for the
# model's `lang` argument. Previously the output name and `lang` hard-coded
# "en", so changing this value would have produced a mislabelled model.
language = "en"

# Passed to EsaModel as `num_concepts`; also embedded in the output file name.
NUM_TOPICS = 2000

# Passed to EsaModel as `num_features`.
# NOTE(review): presumably has to match the vocabulary size behind the tf-idf
# corpus -- confirm against the dictionary used to build it.
NUM_FEATURES = 50000

# Stored tf-idf model. It is loaded here but not used again in this script.
TF_IDF_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf.model")
TF_IDF = models.TfidfModel.load(TF_IDF_PATH)

# tf-idf corpus in Matrix Market format: the training input for the ESA model.
TF_IDF_CORPUS_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf_corpus.mm")
MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH)

# Article titles handed to the model as `document_titles`.
ARTICLES_PATH = os.path.join(CONFIG['prefix'], language + "_articles.txt")
ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)

# Output location, e.g. "en_esa2000_on_tfidf.model" for the defaults above.
# The historical SMALL_EN_* names are kept so existing references still work.
SMALL_EN_ESA_PATH = os.path.join(
    CONFIG['prefix'],
    "%s_esa%d_on_tfidf.model" % (language, NUM_TOPICS),
)

SMALL_EN_ESA = EsaModel(
    MM_TF_IDF,
    document_titles=ARTICLE_TITLES,
    num_features=NUM_FEATURES,
    num_concepts=NUM_TOPICS,
    lang=language,
)
SMALL_EN_ESA.save(SMALL_EN_ESA_PATH)

# Rebind to the instance read back from disk, so the name refers to the
# persisted model rather than the in-memory one.
SMALL_EN_ESA = EsaModel.load(SMALL_EN_ESA_PATH)

# Regenerate mappings
generate_mappings()