Esempio n. 1
0
 def __init__(self, prefix):
     """Load the persisted dictionary, tf-idf model and ESA model.

     Args:
         prefix: Common path prefix of the model files. The suffixes
             ``_wordids.dict``, ``_tfidf.model`` and
             ``_esa1000_on_tfidf.model`` are appended to it.
     """
     Extractor.__init__(self)
     # Pass the prefix as a logging argument so the message is only
     # formatted when the INFO level is actually enabled.
     # NOTE(review): the message still mentions an LDA model although LDA
     # loading is disabled below -- confirm whether the text should change.
     logger.info(
         "Load dictionary, tfidf model, lda model and esa model with prefix %s",
         prefix)
     self.dictionary = corpora.Dictionary.load(prefix + "_wordids.dict")
     self.tfidf_model = models.TfidfModel.load(prefix + "_tfidf.model")
     # LDA model loading is currently disabled:
     # self.lda_model = models.LdaModel.load(prefix + "_lda.model")
     self.esa_model = EsaModel.load(prefix + "_esa1000_on_tfidf.model")
Esempio n. 2
0
    def load_esa_models(self):
        """Populate ``self.esas`` with the per-language ESA models.

        Only the Spanish ('es') model is loaded, from ``self.prefix`` plus
        ``es_esa_on_tfidf.model``. Entries for 'en', 'de' and 'nl' existed
        only as commented-out code and remain disabled.
        """
        logger.info("Loading ESA models")

        spanish_model_path = self.prefix + "es_esa_on_tfidf.model"
        self.esas = dict(es=EsaModel.load(spanish_model_path))
Esempio n. 3
0
    def test_constructor_with_big_file_wikicorpus(self):
        """Train an EsaModel on the wiki tf-idf corpus and round-trip it via disk.

        Reads the corpus, dictionary and article titles from fixed paths
        under ``/vagrant/data``, builds the model, saves it, loads it back,
        and prints both the trained and the reloaded model.
        """
        # Single source of truth for the location used by both save and load.
        model_path = '/vagrant/data/wiki_cesa.model'

        # Load tf-idf corpus.
        tfidf_corpus = MmCorpus('/vagrant/data/wiki_tfidf_corpus.mm')

        # LDA corpus loading is currently disabled:
        # lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

        # Load dictionary; only its size is used, as the feature count.
        id2token = Dictionary.load("/vagrant/data/wiki_wordids.dict")

        # Load article titles.
        document_titles = DocumentTitles.load("/vagrant/data/wiki_articles.txt")

        # Train ESA model.
        esa_model = EsaModel(tfidf_corpus, num_clusters=15,
                             document_titles=document_titles,
                             num_features=len(id2token))

        # Parenthesised single-argument print is valid on Python 2 and 3 and
        # produces the same output as the former print statement.
        print(esa_model)

        esa_model.save(model_path)

        tmp_esa = EsaModel.load(model_path)
        print(tmp_esa)
from config import CONFIG
import os
from gensim import models
from gensim import corpora
from nyan.shared_modules.feature_extractor.esa.esamodel import EsaModel
from nyan.shared_modules.feature_extractor.esa.document_titles import DocumentTitles

from wikiextract.mappings import generate_mappings

# Language code used to select the input artefacts and to name the output model.
language = "en"
# Passed to EsaModel as num_concepts and embedded in the output file name.
NUM_TOPICS = 2000

# Previously trained tf-idf model for the selected language.
TF_IDF_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf.model")
TF_IDF = models.TfidfModel.load(TF_IDF_PATH)

# tf-idf corpus in Matrix Market format; this is the ESA training input.
TF_IDF_CORPUS_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf_corpus.mm")
MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH)

# Article titles handed to the ESA model as document_titles.
ARTICLES_PATH = os.path.join(CONFIG['prefix'], language + "_articles.txt")
ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)

# Derive the output path and the model's lang from `language` instead of a
# hard-coded 'en', so every artefact follows the same language setting.
# With language == "en" the resulting values are unchanged.
SMALL_EN_ESA_PATH = os.path.join(
    CONFIG['prefix'], "%s_esa%d_on_tfidf.model" % (language, NUM_TOPICS))
SMALL_EN_ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES, num_features=50000,
                        num_concepts=NUM_TOPICS, lang=language)
SMALL_EN_ESA.save(SMALL_EN_ESA_PATH)

# Rebind the name to the model as read back from disk.
SMALL_EN_ESA = EsaModel.load(SMALL_EN_ESA_PATH)

# Regenerate mappings
generate_mappings()