Example no. 1
0
 def __init__(self, prefix):
     """Initialise the extractor, loading every persisted model that shares *prefix*.

     :param prefix: path prefix of the serialised dictionary/model files.
     """
     Extractor.__init__(self)
     logger.info(
         "Load dictionary, tfidf model, lda model and esa model with prefix %s", prefix)
     # word <-> token-id mapping
     self.dictionary = corpora.Dictionary.load(prefix + "_wordids.dict")
     # tf-idf weighting model
     self.tfidf_model = models.TfidfModel.load(prefix + "_tfidf.model")
     # LDA model is currently disabled:
     # self.lda_model = models.LdaModel.load(prefix + "_lda.model")
     # ESA model trained on the tf-idf corpus
     self.esa_model = EsaModel.load(prefix + "_esa1000_on_tfidf.model")
Example no. 2
0
    def load_esa_models(self):
        """Populate ``self.esas`` with one ESA model per enabled language."""
        logger.info("Loading ESA models")

        loaded = {}
        # Only Spanish is enabled at the moment; 'en' (which used
        # self.target_prefix), 'de' and 'nl' were commented out upstream.
        loaded['es'] = EsaModel.load(self.prefix + "es_esa_on_tfidf.model")
        self.esas = loaded
Example no. 3
0
    def test_constructor_with_big_file_wikicorpus(self):
        """Train an ESA model on the full wiki tf-idf corpus and round-trip it through save/load."""
        # load tf-idf corpus
        tfidf_corpus = MmCorpus('/vagrant/data/wiki_tfidf_corpus.mm')

        # load lda corpus (currently disabled)
        #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

        # load dictionary (word <-> token-id map; only its size is used below)
        id2token = Dictionary.load("/vagrant/data/wiki_wordids.dict")

        # load article titles (one concept title per corpus document)
        document_titles = DocumentTitles.load("/vagrant/data/wiki_articles.txt")

        # train esa model
        esa_model = EsaModel(tfidf_corpus, num_clusters=15, document_titles=document_titles, num_features=len(id2token))

        # Use print() calls (py2 `print x` statements are a SyntaxError on py3;
        # print(x) behaves identically for a single argument on both).
        print(esa_model)

        esa_model.save('/vagrant/data/wiki_cesa.model')

        # verify the model can be restored from disk
        tmp_esa = EsaModel.load('/vagrant/data/wiki_cesa.model')
        print(tmp_esa)
from config import CONFIG
import os
from gensim import models
from gensim import corpora
from nyan.shared_modules.feature_extractor.esa.esamodel import EsaModel
from nyan.shared_modules.feature_extractor.esa.document_titles import DocumentTitles

from wikiextract.mappings import generate_mappings

# Train and persist a small English ESA model from pre-built tf-idf artifacts,
# then regenerate the wiki-extract mappings. Runs at import time (side effects).
language = "en"
# number of ESA concepts to keep (passed as num_concepts below)
NUM_TOPICS = 2000

# previously trained tf-idf model (loaded but only its corpus is used below)
TF_IDF_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf.model")
TF_IDF = models.TfidfModel.load(TF_IDF_PATH)

# tf-idf corpus in matrix-market format — the ESA training input
TF_IDF_CORPUS_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf_corpus.mm")
MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH)

# article titles, one per corpus document (become ESA concept labels)
ARTICLES_PATH = os.path.join(CONFIG['prefix'], language + "_articles.txt")
ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)

# train and save the ESA model
# NOTE(review): num_features=50000 is hard-coded — presumably the dictionary
# size used when the tf-idf corpus was built; confirm against that pipeline.
SMALL_EN_ESA_PATH = os.path.join(CONFIG['prefix'], "en_esa%d_on_tfidf.model" % NUM_TOPICS)
SMALL_EN_ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES, num_features=50000,
                num_concepts=NUM_TOPICS, lang='en')
SMALL_EN_ESA.save(SMALL_EN_ESA_PATH)

# reload from disk to verify the saved model is usable
SMALL_EN_ESA = EsaModel.load(SMALL_EN_ESA_PATH)

# Regenerate mappings
generate_mappings()
Example no. 5
0
    def run(self):
        """Build LDA and ESA models on top of a previously serialised tf-idf corpus.

        Expects ``self.prefix + "_wordids.dict"``, ``self.prefix + "_tfidf_corpus.mm"``
        and ``self.prefix + "_articles.txt"`` to exist; writes the trained LDA
        model, the LDA-transformed corpus and the ESA model next to them.
        """
        self.logger.info("Starting...")

        # The earlier pipeline stages (corpus cleaning, dictionary creation,
        # bag-of-words serialisation, tf-idf training/serialisation) were run
        # once beforehand; this method only consumes their outputs.

        # word -> id map produced by the dictionary stage
        id2token = corpora.Dictionary.load(self.prefix + "_wordids.dict")

        # --- TFIDF corpus ---

        # init tfidf-corpus reader
        mm_tfidf = corpora.MmCorpus(self.prefix + '_tfidf_corpus.mm')

        # --- LDA model creation ---

        # build lda model
        lda = models.LdaModel(corpus=mm_tfidf, id2word=id2token,
                              num_topics=NUM_TOPICS, update_every=1,
                              chunksize=10000, passes=2)

        # save trained model
        lda.save(self.prefix + '_lda.model')

        # Save corpus as lda vectors in matrix market format.
        # BUGFIX: the original used `options.prefix` here (and below) although
        # `options` is not defined in this scope — every other path in this
        # method uses `self.prefix`.
        corpora.MmCorpus.serialize(self.prefix + '_lda_corpus.mm', lda[mm_tfidf],
                                   progress_cnt=10000)

        # init lda-corpus reader
        mm_lda = corpora.MmCorpus(self.prefix + '_lda_corpus.mm')

        # --- ESA model creation ---

        # document titles become the ESA concept labels
        article_titles = DocumentTitles.load(self.prefix + "_articles.txt")

        # build esa model on top of the LDA space
        esa = EsaModel(mm_lda, num_clusters=10000, document_titles=article_titles, num_features=NUM_TOPICS)

        esa.save(self.prefix + "_esa_on_lda.model")

        self.logger.info("finished transforming")