def test_constructor_with_big_file_wikicorpus(self):
    """Build an ``EsaModel`` from the wiki TF-IDF corpus and round-trip it.

    Loads the TF-IDF corpus, the word-id dictionary and the article titles
    from the dump directory, trains the model with 15 clusters, prints it,
    saves it to disk, loads it back and prints the reloaded copy.

    The test makes no assertions; it passes as long as none of these steps
    raises.
    """
    # TF-IDF corpus the model is trained on
    corpus = MmCorpus('/media/sdc1/test_dump/result/wiki_tfidf_corpus.mm')

    # LDA corpus is currently not loaded
    #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

    # dictionary; only its length is used (as the feature count)
    word_ids = Dictionary.load("/media/sdc1/test_dump/result/wiki_wordids.dict")

    # article titles
    titles = DocumentTitles.load("/media/sdc1/test_dump/result/wiki_articles.txt")

    # train the ESA model
    feature_count = len(word_ids)
    model = EsaModel(corpus,
                     num_clusters=15,
                     document_titles=titles,
                     num_features=feature_count)
    print(model)

    # save, reload and show the reloaded model
    model_path = '/media/sdc1/test_dump/result/wiki_cesa.model'
    model.save(model_path)
    reloaded = EsaModel.load(model_path)
    print(reloaded)
def test_constructor_with_file_wikicorpus(self):
    """Build a ``CosineEsaModel`` from the test dump and round-trip it.

    Steps:
      1. Load the TF-IDF model, feature extractor, TF-IDF corpus, word-id
         dictionary and article titles from the dump directory.
      2. Connect to the Mongo database described by
         ``self.config_['database']``.
      3. Look up one user and split that user's ranked articles into read
         and unread id sets.
      4. Sample a test corpus ``X`` with targets ``y`` via ``get_samples``.
      5. Train the model, print it, save it, load it back and print the
         reloaded copy.

    The test makes no assertions about the model itself; it passes as long
    as none of these steps raises.

    Raises:
        AssertionError: if the user lookup returns no document.
    """
    # NOTE(review): the loaded TF-IDF model is not referenced again below;
    # the load is kept so a missing or unreadable model file still fails
    # the test -- confirm whether it is still needed.
    tfidf_model = tfidfmodel.TfidfModel.load("/media/sdc1/test_dump/result/test_tfidf.model")
    extractor = TfidfFeatureExtractor("/media/sdc1/test_dump/result/test")

    # TF-IDF corpus the model is trained on
    tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

    # LDA corpus is currently not loaded
    #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

    # dictionary; only its length is used (as the feature count)
    id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

    # article titles
    document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

    # connect to the Mongo database
    db_config = self.config_['database']
    connect(db_config['db-name'],
            username=db_config['user'],
            password=db_config['passwd'],
            port=db_config['port'])

    # load the user's articles as the test corpus
    user = User.objects(email=u"*****@*****.**").first()
    if user is None:
        # Fail with a clear message instead of an AttributeError on user.id.
        raise AssertionError("test user not found in database")

    ranked_article_ids = (a.article.id
                          for a in RankedArticle.objects(user_id=user.id).only("article"))
    # Builtin set replaces the deprecated sets.Set; set difference is the same.
    all_article_ids = set(a.id
                          for a in Article.objects(id__in=ranked_article_ids).only("id"))
    read_article_ids = set(a.article.id
                           for a in ReadArticleFeedback.objects(user_id=user.id).only("article"))
    unread_article_ids = all_article_ids - read_article_ids

    # sample test articles
    X, y = get_samples(extractor, read_article_ids, unread_article_ids)

    sample_count, feature_count = X.shape
    # Arguments are passed separately so formatting only happens when the
    # debug level is enabled.
    logger.debug("Traning with %d samples, %d features, %d marks",
                 sample_count, feature_count, len(y))

    # train the ESA model
    esa_model = CosineEsaModel(tfidf_corpus,
                               document_titles=document_titles,
                               test_corpus=X,
                               test_corpus_targets=y,
                               num_test_corpus=len(y),
                               num_best_features=15,
                               num_features=len(id2token))
    print(esa_model)

    # save, reload and show the reloaded model
    model_path = '/media/sdc1/test_dump/result/test_cesa.model'
    esa_model.save(model_path)
    tmp_esa = CosineEsaModel.load(model_path)
    print(tmp_esa)