import time

from sklearn.decomposition import LatentDirichletAllocation

# dbConnect, load_corpus, save_model, store_in_db and the config module `c`
# are project helpers defined elsewhere in this repo.

def generate_topics():
    db, cursor = dbConnect()
    for domain in c.domains:
        start_time = time.time()
        papers, tf, feature_names = load_corpus(domain, db)
        # lda, feature_names = load_model(domain, c.domain_topics[domain])
        lda = LatentDirichletAllocation(
            n_components=c.domain_topics[domain],  # `n_topics` was renamed `n_components` in scikit-learn 0.19
            max_iter=5,
            learning_method='online',
            learning_offset=50.,
            random_state=0)
        lda.fit(tf)

        # ---------- MODEL EVALUATION PARAMETERS --------------------------
        # perplexity() and score() infer the doc-topic distribution internally,
        # so there is no need to call the private _e_step() by hand.
        perplexity = lda.perplexity(tf)
        score = lda.score(tf)  # approximate log likelihood of the corpus
        topic_paper_dist = lda.transform(tf)  # per-paper topic distribution
        print("for", c.domain_topics[domain], domain,
              "topics ==> perplexity:", perplexity, "log likelihood:", score)

        save_model(lda, domain, c.domain_topics[domain], feature_names)
        # lda, feature_names = load_model(domain, c.domain_topics[domain])
        store_in_db(db, lda, topic_paper_dist, papers, feature_names, domain)
        print("--- time for " + domain + ": " +
              str((time.time() - start_time) / 60) + " minutes ---")
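# The save_model / load_model helpers called above are not shown in this file.
# Below is a minimal sketch of what they might look like, assuming the fitted
# model and vocabulary are persisted with joblib under a `models/` directory;
# the path layout and directory name are assumptions, not the project's actual
# code, though the signatures match the call sites above.

import os

import joblib

def save_model(lda, domain, n_topics, feature_names, model_dir='models'):
    """Persist a fitted LDA model and its vocabulary to disk (sketch)."""
    os.makedirs(model_dir, exist_ok=True)
    path = os.path.join(model_dir, '{}_{}.joblib'.format(domain, n_topics))
    joblib.dump({'lda': lda, 'feature_names': feature_names}, path)

def load_model(domain, n_topics, model_dir='models'):
    """Load a previously saved LDA model and vocabulary (sketch)."""
    path = os.path.join(model_dir, '{}_{}.joblib'.format(domain, n_topics))
    payload = joblib.load(path)
    return payload['lda'], payload['feature_names']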
import logging

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

log = logging.getLogger(__name__)

# `texts`, `n_topics`, `course_ids`, `file_names` and the dump_* helpers are
# defined elsewhere in this script.
termfreq_vectorizer = CountVectorizer()
texts_vectored = termfreq_vectorizer.fit_transform(texts)

log.info("learning lda model")
NORMALIZED = True
lda_model = LatentDirichletAllocation(
    n_components=n_topics,  # `n_topics` was renamed `n_components` in scikit-learn 0.19
    learning_method='batch',
    evaluate_every=10,
    n_jobs=2,
    verbose=10,
    doc_topic_prior=None,   # defaults to 1 / n_components
    topic_word_prior=None)  # defaults to 1 / n_components

if NORMALIZED:
    log.debug("fitting normalized")
    # fit_transform() returns row-normalized doc-topic distributions
    content_lda = lda_model.fit_transform(texts_vectored)
else:
    log.debug("fitting unnormalized")
    lda_model.fit(texts_vectored)
    # the private _e_step() returns the raw (unnormalized) variational
    # doc-topic matrix instead of the normalized transform() output
    content_lda, _ = lda_model._e_step(texts_vectored,
                                       cal_sstats=False, random_init=False)

log.debug("components_ shape: {}".format(lda_model.components_.shape))
log.debug("content_lda shape: {}".format(content_lda.shape))

dump_course_topic_distribs(course_ids, file_names, content_lda,
                           'lda_course_topic_distribs_{}.tsv'.format(n_topics))
dump_topic_word_distribs(lda_model, termfreq_vectorizer,
                         'lda_topic_word_distribs_{}.tsv'.format(n_topics),
                         threshold=0.25)
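# dump_topic_word_distribs is also defined elsewhere. Below is a minimal
# sketch of one plausible implementation, assuming it writes one TSV row per
# (topic, word) pair whose normalized weight exceeds `threshold`; the column
# layout and per-topic normalization are assumptions, not the project's
# actual code.

def dump_topic_word_distribs(lda_model, vectorizer, out_path, threshold=0.25):
    """Write `topic_id<TAB>word<TAB>weight` rows for words above threshold (sketch)."""
    # get_feature_names_out() requires scikit-learn >= 1.0; older versions
    # use get_feature_names()
    feature_names = vectorizer.get_feature_names_out()
    with open(out_path, 'w') as out:
        for topic_idx, weights in enumerate(lda_model.components_):
            # normalize so each topic's word weights sum to 1
            distrib = weights / weights.sum()
            # walk words in descending weight order, stop below threshold
            for word_idx in distrib.argsort()[::-1]:
                if distrib[word_idx] < threshold:
                    break
                out.write('{}\t{}\t{:.4f}\n'.format(
                    topic_idx, feature_names[word_idx], distrib[word_idx]))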