import time

from sklearn.decomposition import LatentDirichletAllocation

# dbConnect, load_corpus, save_model, load_model, store_in_db and the config
# module c are project helpers defined elsewhere in this codebase.


def generate_topics():
    db, cursor = dbConnect()
    for domain in c.domains:
        start_time = time.time()
        papers, tf, feature_names = load_corpus(domain, db)
        # Fit an online LDA model with the per-domain topic count from config.
        # (n_components is the current name of the parameter that older
        # scikit-learn releases called n_topics.)
        lda = LatentDirichletAllocation(n_components=c.domain_topics[domain],
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        lda.fit(tf)

        # ---------- model evaluation ----------
        perplexity = lda.perplexity(tf)        # lower is better
        score = lda.score(tf)                  # approximate log likelihood
        topic_paper_dist = lda.transform(tf)   # per-paper topic distribution
        print("for", c.domain_topics[domain], domain,
              "topics ==> perplexity:", perplexity, "log likelihood:", score)

        save_model(lda, domain, c.domain_topics[domain], feature_names)
        # A previously saved model can be reloaded instead of refitting:
        # lda, feature_names = load_model(domain, c.domain_topics[domain])
        store_in_db(db, lda, topic_paper_dist, papers, feature_names, domain)
        print("--- time for {}: {:.2f} minutes ---".format(
            domain, (time.time() - start_time) / 60))
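
The save_model/load_model helpers used above are defined elsewhere in the project. A minimal sketch of what they might look like, assuming joblib persistence and a models/ directory (the bundle layout and the path scheme are assumptions, not the project's actual code):

import joblib


def save_model(lda, domain, n_topics, feature_names):
    # Hypothetical: persist the fitted estimator together with its vocabulary.
    # The "models/{domain}_{n_topics}.pkl" path is an assumption.
    joblib.dump({'lda': lda, 'feature_names': feature_names},
                'models/{}_{}.pkl'.format(domain, n_topics))


def load_model(domain, n_topics):
    # Mirror of save_model, returning (lda, feature_names) as the
    # commented-out call site above expects.
    bundle = joblib.load('models/{}_{}.pkl'.format(domain, n_topics))
    return bundle['lda'], bundle['feature_names']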
Example #2
import logging

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

log = logging.getLogger(__name__)


def fit_course_lda(texts, course_ids, file_names, n_topics):
    # The original function header was cut off in this snippet; the name and
    # signature here are reconstructed from the variables used in the body.
    termfreq_vectorizer = CountVectorizer()
    texts_vectored = termfreq_vectorizer.fit_transform(texts)

    log.info("learning lda model")
    NORMALIZED = True
    # n_components replaces the n_topics parameter of older scikit-learn;
    # the None priors fall back to the default 1 / n_components.
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          learning_method='batch',
                                          evaluate_every=10,
                                          n_jobs=2,
                                          verbose=10,
                                          doc_topic_prior=None,
                                          topic_word_prior=None)
    if NORMALIZED:
        log.debug("fitting normalized")
        # fit_transform returns the document-topic matrix with each row
        # normalized to sum to 1.
        content_lda = lda_model.fit_transform(texts_vectored)
    else:
        log.debug("fitting unnormalized")
        lda_model.fit(texts_vectored)
        # _e_step is private scikit-learn API; its first return value is the
        # unnormalized variational document-topic matrix (gamma).
        content_lda, _ = lda_model._e_step(texts_vectored, cal_sstats=False,
                                           random_init=False)
    log.debug("components_ shape: {}".format(lda_model.components_.shape))
    log.debug("content_lda shape: {}".format(content_lda.shape))

    dump_course_topic_distribs(course_ids, file_names, content_lda,
                               'lda_course_topic_distribs_{}.tsv'.format(n_topics))

    dump_topic_word_distribs(lda_model, termfreq_vectorizer,
                             'lda_topic_word_distribs_{}.tsv'.format(n_topics),
                             threshold=0.25)
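
Neither dump helper appears in this snippet. A minimal sketch of both, assuming tab-separated output; the row formats, and in particular how threshold prunes the topic-word table, are assumptions rather than the project's real behavior:

import numpy as np


def dump_course_topic_distribs(course_ids, file_names, doc_topic, out_path):
    # Hypothetical: one TSV row per course holding its full topic distribution.
    with open(out_path, 'w') as out:
        for cid, fname, row in zip(course_ids, file_names, doc_topic):
            out.write('{}\t{}\t{}\n'.format(
                cid, fname, '\t'.join('{:.6f}'.format(w) for w in row)))


def dump_topic_word_distribs(lda_model, vectorizer, out_path, threshold=0.25):
    # Hypothetical: for each topic, emit the highest-weight words until they
    # cover `threshold` of the topic's probability mass.
    words = np.asarray(vectorizer.get_feature_names_out())
    topic_word = lda_model.components_
    topic_word = topic_word / topic_word.sum(axis=1, keepdims=True)
    with open(out_path, 'w') as out:
        for topic_idx, row in enumerate(topic_word):
            covered = 0.0
            for word_idx in row.argsort()[::-1]:
                out.write('{}\t{}\t{:.6f}\n'.format(
                    topic_idx, words[word_idx], row[word_idx]))
                covered += row[word_idx]
                if covered >= threshold:
                    break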