Esempio n. 1
0
class LMDL_LDA:
    """LDA topic model trained on the texts supplied by ``LMDL_Corpus``.

    Construction fetches the corpus texts, builds a ``Dictionary`` and a
    bag-of-words corpus from them, and trains an ``LdaModel`` with
    ``LDA_NUM_TOPICS`` topics. The intermediate objects stay available as
    ``lmdl``, ``texts``, ``dictionary``, ``corpus`` and ``lda``.
    """

    def __init__(self):
        """Load the corpus texts and train the LDA model on them."""
        corpus_source = LMDL_Corpus()
        self.lmdl = corpus_source

        word_lists = corpus_source.get_corpus_texts_words()
        self.texts = word_lists

        vocabulary = Dictionary(word_lists)
        self.dictionary = vocabulary

        bags = [vocabulary.doc2bow(words) for words in word_lists]
        self.corpus = bags

        self.lda = LdaModel(
            bags,
            num_topics=LDA_NUM_TOPICS,
            id2word=vocabulary,
        )

    def print_topics(self):
        """Return the model's ``print_topics`` output for ``LDA_NUM_TOPICS`` topics."""
        model = self.lda
        return model.print_topics(LDA_NUM_TOPICS)

    def get_document_topics(self, document_name):
        """Return the top-10 term listing of each topic found for a document.

        The document's tokens come from
        ``self.lmdl.token_list_processed(document_name)``; they are turned
        into a bag-of-words and handed to the model's
        ``get_document_topics``. The first element of every returned entry
        is then expanded through ``show_topic(..., topn=10)``.

        Returns:
            A list with one ``show_topic`` result per entry, in the order
            the model returned them.
        """
        tokens = self.lmdl.token_list_processed(document_name)
        query_options = {
            'minimum_probability': None,
            'minimum_phi_value': None,
            'per_word_topics': False,
        }
        matches = self.lda.get_document_topics(
            self.dictionary.doc2bow(tokens), **query_options)
        return [self.lda.show_topic(match[0], topn=10) for match in matches]

    def top_topics(self):
        """Return the model's ``top_topics`` result using ``u_mass`` coherence.

        The call is made over the stored corpus, texts and dictionary with
        ``topn=20``, ``window_size=None`` and ``processes=-1``.
        """
        options = {
            'corpus': self.corpus,
            'texts': self.texts,
            'dictionary': self.dictionary,
            'window_size': None,
            'coherence': 'u_mass',
            'topn': 20,
            'processes': -1,
        }
        return self.lda.top_topics(**options)
Esempio n. 2
0
 def find_topic(self, num_topics, num_words=2, passes=20):
     """Train an LDA model on ``self.texts`` and return its ranked topics.

     A fresh ``Dictionary`` and bag-of-words corpus are built from
     ``self.texts`` on every call, so nothing is cached on the instance.

     Args:
         num_topics: Number of topics passed to ``LdaModel``.
         num_words: Number of words per topic, forwarded to
             ``top_topics`` as ``topn``.
         passes: Number of training passes passed to ``LdaModel``.

     Returns:
         The result of ``top_topics`` on the newly trained model.
     """
     dic = Dictionary(self.texts)
     corpus = [dic.doc2bow(text) for text in self.texts]
     lda = LdaModel(corpus,
                    num_topics=num_topics,
                    id2word=dic,
                    passes=passes)
     # Forward the caller's num_words; it used to be ignored in favour of
     # a hard-coded topn=2 (which is still the default).
     return lda.top_topics(topn=num_words, dictionary=dic, corpus=corpus)
Esempio n. 3
0
import sys

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import word2vec

# Command line: DATA_FILE TOPIC_NUM ALPHA
if len(sys.argv) < 4:
    # Exit with a usage hint rather than an IndexError traceback when
    # arguments are missing (exit status stays non-zero).
    sys.exit("usage: python <script> DATA_FILE TOPIC_NUM ALPHA")

data_file = sys.argv[1]
topic_num = int(sys.argv[2])
alpha = float(sys.argv[3])

# Read the file through LineSentence and drop entries with fewer than two items.
sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]

dic = Dictionary(sentences)

# Bag-of-words representation of every kept sentence.
corpus = [dic.doc2bow(s) for s in sentences]

lda = LdaModel(corpus=corpus,
               num_topics=topic_num,
               id2word=dic,
               alpha=alpha,
               random_state=1)

# Each entry of top_topics is indexed as t[0] (topic) and t[1] (coherence).
for t in lda.top_topics(corpus=corpus):
    print(f"coherence = {t[1]}, topic = {t[0]}")
Esempio n. 4
0
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every)

    # Pickle the model for later use
    pickle.dump(
        lda_model,
        open(os.path.join('./results/lda_save_' + str(num_topics) + '.pk'),
             'wb'))

    print('The top 10 keywords in each topic')
    pprint(lda_model.print_topics(num_words=10))

    # Topic coherence https://rare-technologies.com/what-is-topic-coherence/
    top_topics = lda_model.top_topics(corpus)  # , num_words=20)
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    print('Top topics and their coherence:')
    pprint(top_topics)

    # Comparing LDA models
    # https://radimrehurek.com/gensim/auto_examples/howtos/run_compare_lda.html

    # LDA Results Visual Analysis
    if visualize:
        #    pyLDAvis.enable_notebook()
        lda_res_path = os.path.join('./results/lda_pyldavis_' +
                                    str(num_topics))
        prepped_results = pyLDAvis.gensim.prepare(lda_model, corpus,
                                                  dictionary)