def train(data, mallet_path: str, random_seed: int, num_topics: int, *args,
          **kwargs):
    """Fit a MALLET LDA model on ``data`` and return pyLDAvis-ready output.

    Args:
        data: Tokenized documents. The value is handed to
            ``corpora.Dictionary`` and then iterated a second time to build
            the bag-of-words corpus, so it has to be re-iterable.
        mallet_path: Forwarded as the first argument of ``LdaMallet``.
        random_seed: Forwarded to ``LdaMallet`` as ``random_seed``.
        num_topics: Forwarded to ``LdaMallet`` as ``num_topics``.
        *args: Only logged.
        **kwargs: Only logged.

    Returns:
        Whatever ``prepare`` returns for the converted model
        (presumably pyLDAvis prepared data -- confirm against the import).
    """
    logger.debug(f"start training, args:{args}, kwargs:{kwargs}")

    # Vocabulary first, then one bag-of-words vector per document.
    vocabulary = corpora.Dictionary(data)
    bow_corpus = [vocabulary.doc2bow(document) for document in data]

    mallet_lda = gensim.models.wrappers.LdaMallet(
        mallet_path,
        corpus=bow_corpus,
        num_topics=num_topics,
        id2word=vocabulary,
        random_seed=random_seed,
    )

    # The MALLET wrapper is converted to a gensim model before it is handed
    # to ``prepare``.
    converted = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
        mallet_lda)
    return prepare(converted, bow_corpus, vocabulary, n_jobs=1)
Example #2
0
def visualize_lda_model():
    """Train a 20-topic LDA model on the lemmatized corpus and display it.

    Steps performed:
      1. Load the data via ``preprocess_to_lemmatization()``.
      2. Flatten each document's sentences into a single token list, keeping
         only alphabetic tokens longer than one character that are not stop
         words (tokens are lower-cased).
      3. Detect bigrams and trigrams with ``Phrases``.
      4. Build a dictionary (dropping terms seen in fewer than 3 documents)
         and a bag-of-words corpus.
      5. Fit ``models.LdaModel`` and prepare it with pyLDAvis.

    Returns:
        The result of ``pyLDAvis.display`` for the prepared visualization.
    """
    data = preprocess_to_lemmatization()
    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    # A set makes every membership test O(1); the token filter below runs
    # once per token in the corpus, so a list lookup here is needlessly slow.
    my_stopwords = (set(stopwords.words('english'))
                    | set(stopwords_verbs)
                    | set(stopwords_other))

    def _clean(tokens):
        """Lower-case ``tokens`` and drop non-alphabetic/short/stop words."""
        kept = []
        for token in tokens:
            lowered = token.lower()  # computed once, reused for test + output
            if (token.isalpha() and lowered not in my_stopwords
                    and len(token) > 1):
                kept.append(lowered)
        return kept

    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: _clean(chain.from_iterable(sentences)))

    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

    # Seed NumPy's global generator before training, as the original did.
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary_LDA,
        passes=4,
        alpha=[0.01] * num_topics,
        eta=[0.01] * len(dictionary_LDA),
    )

    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
Example #3
0
def test_hdp():
    """Fit an HDP model on the helper corpus and smoke-test the HTML export."""
    bow_corpus, vocab = get_corpus_dictionary()
    model = HdpModel(bow_corpus, vocab.id2token)

    prepared = gensim_models.prepare(model, bow_corpus, vocab)

    # Write the page, then clean it up so the test leaves no artifact behind.
    html_path = 'index_hdp.html'
    pyLDAvis.save_html(prepared, html_path)
    os.remove(html_path)
Example #4
0
def test_lda():
    """Fit a 2-topic LDA model on the helper corpus and smoke-test the HTML export."""
    bow_corpus, vocab = get_corpus_dictionary()
    model = LdaModel(corpus=bow_corpus, num_topics=2)

    prepared = gensim_models.prepare(model, bow_corpus, vocab)

    # Write the page, then clean it up so the test leaves no artifact behind.
    html_path = 'index_lda.html'
    pyLDAvis.save_html(prepared, html_path)
    os.remove(html_path)
Example #5
0
def lda_model(data, corpus, dictionary, num_topics):
    """Fit an LDA model, embed its pyLDAvis view, then plot the documents.

    Args:
        data: Passed through unchanged to ``visualize_topics``.
        corpus: Corpus used for training and for the pyLDAvis preparation.
        dictionary: Used as ``id2word`` for the model and for pyLDAvis.
        num_topics: Number of topics requested from ``models.LdaModel``.
    """
    topic_model = models.LdaModel(corpus,
                                  id2word=dictionary,
                                  num_topics=num_topics)

    # Topic view: pyLDAvis output rendered to HTML and embedded via
    # ``components.v1.html`` (presumably Streamlit components -- confirm).
    prepared = gensimvis.prepare(topic_model, corpus, dictionary)
    components.v1.html(
        pyLDAvis.prepared_data_to_html(prepared),
        width=1280,
        height=1024,
    )

    # Document view (t-SNE, per the original comment).
    visualize_topics(data, corpus, topic_model, num_topics)
Example #6
0
def lda_report(lda_model, corpus, data_lemmatized, id2word,
               output_dir=r'../results/output/lda/Best_lda_model'):
    """Write topic, perplexity-bound, coherence and pyLDAvis reports to disk.

    Files written inside ``output_dir`` (created if it does not exist):

    * ``lda_bm_n15.txt``        -- ``lda_model.print_topics()``
    * ``lda_output_bm_n15.txt`` -- ``lda_model.log_perplexity(corpus)``
      followed by the ``c_v`` coherence score
    * ``ldavis_prepared_n15``   -- pickled pyLDAvis prepared data
    * ``ldavis_n15.html``       -- stand-alone pyLDAvis page

    :param lda_model: trained model; must provide ``print_topics()`` and
        ``log_perplexity(corpus)`` and be accepted by ``CoherenceModel`` and
        ``gensimvis.prepare``.
    :param corpus: corpus passed to ``log_perplexity`` and
        ``gensimvis.prepare``.
    :param data_lemmatized: passed to ``CoherenceModel`` as ``texts``.
    :param id2word: passed to ``CoherenceModel`` as ``dictionary`` and to
        ``gensimvis.prepare``.
    :param output_dir: directory that receives all report files. Defaults to
        the location that used to be hard-coded.
    """
    os.makedirs(output_dir, exist_ok=True)
    topics_path = os.path.join(output_dir, 'lda_bm_n15.txt')
    metrics_path = os.path.join(output_dir, 'lda_output_bm_n15.txt')
    ldavis_data_path = os.path.join(output_dir, 'ldavis_prepared_n15')
    ldavis_html_path = os.path.join(output_dir, 'ldavis_n15.html')

    # Topics may contain non-ASCII terms, so the encoding is explicit.
    with open(topics_path, 'w', encoding='utf-8') as f:
        print(lda_model.print_topics(), file=f)

    # NOTE: gensim documents ``log_perplexity`` as a per-word likelihood
    # bound rather than the perplexity itself. The "Perplexity" label is
    # kept unchanged so existing consumers of this file keep working.
    with open(metrics_path, 'w', encoding='utf-8') as f:
        print('\nPerplexity: ', lda_model.log_perplexity(corpus), file=f)

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_lemmatized,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # Appended so the value written above stays in the same file.
    with open(metrics_path, 'a', encoding='utf-8') as f:
        print('\nCoherence Score: ', coherence_lda, file=f)

    # Visualize the topics. The prepared data is still pickled for later
    # reuse, but it is no longer read straight back from disk: the object
    # in memory is the one that was just written.
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(ldavis_data_path, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    pyLDAvis.save_html(LDAvis_prepared, ldavis_html_path)
Example #7
0
def test_sorted_terms():
    """Check that ``sorted_terms`` ranks a topic's terms by relevance.

    After exploring the visualization, a user typically settles on one value
    of the lambda slider and wants the terms of a topic in exactly that
    order; ``sorted_terms`` provides it.
    """
    bow_corpus, vocab = get_corpus_dictionary()
    model = LdaModel(corpus=bow_corpus, num_topics=2)
    prepared = gensim_models.prepare(model, bow_corpus, vocab)

    # https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
    # lambda = 1 should rank the terms by logprob,
    # lambda = 0 should rank them by loglift.
    expectations = ((1, 'logprob'), (0, 'loglift'))
    for lambda_value, column in expectations:
        ranked = prepared.sorted_terms(topic=1, _lambda=lambda_value).to_dict()
        assert (ranked[column] == ranked['relevance'])
    def train(data, *args, **kwargs):
        """Fit a gensim LDA model on ``data`` and return pyLDAvis-ready output.

        Args:
            data: Tokenized documents. The value is handed to
                ``corpora.Dictionary`` and then iterated a second time to
                build the bag-of-words corpus, so it has to be re-iterable.
            *args: Only logged.
            **kwargs: Must contain ``"num_topics"``; everything is logged.

        Returns:
            Whatever ``prepare`` returns for the fitted model.

        Raises:
            KeyError: If ``num_topics`` is missing from ``kwargs``.
        """
        logger.debug(f"start training, args:{args}, kwargs:{kwargs}")

        # Vocabulary first, then one bag-of-words vector per document.
        vocabulary = corpora.Dictionary(data)
        bow_corpus = [vocabulary.doc2bow(document) for document in data]

        lda_settings = dict(
            corpus=bow_corpus,
            id2word=vocabulary,
            num_topics=kwargs["num_topics"],
            random_state=100,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        fitted = gensim.models.ldamodel.LdaModel(**lda_settings)

        return prepare(fitted, bow_corpus, vocabulary, n_jobs=1)
Example #9
0
# plt.ylabel("Topic proportion")
plt.xlabel("")
plt.savefig("img/nmfPropStacked.png")
plt.clf()

# Stop the script here. SystemExit is raised directly instead of calling
# exit(), which is a helper injected by the `site` module for interactive use.
raise SystemExit

# Stuff in the following is not used anymore

#################################################################
# LDAvis
#################################################################

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

ldaDisp = gensimvis.prepare(lda, corpus, dct, sort_topics=False)
pyLDAvis.save_html(ldaDisp, "ldavistest.html")
# Raw string: "\l" is not a recognised escape sequence, so the plain literal
# only worked by accident and triggers a warning on recent Python versions.
os.startfile(r".\ldavistest.html")

if False:
    for topic in range(0, NUM_TOPICS):
        termslda = lda.show_topic(
            topic, topn=50
        )  # get_topic_terms would return words as dict IDs, not strings
        # Model returns list of tuples, wordcloud wants a dictionary instead
        wordcloudlda = WordCloud(
            background_color="white").generate_from_frequencies(dict(termslda))
        # One cell of a 3x3 grid per topic.
        plt.subplot(3, 3, topic + 1)
        plt.imshow(wordcloudlda)
        plt.axis("off")
        plt.title("Topic" + str(topic + 1))
# ## LDA
#
# Notebook export: fit an LDA model through the project's `lda` helper,
# list its topics, then build the pyLDAvis view.

# In[133]:


# fit_lda returns three values: the fitted model, the bag-of-words corpus
# and the dictionary (names as unpacked here).
lda_allbow, bow_corpus, dictionary = lda.fit_lda(sentences, num_topics = 5)


# In[134]:


# Show the topics of the model fitted above.
lda.lda_topics(lda_allbow)


# In[135]:


import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()


# In[137]:


# NOTE(review): this fits a second model, so the visualisation below is not
# built from the same model object whose topics were listed in In[134] --
# confirm the refit is intended.
lda_allbow, bow_corpus, dictionary = lda.fit_lda(sentences, num_topics = 5)
vis = gensimvis.prepare(lda_allbow, bow_corpus, dictionary)
# Bare expression: displays the visualisation as the cell output.
vis

Example #11
0
def hdp_model(corpus, dictionary):
    """Fit an HDP model on ``corpus`` and embed its pyLDAvis view as HTML.

    Args:
        corpus: Corpus used for training and for the pyLDAvis preparation.
        dictionary: Used as ``id2word`` for the model and for pyLDAvis.
    """
    fitted_hdp = models.HdpModel(corpus, id2word=dictionary)
    prepared = gensimvis.prepare(fitted_hdp, corpus, dictionary)
    components.v1.html(
        pyLDAvis.prepared_data_to_html(prepared),
        width=1280,
        height=1024,
    )
chunksize = 4000  # number of documents handed to the model per training chunk
passes = 20  # number of passes through documents
iterations = 100
# eval_every=1 makes gensim estimate perplexity after every update, which
# slows training down; set it to None to skip the evaluation entirely.
eval_every = 1

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# The notebook ran this under the `%time` magic. The exported line had been
# commented out while its continuation lines were not, which left the script
# unparsable and `model` undefined; the call is restored as plain Python.
model = gensim.models.LdaModel(corpus=gensim_corpus, id2word=id2word,
                               chunksize=chunksize,
                               alpha='auto', eta='auto',
                               iterations=iterations, num_topics=num_topics,
                               passes=passes, eval_every=eval_every)

# Older pyLDAvis releases exposed this as pyLDAvis.gensim.prepare.
gensimvis.prepare(model, gensim_corpus, dictionary)

model.show_topic(3)


def _first_topic(review_tokens):
    """Return the first ``(topic_id, probability)`` pair for one review.

    ``review_tokens`` is a comma-separated token string.

    NOTE(review): ``[0]`` picks the first pair gensim lists, which is not
    necessarily the most probable topic, and raises IndexError when no topic
    is returned -- confirm this is the intended selection.
    """
    bow = dictionary.doc2bow(review_tokens.split(','))
    return model.get_document_topics(bow)[0]


# Infer once per review so that topic id, score and name all come from the
# same inference result instead of three separate calls.
first_topics = df['Review_tokenized'].apply(_first_topic)
df['topics'] = first_topics.apply(lambda pair: pair[0])
df['score'] = first_topics.apply(lambda pair: pair[1])
df['topic_name'] = df['topics'].apply(lambda topic_id: model.show_topic(topic_id))
df['topic_name2'] = df['topic_name'].apply(lambda x: convert_to_topic(x))

df[['Restaurant', 'Review', 'topics','topic_name2','score']]
"""Interpretation

พบว่า Topic Name สามารถแบ่งย่อยได้ 5 เรื่อง 

Topic 0 พูดถึง ดี น้ำ ราคา เนื้อ อาหาร ทาน หวาน อร่อย กิน คุ้ม
Example #13
0
"""## Clear visualization of Lda model with number of topics taken as 12"""

import pyLDAvis
import os
import pickle
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
num_topics = 12
pyLDAvis.enable_notebook()
LDAvis_data_filepath = f'/content/sample_data/ldavis_prepared_{num_topics}'

# Preparing the visualisation is a bit time consuming. Leave this True to
# compute (and cache) it; set it to False to reuse the data pickled by an
# earlier run.
REBUILD_LDAVIS_DATA = True
if REBUILD_LDAVIS_DATA:
    LDAvis_prepared = gensimvis.prepare(ldamodel, doc_term_matrix, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # load the pre-prepared pyLDAvis data from disk
    # NOTE: unpickling runs arbitrary code; only load files written by this
    # notebook.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

# The HTML page is written next to the pickle, with an added .html suffix.
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
# Bare expression: displays the visualisation as the cell output.
LDAvis_prepared
"""The 12 clusters mainly talk about the battery/heating/charging issues and phone's camera quality issues

# coherence score (c_v metric) for number of topics = 12
"""

# Compute Coherence Score using c_v
Example #14
0
#######################################
#
#  20190207 Simple LDA demo build
#
#  Step 04 - visualise the LDA model
#
#######################################

# Raw topic listing (displayed as cell output when run in a notebook).
ldamodel.show_topics()

#or a formatted version
# Widen the column display so long topic strings are not truncated.
pd.set_option('max_colwidth', 700)
num_words = 15
topic_list = ldamodel.show_topics(num_words=num_words, formatted=True)
df = pd.DataFrame(topic_list)
df

# Silences every warning from this point on.
warnings.simplefilter('ignore')

import pyLDAvis

import pyLDAvis.gensim_models as gensimvis
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# sort_topics=False is passed explicitly to the pyLDAvis preparation.
vis_data = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=False)

pyLDAvis.display(vis_data)

#######################################
Example #15
0
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Train a 4-topic LDA model with 10 passes over bow_corpus using 2 workers,
# then list the topics.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 4, id2word = dic, passes = 10, workers = 2)
lda_model.show_topics()

# %% [markdown]
# ### VISUALIZING RESULTS OF LDA

# %% [code] {"jupyter":{"outputs_hidden":false}}
import pyLDAvis
from pyLDAvis import gensim_models

# %% [code] {"jupyter":{"outputs_hidden":false}}
# fig = plt.figure(figsize = (20, 20))
# Render pyLDAvis output inline, prepare the view and display it.
pyLDAvis.enable_notebook()
vis = gensim_models.prepare(lda_model, bow_corpus, dic)
vis

# %% [code] {"jupyter":{"outputs_hidden":false}}
from wordcloud import WordCloud, STOPWORDS
# Module-level stop-word set built from wordcloud's STOPWORDS.
stopwords = set(STOPWORDS)

def show_wordcloud(data):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1)