Example no. 1
0
# Creates tokenized dataset that is separated by poems: one document per
# poem (each poem is an iterable of text lines in `data`); every token is
# dropped if it is a stop word, otherwise lemmatized.
documents_without_stopwords = [
    [
        lemmatizer.lemmatize(word)
        for line in poem
        for word in line.split()
        if word not in stop_words  # idiomatic membership test
    ]
    for poem in data
]

# Assign an integer id to every token, then turn each document into a
# gensim bag-of-words: a list of (token_id, count) pairs.
id2word = Dictionary(documents_without_stopwords)
corpus = list(map(id2word.doc2bow, documents_without_stopwords))


# Fit a nonparametric HDP first: it infers the number of topics from the
# data, and its posterior seeds the parametric LDA below.
topic_model=HdpModel(corpus=corpus,id2word=id2word)
# hdp_to_lda() returns (alpha, beta): per-topic document prior and the
# topic-word matrix of the closest equivalent LDA.
alpha,beta=topic_model.hdp_to_lda()
#Trained a HDP model and used the num_topics, alpha and beta values as guidelines for a LDA model
# Truncate both priors to the first 14 topics and reuse them for LDA
# (gensim's `eta` accepts a per-topic-per-word matrix).
lda_topic_model=LdaModel(corpus=corpus,id2word=id2word,num_topics=14,alpha=alpha[:14],eta=beta[:14])
# Not sure why Topics don't make as much sense as I would have liked
# pprint(lda_topic_model.print_topics())



#                                    #
#---------- Build The Model ---------#
#                                    #

def makeModel():
    """Start building a Keras Sequential model with an Embedding layer.

    NOTE(review): the body appears truncated -- it only adds the embedding
    layer and never returns `model`; confirm the remaining layers and the
    return statement were lost when this snippet was excerpted.
    """
    model = Sequential()
    # How to decide the output dim of embedding?
    # Embedding layer: vocabulary size `total_words`, 500-dim vectors,
    # fixed input sequence length `input_len` (both presumably defined
    # elsewhere in the file -- verify).
    model.add(Embedding(total_words,500,input_length= input_len))
# Fit the scikit-learn LDA model on the term-frequency matrix `tf` and
# print the strongest words of each topic.
lda.fit(tf)
print("\nTopics in LDA model:")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out() -- confirm the pinned sklearn version.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# Whitespace-tokenise every filtered paragraph, then build the gensim
# id mapping and the bag-of-words corpus from the token lists.
print("processing dict and corpus")
list_all_paragraphe_split = list(map(str.split, list_all_paragraphe_filtered))
dictionary = Dictionary(list_all_paragraphe_split)
corpus = [dictionary.doc2bow(tokens) for tokens in list_all_paragraphe_split]

# Fit an HDP model to see how many topics it discovers in the corpus,
# then inspect the per-topic concentration weights.
print("Topics in HDP model :")
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdptopics = hdpmodel.show_topics(formatted=False)
print("there are "+str(len(hdptopics))+" topics in the data")
# hdp_to_lda() returns (alpha, beta); alpha holds one weight per topic,
# so plotting it shows how quickly the topic weights decay.
alpha = hdpmodel.hdp_to_lda()[0]
plt.figure()
plt.plot(alpha)
plt.show()

# Fit the NMF model on tf-idf features using the generalized
# Kullback-Leibler loss with multiplicative-update solver.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (len(list_all_paragraphe_filtered), n_features))
# NOTE(review): NMF's `alpha` parameter was deprecated in scikit-learn 1.0
# and replaced by alpha_W/alpha_H -- confirm the pinned sklearn version.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done with nmf ")

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out() -- confirm the pinned sklearn version.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
                           'coherence': coherence_lda},
                         index = topic_count_lda)

# Preview the first rows of the LDA coherence table (notebook cell output).
topics_lda.head(10)

# %%
# Plot each column of the table in its own subplot.
lines = topics_lda.plot.line(subplots = True)

# %% [markdown]
# ##### Gensim also includes Hierarchical Dirichlet process (HDP). HDP is a powerful mixed-membership model for 
# the unsupervised analysis of grouped data. Unlike its finite counterpart, latent Dirichlet allocation, 
# the HDP topic model infers the number of topics from the data. Here we have used Online HDP, 
# which provides the speed of online variational Bayes with the modeling flexibility of the HDP.
#
# See https://radimrehurek.com/gensim/models/hdpmodel.html

# %%
# Create a HDP model - default for hdp is 150
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# %%
# Show the 20 strongest topics as human-readable formatted strings.
hdptopics = hdpmodel.show_topics(num_topics = 20, formatted=True)
hdptopics

# %%
# Topic-word weight matrix; .shape reveals (n_topics, vocabulary_size).
hdp_topics = hdpmodel.get_topics()
hdp_topics.shape

# %%
# Convert the HDP posterior into (alpha, beta) priors usable by LdaModel.
hdpmodel.hdp_to_lda()