# Creates tokenized dataset that is seperated by poems, each word is checked # against the stop_words list and is lemmatized documents_without_stopwords=[] for poem in data: tmp=[] for line in poem: tmp.extend([lemmatizer.lemmatize(word) for word in line.split() if not(word in stop_words)]) documents_without_stopwords.append(tmp) id2word=Dictionary(documents_without_stopwords) corpus = [id2word.doc2bow(text) for text in documents_without_stopwords] topic_model=HdpModel(corpus=corpus,id2word=id2word) alpha,beta=topic_model.hdp_to_lda() #Trained a HDP model and used the num_topics, alpha and beta values as guidelines for a LDA model lda_topic_model=LdaModel(corpus=corpus,id2word=id2word,num_topics=14,alpha=alpha[:14],eta=beta[:14]) # Not sure why Topics don't make as much sense as I would have liked # pprint(lda_topic_model.print_topics()) # # #---------- Build The Model ---------# # # def makeModel(): model = Sequential() # How to decide the output dim of embedding? model.add(Embedding(total_words,500,input_length= input_len))
def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted sklearn vectorizer as a list.

    Prefers ``get_feature_names_out()`` (sklearn >= 1.0) and falls back to
    the older ``get_feature_names()`` — which was removed in sklearn 1.2 —
    so this script runs on both old and new scikit-learn releases.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names  # pre-1.0 sklearn only
    return list(getter())


# --- LDA (sklearn) -------------------------------------------------------
# `lda`, `tf`, `tf_vectorizer`, `print_top_words`, `n_top_words` come from
# earlier in this script.
lda.fit(tf)
print("\nTopics in LDA model:")
tf_feature_names = _feature_names(tf_vectorizer)
print_top_words(lda, tf_feature_names, n_top_words)

# --- HDP (gensim) --------------------------------------------------------
# Build a gensim dictionary + bag-of-words corpus from the filtered
# paragraphs, then let HDP infer the number of topics from the data.
print("processing dict and corpus")
list_all_paragraphe_split = [sentence.split() for sentence in list_all_paragraphe_filtered]
dictionary = Dictionary(list_all_paragraphe_split)
corpus = [dictionary.doc2bow(text) for text in list_all_paragraphe_split]

print("Topics in HDP model :")
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdptopics = hdpmodel.show_topics(formatted=False)
print("there are "+str(len(hdptopics))+" topics in the data")

# Plot the LDA-equivalent topic weights inferred by HDP; where the curve
# flattens out hints at how many topics actually carry probability mass.
alpha = hdpmodel.hdp_to_lda()[0]
plt.figure()
plt.plot(alpha)
plt.show()

# --- NMF (sklearn) -------------------------------------------------------
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (len(list_all_paragraphe_filtered), n_features))
# NOTE(review): the `alpha=` parameter was split into `alpha_W`/`alpha_H`
# (deprecated in sklearn 1.0, removed in 1.2) -- this call only works on
# older scikit-learn; update the keyword when the version is pinned.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done with nmf ")

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = _feature_names(tfidf_vectorizer)
# NOTE(review): the line below is the tail of a pandas DataFrame(...) call
# whose opening lines sit before this chunk -- incomplete from this view.
'coherence': coherence_lda}, index = topic_count_lda)
# Preview the per-topic-count LDA metrics table.
topics_lda.head(10)

# %%
# Line plot (one subplot per metric column) of the table above.
lines = topics_lda.plot.line(subplots = True)

# %% [markdown]
# ##### Gensim also includes Hierarchical Dirichlet process (HDP). HDP is a powerful mixed-membership model for
# the unsupervised analysis of grouped data. Unlike its finite counterpart, latent Dirichlet allocation,
# the HDP topic model infers the number of topics from the data. Here we have used Online HDP,
# which provides the speed of online variational Bayes with the modeling flexibility of the HDP.
#
# See https://radimrehurek.com/gensim/models/hdpmodel.html

# %%
# Create a HDP model - default for hdp is 150
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# %%
# Show the 20 strongest HDP topics as formatted strings.
hdptopics = hdpmodel.show_topics(num_topics = 20, formatted=True)
hdptopics

# %%
# Full topic-term matrix; .shape shows how many topics HDP actually kept.
hdp_topics = hdpmodel.get_topics()
hdp_topics.shape

# %%
# Convert the HDP posterior into LDA-style (alpha, beta) parameters.
hdpmodel.hdp_to_lda()