import csv
import logging

from gensim.corpora import Dictionary
# LdaMallet lives in gensim.models.wrappers in gensim < 4.0; the wrapper was dropped
# from later gensim releases.
from gensim.models.wrappers import LdaMallet


def lda(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"

    # Read the tweets and tokenize each one on whitespace.
    tweets = []
    with open(filename, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for tweet in reader:
            tweets.append(tweet['text'].split(' '))

    # Build the dictionary and the bag-of-words corpus.
    dictionary = Dictionary(tweets)
    corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

    # Train one model per topic count and write the top 20 words of each topic,
    # one topic per line.
    for num_topics in [20, 30, 50]:
        model = LdaMallet(path_to_mallet_binary, corpus=corpus,
                          num_topics=num_topics, id2word=dictionary)
        with open('lda' + str(num_topics) + '.txt', 'w') as result:
            for topic in range(num_topics):
                for word in model.show_topic(topic, topn=20):
                    result.write(word[0] + ' ')
                result.write('\n')
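
# Hedged usage sketch (not in the original source): 'tweets.csv' is a hypothetical
# input file whose 'text' column holds already-cleaned tweet text, and the Mallet
# binary path hard-coded above must exist locally for the call to succeed.
# lda('tweets.csv')  # writes lda20.txt, lda30.txt and lda50.txt, one topic per line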
# Assumes: import pandas as pd; LdaMallet imported from gensim.models.wrappers
# (gensim < 4.0). DataFrame.append as used below requires pandas < 2.0.
def format_topics_sentences_mallet(self, ldamodel: LdaMallet, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get the main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: x[1], reverse=True)
        # Get the dominant topic, percentage contribution and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num, topn=8)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True)
            else:
                break

    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add the original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
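
# Hedged illustration (not in the original source): ldamodel[corpus] yields one list
# of (topic_id, proportion) pairs per document; the method above keeps only the
# highest-proportion pair. A minimal standalone sketch of that core step:
def dominant_topic_per_doc(ldamodel, corpus):
    """Return the (topic_id, proportion) pair with the largest proportion for each document."""
    return [max(doc_topics, key=lambda pair: pair[1]) for doc_topics in ldamodel[corpus]]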
# Assumes: import nltk; from gensim import corpora; LdaMallet imported from
# gensim.models.wrappers (gensim < 4.0).
def run_lda(self, processed_sentences):
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    # list containing the final topic keywords
    topic_top_words = []

    documents = [comment.split() for comment in processed_sentences if comment]
    dictionary = corpora.Dictionary(documents)
    # Filter out words that occur in fewer than 5 comments or in more than half of the comments
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents]

    mallet_path = 'C:\\Mallet-2.0.8\\bin\\mallet'
    optimization_interval = 50
    lda_alpha = 1
    lda = LdaMallet(mallet_path,
                    doc_term_matrix,
                    num_topics=self.number_of_topics,
                    id2word=dictionary,
                    optimize_interval=optimization_interval,
                    alpha=lda_alpha)

    # This list contains the word probabilities given a topic
    topic_words_and_probs = []
    for i in range(self.number_of_topics):
        # Get the top number_of_lda_keywords_for_assignment words and their probabilities for the topic
        topic_words_and_probs.append(
            lda.show_topic(i, topn=self.number_of_lda_keywords_for_assignment))

    # Drop zero-probability entries and record every surviving topic word.
    for i in range(len(topic_words_and_probs)):
        temp = []
        for j in topic_words_and_probs[i]:
            if j[1] > 0.0:
                temp.append(j)
                self.total_topic_word.append(j[0])
        topic_words_and_probs[i] = temp

    for i in range(self.number_of_topics):
        # Get the top keywords for the topic and extract the top nouns
        topic_words = [component[0] for component in topic_words_and_probs[i]]
        final_topic_words = []
        noun_tags = ['NN', 'NNS', 'NP', 'NPS']
        for word in topic_words:
            if len(final_topic_words) >= self.number_of_lda_keywords:
                break
            pos = nltk.pos_tag([word])
            word = lemmatizer.lemmatize(word)
            if word not in final_topic_words and pos[0][1] in noun_tags:
                final_topic_words.append(word)
        topic_top_words.append(final_topic_words)

    return topic_top_words, topic_words_and_probs
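
# Hedged illustration (not in the original source): the keyword filter above keeps only
# lemmatized nouns. A minimal standalone version of that step, assuming NLTK's
# 'averaged_perceptron_tagger' and 'wordnet' data are installed:
import nltk

def keep_top_nouns(words, limit):
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    noun_tags = {'NN', 'NNS', 'NP', 'NPS'}
    kept = []
    for word in words:
        if len(kept) >= limit:
            break
        tag = nltk.pos_tag([word])[0][1]    # POS-tag the single word
        lemma = lemmatizer.lemmatize(word)  # noun lemmatization, as above
        if lemma not in kept and tag in noun_tags:
            kept.append(lemma)
    return kept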
import json

# Assumes gensim < 4.0, where the Mallet wrapper still ships with gensim;
# `args` is a parsed-arguments object defined elsewhere in the script.
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet


def get_topics(num, corpus, id2word, output_dir, all_sentences):
    print(num)
    ldamallet = LdaMallet(args.mallet_dir,
                          corpus=corpus,
                          num_topics=num,
                          prefix=output_dir + "/" + str(num),
                          workers=4,
                          id2word=id2word,
                          iterations=1000,
                          random_seed=42)

    # Score the model with c_v topic coherence.
    coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                               texts=all_sentences,
                                               dictionary=id2word,
                                               coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    print('\nCoherence Score: ', coherence_ldamallet)

    # Dump the top words of every topic to JSON and save the model itself.
    keywords = {i: ", ".join([word for word, prop in ldamallet.show_topic(i)])
                for i in range(ldamallet.num_topics)}
    with open(output_dir + "/" + str(num) + '_words.json', 'w') as f:
        f.write(json.dumps(keywords))
    ldamallet.save(output_dir + "/" + str(num))
    # ldamallet.show_topics(num_topics=num, formatted=True)
    return coherence_ldamallet
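
# Hedged usage sketch (not in the original source): since get_topics returns the c_v
# coherence, a caller would typically sweep candidate topic counts and keep the best
# score. Function and variable names below are hypothetical.
def sweep_topic_counts(candidate_nums, corpus, id2word, output_dir, all_sentences):
    scores = {num: get_topics(num, corpus, id2word, output_dir, all_sentences)
              for num in candidate_nums}
    best_num = max(scores, key=scores.get)
    return best_num, scores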