class LDARecommender(Recommender):
    """Recommender that infers MALLET LDA topics for an input text.

    ``train`` either loads a cached model from ``<train basename>.lda_model``
    in the current working directory or trains a new 100-topic model and
    caches it there.
    """

    def __init__(self):
        return

    def preprocess(self, text):
        """Return the cleaned token list for ``text``."""
        return preprocessing.cleanTokens(text)

    def train(self, train_filename):
        """Load a cached LDA model for ``train_filename`` or train a new one.

        The topic listing is written to ``<train basename>.lda_model.topics``.
        """
        print("train LDA")
        train_name = os.path.basename(train_filename)
        model_filename = train_name + ".lda_model"
        if os.path.isfile(model_filename):
            self.model = LdaMallet.load(model_filename)
        else:
            self.corpus = preprocessing.GensimCorpus(train_filename)
            self.model = LdaMallet(mallet_path, self.corpus, num_topics=100,
                                   id2word=self.corpus.dictionary)
            self.model.save(model_filename)
        topics_str = self.model.show_topics(num_topics=-1)
        # Context manager so the handle is flushed and closed deterministically.
        with open(train_name + ".lda_model.topics", 'w') as topics_file:
            topics_file.write(str(topics_str))

    def recommend(self, input_text):
        """Print the inferred topic distribution and return ``input_text``."""
        # Use the dictionary stored on the model: self.corpus is only set when
        # the model was trained in this process, not when it was loaded from
        # the cache, whereas the model always carries its id2word mapping.
        dictionary = self.model.id2word
        input_bow = dictionary.doc2bow(self.preprocess(input_text))
        input_topics = self.model[input_bow]
        print("lda topics: " + str(input_topics))
        return input_text
def gensim_mallet_lda(self, num_topics=5, num_words=15):
    """Fit a MALLET LDA model through the Gensim wrapper and print a report.

    Uses ``self.gensim_corpus``, ``self.id2word`` and
    ``self.gensim_words_nostops``, which the original docstring says come
    from gensim_preprocessing().

    Args:
        num_topics (int): Desired number of topics to model.
        num_words (int): Number of words to print for each topic.
    """
    model = LdaMallet(
        self.mallet_path,
        corpus=self.gensim_corpus,
        num_topics=num_topics,
        id2word=self.id2word,
    )

    column_labels = self.data_frame.columns.to_numpy()
    label = column_labels[self.col_num]
    print(f"Column {self.col_num} - Label: {label}\n")
    print(f"MALLET LDA Topic Modeling via Gensim with {num_topics} topics:\n")

    # Reduce each (topic id, [(word, weight), ...]) entry to its words only.
    raw_topics = model.show_topics(
        num_topics=num_topics, num_words=num_words, log=False, formatted=False
    )
    topic_word_lists = []
    for entry in raw_topics:
        topic_word_lists.append((entry[0], [pair[0] for pair in entry[1]]))

    for topic_id, words in topic_word_lists:
        print(f"Topic {topic_id}:\n{words}\n")

    coherence = self.coherence_score(
        model, self.gensim_words_nostops, self.id2word
    )
    print(f"Coherence: {coherence}")
def make_mallet_model(main_df, d_path, stop, field, ntopics,
                      mallet_path='mallet/bin/mallet'):
    """Train a MALLET LDA model on ``main_df[field]`` and export term weights.

    Rows are kept only when the stripped ``abstract`` is not ``'nan.'``,
    ``abstract_length`` exceeds 20 and ``Title`` is not null. The remaining
    texts are vectorized into 1- to 3-gram counts, a model with ``ntopics``
    topics is trained, and a term-by-topic weight table is written to
    ``<d_path>/models/mallet_topic_df.csv``.

    Args:
        main_df: DataFrame with ``abstract``, ``abstract_length``, ``Title``
            and ``field`` columns.
        d_path: Base directory; the CSV goes into its ``models`` subfolder.
        stop: Stop words passed to ``CountVectorizer``.
        field: Name of the text column to model.
        ntopics: Number of topics.
        mallet_path: Path to the MALLET executable. Defaults to the
            previously hard-coded location.

    Returns:
        DataFrame indexed by vocabulary term with one ``Topic k`` column per
        topic (1-based).
    """
    usable = main_df[main_df['abstract'].str.strip() != 'nan.'].copy()
    usable = usable[usable['abstract_length'] > 20]
    usable = usable[usable['Title'].notnull()]

    token_vectorizer = CountVectorizer(
        tokenizer=reflection_tokenizer,
        # max_df=500,
        min_df=2,
        stop_words=stop,
        ngram_range=(1, 3))
    token_vectorizer.fit(usable[field])
    # Transposed to terms x documents before conversion to a gensim corpus.
    doc_word = token_vectorizer.transform(usable[field]).transpose()
    corpus = matutils.Sparse2Corpus(doc_word)

    id2word = {idx: token
               for token, idx in token_vectorizer.vocabulary_.items()}

    ldamallet = LdaMallet(mallet_path,
                          corpus=corpus,
                          num_topics=ntopics,
                          id2word=id2word,
                          random_seed=77)

    mallet_topics = pd.DataFrame(
        index=list(id2word.values()),
        columns=['Topic ' + str(x) for x in range(1, ntopics + 1)])
    print(
        ldamallet.show_topics(num_topics=ntopics,
                              num_words=10,
                              formatted=True))

    # Request every vocabulary term per topic so the table is fully populated.
    all_topics = ldamallet.show_topics(num_topics=ntopics,
                                       num_words=len(id2word),
                                       formatted=False)
    for topic in all_topics:
        column = 'Topic ' + str(topic[0] + 1)
        for tupler in topic[1]:
            mallet_topics.loc[tupler[0], column] = tupler[1]

    # Make sure the output folder exists so to_csv does not fail on a fresh
    # checkout.
    models_dir = os.path.join(d_path, 'models')
    os.makedirs(models_dir, exist_ok=True)
    mallet_topics.to_csv(os.path.join(models_dir, 'mallet_topic_df.csv'))
    return mallet_topics
def model_mallet(clean_doc, dictionary, doc_term_matrix):
    """Train a 25-topic MALLET LDA model, report it, and convert it.

    Prints the topics and the ``c_v`` coherence score computed against
    ``clean_doc``, then returns the result of
    ``ldamallet.malletmodel2ldamodel`` applied to the trained model.
    """
    mallet_model = LdaMallet(
        mallet_path,
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=25,
        workers=3,
    )

    print("Topics generated with the mallet LDA model are:\n")
    pprint(mallet_model.show_topics(formatted=False))
    print("----------------------------------------------------")

    scorer = CoherenceModel(
        model=mallet_model,
        texts=clean_doc,
        dictionary=dictionary,
        coherence='c_v',
    )
    score = scorer.get_coherence()
    print(f"coherence score: {score}")

    return ldamallet.malletmodel2ldamodel(mallet_model)
def main():
    """Train a 10-topic MALLET LDA model and export its results.

    Builds a dictionary and bag-of-words corpus from
    ``wenzhang_Lemmatizer1.texts2``, trains and saves the model, and writes
    the top-20 topic words and the document-topic matrix via the project's
    writer helpers.

    Returns:
        Tuple ``(texts, word_id, topic_words, doc_topics, num_topics)``.
    """
    num_topics = 10
    #doc_topics_path='C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    # Raw string: "\M" is not a valid escape sequence and triggers a warning
    # in a normal string literal; the resulting path is unchanged.
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    #dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    #mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))
    # fdoctopics() is the path of MALLET's document-topics output file.
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  #doc_topics_path
    #print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics
# Load the speeches; the context manager closes the file even if read() fails.
with open("discursos_all.txt", "r") as f:
    discursos_file = f.read()
# NOTE(review): eval() executes whatever the file contains. If the file holds
# a plain Python literal, ast.literal_eval would be the safe replacement.
res = eval(discursos_file)

elapsed_time = time.time() - start_time
# "%m" is the month directive, not a sub-second field, so the former trailing
# ":%m" always printed ":01"; it has been dropped from both timing messages.
print(time.strftime("Discursos importados, demorou %H:%M:%S",
                    time.gmtime(elapsed_time)))

start_time = time.time()
data = [a.split() for a in res]
dictionary = Dictionary(data)
corpus = [dictionary.doc2bow(t) for t in data]
mallet_path = 'X:\\Programs\\mallet\\mallet-2.0.8\\bin\\mallet.bat'
lda = LdaMallet(mallet_path, corpus=corpus, id2word=dictionary,
                num_topics=500)
elapsed_time = time.time() - start_time
print(time.strftime("Lda model criado, demorou %H:%M:%S",
                    time.gmtime(elapsed_time)))

# NOTE(review): show_topics() is called without num_topics, so only its
# default number of topics is written although the model has 500 - confirm
# whether num_topics=-1 was intended.
with open("topics_500_latest.txt", 'w+') as f:
    for index, topic in lda.show_topics(formatted=False, num_words=15):
        f.write('[{}] - '.format(index))
        f.write(', '.join(str(line[0]) for line in topic))
        f.write('\n')
# Train a 300-topic MALLET LDA model.
ldamallet = LdaMallet(mallet_path,
                      corpus=corpus,
                      num_topics=300,
                      id2word=id2word_dictionary)
print('LDA Model trained')

try:
    ldamallet.save('ldamallet_mag.model')
except OverflowError:
    # Fallback: pickle the model directly with the highest protocol available.
    print("Trying to pickle model using protocol 4")
    with open('ldamallet_mag.model', 'wb') as pick:
        # Fixed: the original called pick.dump(...) on the file object, which
        # has no dump method; the pickle module function is what is needed.
        pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
print("Lda model saved to disk")

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                           texts=data_stemmed,
                                           dictionary=id2word_dictionary,
                                           coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

# Memory-friendly
# Create generator
# docstream is a generator which will be passed to Dictionary to create a Gensim dictionary
# docstream = (tokens for tokens in stream_from_file(filename))
- 8 topics, ~ local optimum - 30 topic, ~ global optimum """ # model with 8 topics # --+ estimate model lda_8 = LdaMallet( mallet_path, corpus=corpus, id2word=dictionary, num_topics=8, random_seed=123 ) # --+ print topics (20 words per topic) lda_8.print_topics(num_topics=8, num_words=20) # --+ translate topic modeling outcome lda_8 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_8) # --+ term-to-topic probabilities (10 words per topic) top_terms_line = lda_8.show_topics(num_topics=8, num_words=10) # ----+ rearrange data on top 10 terms per topic top_terms_m = [] for i in top_terms_line: topic_num = i[0] prob_terms = i[1].split("+") for term_sort, term in enumerate(prob_terms): weight = float(term.split("*")[0]) term = term.split("*")[1].strip('"| ') top_terms_m.append([topic_num, term_sort, weight, term]) df = pd.DataFrame(top_terms_m) # ----+ rename columns old_names = [0, 1, 2, 3] new_names = ["topic_n", "term_sort", "weight", "term"] cols = dict(zip(old_names, new_names)) df.rename(columns=cols, inplace=True)
# Vectorize the documents and convert the term x document matrix into a
# gensim corpus.
count_vectorizer.fit(docs)
doc_word = count_vectorizer.transform(docs).transpose()
corpus = matutils.Sparse2Corpus(doc_word)

# vocab creation
word2id = dict(count_vectorizer.vocabulary_)
id2word = {idx: token for token, idx in count_vectorizer.vocabulary_.items()}
dictionary = corpora.Dictionary()
dictionary.id2token = id2word
dictionary.token2id = word2id

# topic modeling
ldamallet = LdaMallet(MALLET_PATH,
                      corpus=corpus,
                      num_topics=num_topics,
                      id2word=id2word,
                      iterations=400)

# save topic model to file; the context manager closes the file even if
# pickling fails part-way
with open("english_topics_{}.pkl".format(sys.argv[1]), "wb") as topic_file:
    pickle.dump(ldamallet.show_topics(formatted=False, num_topics=num_topics),
                topic_file)

# get NPMI coherence
coherence = CoherenceModel(model=ldamallet,
                           texts=texts,
                           dictionary=dictionary,
                           coherence='c_npmi')
print("coherence:", coherence.get_coherence())
# --- gensim LdaModel: train, record elapsed time, report topics ---
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    alpha=0.1,
    eta='auto',
    iterations=args.num_iterations,
    num_topics=args.num_topics,
)
total1 = time.time() - start1

lda_times_path = args.output + "_lda" + ".times"
with open(lda_times_path, 'w') as out:
    out.write('time: %f s' % float(total1))

topics1 = lda_model.show_topics(
    num_topics=args.num_topics, num_words=50, log=True, formatted=False
)
report_topics(args.output + "_lda", topics1, limit=50)

# --- MALLET wrapper: same procedure, timed from start2 ---
start2 = time.time()
lda_mallet_model = LdaMallet(
    './Mallet/bin/mallet',
    corpus=corpus,
    id2word=dictionary,
    alpha=0.1,
    iterations=args.num_iterations,
    num_topics=args.num_topics,
)
total2 = time.time() - start2

mallet_times_path = args.output + "_mallet" + ".times"
with open(mallet_times_path, 'w') as out:
    out.write('time: %f s' % float(total2))

topics2 = lda_mallet_model.show_topics(
    num_topics=args.num_topics, num_words=50, log=True, formatted=False
)
report_topics(args.output + "_mallet", topics2, limit=50)
def model_topics(era, n_topics=8, n_iterations=2500):
    """Conducts topic modeling with supplied parameters.

    Relies on a MALLET binary in the src directory. Note: this binary is
    not included in the GitHub repository due to storage restrictions.

    Saves topic modeling numerical results to CSV file and topic words to
    text file.

    Parameters
    ----------
    era : str
        Century label ('19th' or '20th'). In this function it is only used
        to name the output files; the corpus is read as-is from
        ``data/corpus.pickle``.
    n_topics : int, optional
        Number of topics to assume for modeling, by default 8
    n_iterations : int, optional
        Number of iterations to run LDA algorithm, by default 2500
    """
    parent_dir = Path(__file__).parents[1]
    seed = 1921

    # Context manager so the pickle file handle is closed promptly.
    with open(parent_dir / 'data/corpus.pickle', 'rb') as corpus_file:
        dictionary, bow_corpus, IDs = pickle.load(corpus_file)

    # cast posix path to string for gensim connection to mallet
    path_to_mallet_binary = str(parent_dir / 'src//mallet-2.0.8/bin/mallet')
    model = LdaMallet(path_to_mallet_binary,
                      corpus=bow_corpus,
                      num_topics=n_topics,
                      id2word=dictionary,
                      iterations=n_iterations,
                      random_seed=seed)

    # Map each document ID to its per-topic weights.
    topics_table = {}
    docs = list(model.load_document_topics())
    for i, doc_id in enumerate(
            tqdm(IDs, desc='Reading results into dataframe')):
        topics_table[doc_id] = [t[1] for t in docs[i]]

    # Collect title and year for each distinct reference ID in one pass,
    # keeping the first row seen per ID. This replaces filtering the whole
    # frame once per row.
    reference = pd.read_csv(parent_dir / 'data/reference.csv')
    title_dict = OrderedDict()
    year_dict = OrderedDict()
    reference_rows = zip(reference['ID'], reference['title'],
                         reference['date'])
    for ID, title, date in tqdm(reference_rows,
                                total=len(reference),
                                desc='Getting titles and years from ID'):
        if ID not in title_dict:
            title_dict[ID] = title
            year_dict[ID] = date

    column_names = [f'topic_{i}' for i in range(0, n_topics)]
    results = pd.DataFrame.from_dict(topics_table,
                                     orient='index',
                                     columns=column_names)
    results.index.name = "ID"
    # NOTE(review): titles and years are attached by position, so this
    # assumes reference.csv lists the same documents in the same order as
    # the pickled IDs - confirm.
    results.insert(0, "title", list(title_dict.values()))  # add titles
    results.insert(1, "ID", IDs)  # add ids
    results.insert(2, "year", list(year_dict.values()))  # add years

    # save results to file
    results.to_csv(parent_dir / 'data' / f'{era}_topics.csv', index=False)

    # save topic words to text file (opened in append mode, so repeated runs
    # add to the existing file)
    topics = model.show_topics(num_topics=-1)
    with open(parent_dir / 'data' / f'{era}_topics.txt', 'a') as output:
        output.writelines(str(line) + '\n' for line in topics)
''' # model with 9 topics # --+ estimate model lda_9 = LdaMallet(mallet_path, corpus=corpus, id2word=dictionary, num_topics=9, random_seed=123) # --+ print topics (20 words per topic) lda_9.print_topics(num_topics=9, num_words=20) # --+ translate topic modeling outcome lda_9 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_9) # --+ term-to-topic probabilities (10 words per topic) top_terms_line = lda_9.show_topics(num_topics=9, num_words=10) # ----+ rearrange data on top 10 terms per topic top_terms_m = [] for i in top_terms_line: topic_num = i[0] prob_terms = i[1].split('+') for term_sort, term in enumerate(prob_terms): weight = float(term.split('*')[0]) term = term.split('*')[1].strip('"| ') top_terms_m.append([topic_num, term_sort, weight, term]) df = pd.DataFrame(top_terms_m) # ----+ rename columns old_names = [0, 1, 2, 3] new_names = ['topic_n', 'term_sort', 'weight', 'term'] cols = dict(zip(old_names, new_names))
# Train a 140-topic MALLET model; MALLET's working files use output_path as
# their prefix.
output_path = 'd:/code/gc_text_analysis/mallet_output/'
num_topics = 140
model = LdaMallet(
    path_to_mallet_binary,
    corpus=bow_docs,
    workers=4,
    iterations=2000,
    num_topics=num_topics,
    id2word=dictionary,
    prefix=output_path,
)
model.save('gc_lda_model.pkl')

# Build the reverse id -> token mapping, then a table of words with their
# counts from dictionary.dfs, sorted by count in descending order.
dictionary.id2token = {
    token_id: token for token, token_id in dictionary.token2id.items()
}
words_freq = [
    (dictionary.id2token[token_id], cnt)
    for token_id, cnt in dictionary.dfs.items()
]
words_freq.sort(key=lambda pair: pair[1], reverse=True)
words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])

# c_v coherence of the trained model against the n-gram documents.
coherence_model_lda = CoherenceModel(
    model=model,
    texts=ngram_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

# Keep only the word lists, dropping the topic ids.
topics = model.show_topics(
    num_topics=num_topics, num_words=10, log=False, formatted=False
)
topics = list(zip(*topics))[1]

# Infer topics for the last 73 documents of the corpus.
gc_topics = model[bow_docs[-73:]]