def get_lda(text_dictionary):
    train = []
    for key, line in text_dictionary.items():
        line = line.strip().split(' ')
        train.append(line)
    print(len(train))
    print(' '.join(train[2]))

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

    topic_list = lda.print_topics(20)
    print(type(topic_list))
    print(len(topic_list))
    for topic in topic_list:
        print(topic)

    print("Topic 1")
    print(lda.print_topic(1))

    print('Given a new document, output its topic distribution')
    # test_doc = list(new_doc)  # tokenize the new document
    test_doc = train[2]  # inspect the topic distribution of the third training sample
    doc_bow = dictionary.doc2bow(test_doc)  # convert the document to bag-of-words
    doc_lda = lda[doc_bow]  # topic distribution of the new document
    print(doc_lda)
    for topic in doc_lda:
        print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))
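# A minimal usage sketch for get_lda (assumption: the input maps document ids
# to whitespace-segmented strings; the documents below are hypothetical, and
# corpora / LdaModel are imported from gensim as in the function above).
docs = {
    'doc1': 'topic model text mining',
    'doc2': 'deep learning neural network',
    'doc3': 'topic model document clustering',
}
get_lda(docs)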
def save_model():
    """
    Save the LDA model.
    :return:
    -----------------
    corpus: [
        [(word_id, word_freq), (word_id, word_freq), ...],
        [(word_id, word_freq), (word_id, word_freq), ...],
        .......
    ]  # a collection of sparse vectors
    id2word: {'word1': 0, 'word2': 1, ...}
    """
    train_set = get_train_set()
    word_dict = Dictionary(train_set)  # build the dictionary; each word maps to an integer id
    corpus_list = [word_dict.doc2bow(text) for text in train_set]  # word counts as sparse vectors
    lda = LdaModel(
        corpus=corpus_list,
        id2word=word_dict,
        num_topics=100,
        # passes=5,  # number of passes over the corpus
        alpha='auto')
    lda.print_topic(99)
    # save the LDA model
    lda.save(lda_model_path)
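# A minimal sketch of loading the model saved above (assumption: lda_model_path
# is the same path used in save_model).
from gensim.models import LdaModel

lda = LdaModel.load(lda_model_path)
print(lda.print_topic(99))  # same topic inspected before saving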
def testLDA(file):
    train = []
    with open(file, encoding='UTF-8') as f:
        lines = f.readlines()
    stopwords = open("file\\stopwords_cn.txt", encoding='UTF-8').readlines()
    stopwords = [w.strip() for w in stopwords]
    for line in lines:
        line = line.split()
        train.append([w for w in line if w not in stopwords])
    dictionary = corpora.Dictionary(train)  # build the dictionary ("dict" would shadow the builtin)
    # use the dictionary to turn string documents into id-based document vectors
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    lda.print_topic(2)
    lda.save('file\\lda.model')
def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r', encoding='utf8').readlines()
    stopwords = [w.strip() for w in stopwords]

    fp = codecs.open('D:\\nlp\\corpora\\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([w for w in line if w not in stopwords])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)
    # save the model (load it later with LdaModel.load)
    lda.save('D:\\nlp\\corpora\\news.model')
def create_gensim_lda_model(doc_clean, number_of_topics):
    """
    Input  : clean documents and the number of topics
    Purpose: create an LDA model using gensim
    Output : the LDA model, the dictionary and the document-term matrix
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    # generate and train the LDA model
    ldamodel = LdaModel(doc_term_matrix,
                        num_topics=number_of_topics,
                        id2word=dictionary,
                        alpha="auto",
                        eval_every=5)
    # print(ldamodel.print_topics(num_topics=number_of_topics, num_words=words))
    # ldamodel.print_topics(-1)
    for i in range(ldamodel.num_topics):  # range(0, num_topics-1) would skip the last topic
        print(ldamodel.print_topic(i))
    return ldamodel, dictionary, doc_term_matrix
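# prepare_corpus is assumed by create_gensim_lda_model but not shown here;
# a minimal sketch consistent with how it is used (returning a Dictionary and
# a bag-of-words corpus) might look like:
from gensim import corpora

def prepare_corpus(doc_clean):
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix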
def extract_topics(words):
    word_id_map = Dictionary([words])
    # drop tokens whose document frequency is exactly 2
    word_id_map.filter_tokens([id for id, occurrence in word_id_map.dfs.items()
                               if occurrence == 2])
    word_id_map.compactify()
    deals_corpus = [word_id_map.doc2bow(words)]
    lda = LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15,
                   update_every=1, chunksize=1000, passes=1)
    topics = []
    for i in range(15):
        tokens = lda.print_topic(i).split('+')
        topic_scores = []
        for token in tokens:
            score, token_val = token.split('*')
            topic_scores.append((token_val.strip(), score.strip()))
        topics.append(topic_scores)
    return topics
def get_topics_lda(tokens, n_topics=10):
    """
    Topic modeling with LDA via the `gensim` package. LDA is a little better
    than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for this task it is a
    lighter option to install and run; it can also be run distributed and a
    trained model can be updated incrementally.
    :param tokens: preprocessed tokens, for faster dictionary building
    :param n_topics: number of topics to decompose the data into
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)  # drop words that occur only once
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print(dictionary)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print(mm)
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics,
                   update_every=1, chunksize=1000, passes=1)
    topics = []
    for i in range(n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w.strip(), score.strip()))
        topics.append(topic)
    return topics
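# A minimal usage sketch (assumptions: `tokens` is a list of token lists, one
# per deal/document as the docstring describes, and a resources/ directory
# exists so the dictionary and corpus can be cached; the data is hypothetical).
tokens = [
    ['cheap', 'pizza', 'deal'],
    ['spa', 'massage', 'discount'],
    ['pizza', 'discount', 'coupon'],
]
for topic in get_topics_lda(tokens, n_topics=2):
    print(topic)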
class LDA_result(object):

    def __init__(self, abtract_complete_true, num_topics=4, chunksize=1000,
                 passes=60, iterations=600, eval_every=None):
        self.num_journal = len(abtract_complete_true)
        self.abtract_complete_true = abtract_complete_true
        self.abtract_complete = self.abtract_complete_combination()
        self.dictionary = corpora.Dictionary(self.abtract_complete)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.abtract_complete]
        self.temp = self.dictionary[0]  # force dictionary loading so id2token is populated
        self.id2word = self.dictionary.id2token
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.iterations = iterations
        self.eval_every = eval_every
        # build the model
        self.model = LdaModel(corpus=self.corpus, id2word=self.id2word,
                              chunksize=self.chunksize,
                              alpha='auto', eta='auto',
                              iterations=self.iterations, num_topics=self.num_topics,
                              passes=self.passes, eval_every=self.eval_every)

    # combine the abstract token lists
    def abtract_complete_combination(self):
        abtract_complete = []
        for journal_word_list in self.abtract_complete_true.values():
            abtract_complete.append(journal_word_list)
        return abtract_complete

    # describe the corpus
    def description(self):
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))

    # convert to tf-idf vectors
    def word2tfidf(self):
        tfidf = models.TfidfModel(self.corpus)
        corpusTfidf = tfidf[self.corpus]
        return corpusTfidf

    # print the keywords of each topic
    def key_words(self):
        top_topics = self.model.top_topics(self.corpus)
        pprint(top_topics)

    # each row contains the topic words and their weights
    def key_weight(self):
        print(self.model.print_topic(0, 10))
        print(self.model.print_topic(1, 10))

    # show which topic the first training document belongs to (mostly illustrative)
    def topic_belong(self):
        for index, score in sorted(self.model[self.corpus[0]], key=lambda tup: -1 * tup[1]):
            print("Score: {}\n Topic: {}".format(score, self.model.print_topic(index, 10)))

    # visualize the LDA model
    def visible(self):
        vis_wrapper = pyLDAvis.gensim.prepare(self.model, self.corpus, self.dictionary)
        pyLDAvis.display(vis_wrapper)
        pyLDAvis.save_html(vis_wrapper, "lda%dtopics.html" % self.num_topics)
        pyLDAvis.show(vis_wrapper)

    # output the probability of each training document belonging to each topic
    def community_belong(self):
        journal_community = {}
        for i, element in enumerate(self.abtract_complete_true):  # was missing "self."
            journal_community[element] = []
            for index, score in sorted(self.model[self.corpus[i]], key=lambda tup: -1 * tup[1]):
                if score > 0.2:
                    journal_community[element].append(str(index))
                    print(index, score)
        return journal_community

    # given a new corpus of texts
    # @staticmethod
    # def word_corpus(abtract_complete):
    #     dictionary = corpora.Dictionary(abtract_complete)
    #     corpus = [dictionary.doc2bow(text) for text in abtract_complete]
    #     return corpus

    # decide which topics a new text belongs to
    def identify_community(self, abtract_complete):
        corpus = self.dictionary.doc2bow(abtract_complete)
        community = []
        for index, score in sorted(self.model[corpus], key=lambda tup: -1 * tup[1]):
            if score > 0.2:
                community.append(str(index))
        return community
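# A minimal usage sketch for LDA_result (assumption: abtract_complete_true maps
# a journal name to its pre-tokenized abstract, as the class methods imply; the
# data below is hypothetical and gensim imports are as in the class body).
abtract_complete_true = {
    'journal_a': ['topic', 'model', 'text', 'mining'],
    'journal_b': ['neural', 'network', 'deep', 'learning'],
}
result = LDA_result(abtract_complete_true, num_topics=2, passes=10)
result.description()
result.key_weight()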
model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                 alpha='auto', eta='auto',
                 iterations=iterations, num_topics=num_topics,
                 passes=passes, eval_every=eval_every)

# (note: in newer gensim the second positional argument of top_topics is
# `texts`, so the word count should be passed as a keyword there)
top_topics = model.top_topics(corpus, 5)
# Average topic coherence is the sum of topic coherences of all topics,
# divided by the number of topics.
avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

model.print_topic(1, 30)
model.print_topic(3, 30)

# show which topic a training document belongs to
for index, score in sorted(model[corpus[0]], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, model.print_topic(index, 10)))

# output the probability of the training document belonging to each topic
for index, score in sorted(model[corpus[0]], key=lambda tup: -1 * tup[1]):
    print(index, score)

# decide which topic a test document belongs to
# unseen_document = [" ".join(text_i) for text_i in clean_text4[130]]
# unseen_document = " ".join(unseen_document)
unseen_document = text[130]
titles = train_data['title'].tolist()

# 1. load the corpus
with open('../data/train_docs.pkl', 'rb') as in_data:
    train_docs = pickle.load(in_data)
train_docs = [[word for word in doc.split(' ')] for doc in train_docs]

dictionary = corpora.Dictionary(train_docs)
corpus = [dictionary.doc2bow(text) for text in train_docs]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=6)

topic_list = lda.print_topics(20)
# print(type(lda.print_topics(20)))
for topic in topic_list:
    print(topic)
print("Topic 1:", lda.print_topic(1))

print('Given a new document, output its topic distribution')
# test_doc = list(new_doc)  # tokenize the new document
test_doc = train_docs[2]  # inspect the topic distribution of the third training sample
print(test_doc)
doc_bow = dictionary.doc2bow(test_doc)  # convert the document to bag-of-words
doc_lda = lda[doc_bow]  # topic distribution of the new document
print(doc_lda)
for topic in doc_lda:
    print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))
# Code starts here

# Call the helper to sweep candidate topic counts
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary, corpus=doc_term_matrix, texts=doc_clean,
    start=1, limit=41, step=5)
print(coherence_value_list)

# Find the index associated with the maximum coherence value
max_index = coherence_value_list.index(max(coherence_value_list))

# Find the optimum number of topics associated with the maximum coherence value
opt_topic = topic_list[max_index]
print("Optimum no. of topics:", opt_topic)

# Fit LDA with the optimum number of topics
lda_model = LdaModel(corpus=doc_term_matrix, num_topics=opt_topic,
                     id2word=dictionary, iterations=10,
                     passes=30, random_state=0)
# pprint(lda_model.print_topics(5))
lda_model.print_topic(1)
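# compute_coherence_values is assumed above but not defined in this snippet;
# a minimal sketch, consistent with the function tail shown in a later snippet,
# that sweeps the number of topics and scores each model with c_v coherence:
from gensim.models import CoherenceModel

def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    topic_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics,
                         id2word=dictionary, random_state=0)
        topic_list.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return topic_list, coherence_values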
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel

pos_com = pd.read_csv('data/pos_com.csv', header=None, index_col=0)
neg_com = pd.read_csv('data/neg_com.csv', header=None, index_col=0)

# positive reviews
pos_com.columns = ['comment']
mid = list(pos_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = [dictionary.doc2bow(com) for com in mid]
# build the model
pos_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
pos_model.print_topic(0)
pos_model.print_topic(1)
pos_model.print_topic(2)

# negative reviews
neg_com.columns = ['comment']
mid = list(neg_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = [dictionary.doc2bow(com) for com in mid]
# build the model
neg_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
neg_model.print_topic(0)
neg_model.print_topic(1)
neg_model.print_topic(2)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
# Train the model on the corpus
lda = LdaModel(common_corpus, num_topics=10)

# Step by step: first, common_texts is a list, and each of its elements can be
# regarded as one document, itself a list of tokens:
print(type(common_texts))
print(common_texts[0])

# Second, doc2bow converts a text into bag-of-words form; an official example
# makes this clear:
from gensim.corpora import Dictionary
dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
print(dct.doc2bow(["this", "is", "máma"]))
print(dct.doc2bow(["this", "is", "máma"], return_missing=True))

# At initialization every word gets an id; for a new text, doc2bow returns the
# id and frequency of each of its words, and you can control whether words not
# in the original dictionary are returned as well.
# The resulting corpus is exactly the input the LDA model trains on; check:
print(common_corpus[0])  # the word "human" has id 0 and occurs once in the first document

# Finally, we just call the LDA model, here with 10 topics.
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=10)

# Inspect the result (the docs list many more accessors), e.g. which words
# make up topic 1:
print(lda.print_topic(1, topn=2))
# This shows the topic's word distribution, with ids 9 and 10 weighted most
# heavily here (topn controls how many words are printed; the words behind the
# ids can be looked up in the dictionary built earlier).

# The model can also be updated with new material:
# lda.update(other_corpus)        # update all model parameters
# lda.update_alpha(gammat, rho)   # update only the topic distribution (rho is the learning rate)
# lda.update_eta(lambdat, rho)    # update only the word distribution
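# A minimal sketch of the update step mentioned above (assumption: the new
# texts are tokenized the same way and converted with the same dictionary so
# word ids stay consistent; the tokens come from common_texts' vocabulary).
other_texts = [['human', 'computer', 'system'], ['graph', 'trees']]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
lda.update(other_corpus)  # online update of all model parameters
print(lda.print_topic(1, topn=2))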
mydict, corpus = tf_idf(documents)

# Save the dict and corpus
# mydict.save('mydict.dict')  # save dict to disk
# corpora.MmCorpus.serialize('bow_corpus.mm', corpus)  # save corpus to disk
# Load them back:
'''
mydict = corpora.Dictionary.load('mydict.dict')
corpus = corpora.MmCorpus('bow_corpus.mm')
'''

nb_topic = 3
lda = LdaModel(corpus, id2word=mydict, num_topics=nb_topic, passes=2,
               per_word_topics=False, iterations=200)
# lda.save('lda_model.model')

ldaa = []
for i in range(0, nb_topic):
    d = re.findall('"([^"]*)"', lda.print_topic(i, 300))
    for word in d:
        # note: len(ldaa) % 3 != 4 is always true, so this clause never filters anything
        if len(word) > 3 and len(ldaa) % 3 != 4 and word not in ldaa:
            ldaa.append(word)

result = {}
result['lda'] = ldaa[0:20]
print(result)
# print(lda.show_topics(num_topics=2, num_words=500, log=True))
ax1.set_xticks(x)
fig.tight_layout()
plt.savefig('work/metrics.png')  # save before show(), which clears the figure
plt.show()

# %%
NUM_TOPICS = 4
lda_model = LdaModel(corpus=corpus, id2word=dictionary,
                     num_topics=NUM_TOPICS, random_state=0)
lda_model.save('work/lda.model')

# %%
for i in range(lda_model.num_topics):
    print('TOPIC:', i, '__', lda_model.print_topic(i))

# %%
# WordCloud
# Download a Japanese font and place it under work/
fig, axs = plt.subplots(ncols=2, nrows=math.ceil(lda_model.num_topics / 2),
                        figsize=(16, 20))
axs = axs.flatten()


def color_func(word, font_size, position, orientation, random_state, font_path):
    return 'darkturquoise'
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# keep tokens occurring at least once (i.e. everything; raise the threshold to prune rare words)
texts = [[token for token in text if frequency[token] >= 1] for text in texts]

from pprint import pprint  # pretty-printer
dictionary = corpora.Dictionary(texts)
# dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
# print(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
lda = LdaModel(corpus, num_topics=2)

# on a new document:
new_doc = "pretty obvious that when i write my tellall memoir someday there will be four to six"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(lda.print_topic(0))
print(lda.show_topic(1))
print(lda.get_document_topics(new_vec))
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

print(model.print_topics())

for i in range(len(articleurls)):
    print("----")
    print(articleurls[i])
    topics = model.get_document_topics(corpus[i])
    for topic in topics:
        print("Topic:" + str(topic[0]))
        print(model.print_topic(topic[0]))
        print(topic[1])
    print("----")

# wordcloud = WordCloud(
#     background_color="white",
#     max_words=5000,
#     contour_width=3,
#     contour_color='steelblue',
#     width=1600,
#     height=800
# )
#
# wordcloud.generate_from_text(complete)
# wordcloud.to_file('wordcloud.png')
print(lda.get_document_topics(test))
print(lda[test])

# ----- topics associated with a given word -----
# signature: get_term_topics(word_id, minimum_probability=None)
# each topic is represented as a tuple (topic_id, term_probability)
print(lda.get_term_topics(0))

# ----- composition of a given topic -----
# signature: get_topic_terms(topicid, topn=10)
# output is a list, format: [(word_id, probability), ...]
print(lda.get_topic_terms(0))

# signature: show_topic(topicno, topn=10)
# output is a list of (word, probability) pairs
print(lda.show_topic(0))

# signature: print_topic(topicno, topn=10)
# output is a string, format: '0.340*"category" + 0.298*"$M$" + 0.183*"algebra" + ...'
print(lda.print_topic(0))

# ----- composition of all topics -----
# default signature: show_topics(num_topics=10, num_words=10, log=False, formatted=True)
# output format: [(0, '0.340*"category" + 0.298*"$M$" + ...'), ...]
print(lda.show_topics())

# [num_topics, vocabulary_size] array of floats (self.dtype) representing
# the term-topic matrix learned during inference
print(lda.get_topics())

# ----- save and load the model -----
lda.save(fname="lda_model")
lda = LdaModel.load(fname="lda_model")  # load() is a classmethod returning a new model
print(lda[test])
class LdaModelHelper:
    status_scheduled = 'scheduled'
    status_computing = 'computing'
    status_completed = 'completed'
    status_error = 'killed'

    default_use_lemmer = True
    default_min_df = 2
    default_max_df = 0.8

    def __init__(self, training_number_of_topics_to_extract, language,
                 training_use_lemmer=True, training_min_df=2,
                 training_max_df=0.8, chunksize=2000, passes=2):
        """
        :rtype: LdaModelHelper
        :param training_use_lemmer:
        :param training_min_df: int or float, min document frequency / document
            proportion (if float < 1) to consider a term in the model
        :param training_max_df: int or float, max document frequency / document
            proportion (if float < 1) to consider a term in the model
        """
        self.language = language

        self.analysis_use_lemmer = LdaModelHelper.default_use_lemmer
        self.analysis_min_df = LdaModelHelper.default_min_df
        self.analysis_max_df = LdaModelHelper.default_max_df
        self.analysis_corpus = None
        self.analysis_features_names = None
        self.analysis_documents = None

        self.training_number_of_topics_to_extract = training_number_of_topics_to_extract
        self.training_use_lemmer = training_use_lemmer
        self.training_min_df = training_min_df
        self.training_max_df = training_max_df
        self.chunksize = chunksize
        self.passes = passes
        self.training_corpus = None
        self.training_features_names = None
        self.training_documents = None

        self.lda_model = None
        self.model_computation_time = None
        self.topic_labels = None
        self.topic_assignment = None

    def set_analysis_parameters(self, analysis_use_lemmer=True,
                                analysis_min_df=2, analysis_max_df=0.8):
        self.analysis_use_lemmer = analysis_use_lemmer
        self.analysis_min_df = analysis_min_df
        self.analysis_max_df = analysis_max_df
        # reset related fields
        self.topic_assignment = None
        self.topic_labels = None
        self.analysis_corpus = None
        self.analysis_features_names = None
        self.analysis_documents = None

    def generate_model_filename(self):
        return "_".join([
            str(time.time()),
            str(self.training_number_of_topics_to_extract),
            str(self.training_min_df),
            str(self.training_max_df),
            str(self.training_use_lemmer)
        ]).replace('.', '')

    def set_lda_model(self, lda_model):
        self.lda_model = lda_model

    #####################
    # Model computation
    #####################

    def compute_lda_model(self, texts):
        """
        Compute the LDA model.
        :return:
        """
        if self.training_corpus is None:
            self.compute_corpus(texts, parameters='training')

        if self.training_corpus is None or len(self.training_corpus) == 0:
            raise Exception(
                'The training corpus is empty. Tune model computation parameters.')

        start = time.time()
        if self.passes == 2:
            passes = 10 if (len(self.training_corpus) / self.chunksize) < 10 else 2
        else:
            passes = self.passes

        id2word = {k: v for k, v in enumerate(self.training_features_names)}
        self.lda_model = LdaModel(
            self.training_corpus,
            id2word=id2word,
            num_topics=self.training_number_of_topics_to_extract,
            eval_every=1,
            passes=passes,
            chunksize=self.chunksize)
        end = time.time()
        self.model_computation_time = end - start

    def save_model_to_file(self, file_path):
        """
        :type file_path: str
        :param file_path: the path of the model file
        :return:
        """
        if self.lda_model is None:
            logging.error('The model has not been computed yet.')
            return False
        else:
            self.lda_model.save(file_path)

    def load_model_from_file(self, input_filepath):
        """
        :param input_filepath:
        :return:
        """
        self.lda_model = LdaModel.load(input_filepath)

    def compute_corpus(self, texts, parameters='training'):
        """
        Compute the corpus in gensim format considering the specified set of
        parameters, 'training' or 'analysis'.
        :param parameters:
        :param texts:
        :return:
        """
        if parameters == 'training':
            tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix(
                texts, parameters)
            if tf_matrix_features_names is None or len(tf_matrix_features_names) == 0:
                return []

            self.training_corpus = matutils.Sparse2Corpus(
                tf_matrix, documents_columns=False)
            self.training_features_names = tf_matrix_features_names
            self.training_documents = tf_matrix_docs_ids
            return self.training_corpus
        elif parameters == 'analysis':
            if self.lda_model is None:
                logging.error('The model has not been computed yet.')
                return None
            else:
                # Note: words not included in the model are ignored
                tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix(
                    texts, parameters)
                if len(tf_matrix_features_names) == 0:
                    return []

                corpus = [None] * tf_matrix.shape[0]
                if len(tf_matrix_features_names) != 0:
                    word2id = {
                        self.lda_model.id2word[id]: id
                        for id in self.lda_model.id2word.keys()
                    }
                    for i in range(tf_matrix.shape[0]):
                        doc = tf_matrix.getrow(i)
                        _, cols = doc.nonzero()
                        corpus[i] = [None] * len(cols)
                        count = 0
                        for col in cols:
                            if tf_matrix_features_names[col] in word2id.keys():
                                corpus[i][count] = (
                                    int(word2id[tf_matrix_features_names[col]]),
                                    int(tf_matrix[i, col]))
                                count += 1
                        corpus[i] = corpus[i][:count]

                self.analysis_corpus = corpus
                self.analysis_features_names = tf_matrix_features_names
                self.analysis_documents = tf_matrix_docs_ids
                return self.analysis_corpus
        else:
            logging.error(
                "Value not allowed for argument parameters. "
                "Allowed values are 'training' or 'analysis'.")
            return None

    def compute_corpus_single_query(self, text):
        """
        Compute the corpus in gensim format for a single query
        (this implies using special parameters for preprocessing).
        :param text:
        :return:
        """
        if self.lda_model is None:
            logging.error('The model has not been computed or loaded yet.')
            return None, None
        else:
            # Note: words not included in the model are ignored
            stopwords_list = lda_utils.get_stopwords(self.language)
            tf_matrix, tf_matrix_features_names = lda_utils.compute_tf(
                [text], stopwords_list, self.language, True, 1, 1.0)

            if len(tf_matrix_features_names) == 0:
                return [], tf_matrix_features_names

            corpus = [None] * tf_matrix.shape[0]
            if len(tf_matrix_features_names) != 0:
                word2id = {
                    self.lda_model.id2word[id]: id
                    for id in self.lda_model.id2word.keys()
                }
                for i in range(tf_matrix.shape[0]):
                    doc = tf_matrix.getrow(i)
                    _, cols = doc.nonzero()
                    corpus[i] = [None] * len(cols)
                    count = 0
                    for col in cols:
                        if tf_matrix_features_names[col] in word2id.keys():
                            corpus[i][count] = (
                                int(word2id[tf_matrix_features_names[col]]),
                                int(tf_matrix[i, col]))
                            count += 1
                    corpus[i] = corpus[i][:count]

            return corpus, tf_matrix_features_names

    def compute_tf_matrix(self, texts, parameters='training'):
        """
        Compute the tf matrix using the specified set of parameters
        ('training' or 'analysis'). If texts is not specified the system tries
        to retrieve data directly from the associated db.
        :param parameters: 'training' or 'analysis'
        :param texts: list of strings representing the texts to transform.
        :return:
        """
        tf_matrix_docs_id = None
        if parameters == 'training' or parameters == 'analysis':
            stopwords_list = lda_utils.get_stopwords(self.language)

            if parameters == 'training':
                use_lemmer = self.training_use_lemmer
                min_df = self.training_min_df
                max_df = self.training_max_df
            else:
                use_lemmer = self.analysis_use_lemmer
                min_df = self.analysis_min_df
                max_df = self.analysis_max_df

            tf_matrix, tf_matrix_features_names = lda_utils.compute_tf(
                texts, stopwords_list, self.language, use_lemmer, min_df, max_df)
        else:
            logging.error(
                "Value not allowed for argument parameters. "
                "Allowed values are 'training' or 'analysis'.")
            return None

        return tf_matrix, tf_matrix_features_names, tf_matrix_docs_id

    def compute_topic_assignment(self, texts):
        """
        Compute the topic assignment for each document w.r.t. the specified
        topic model.
        Example of output:
            [[(25, 0.1174058544855012), (49, 0.82926081218116554)],
             [(6, 0.29928250617927882), (49, 0.59405082715405444)]]
        :param texts:
        :return:
        """
        corpus = self.compute_corpus(texts, parameters='analysis')
        if len(corpus) == 0:
            raise Exception(
                'The corpus is empty. Tune analysis parameters and check stopwords.')

        computed_assignment = self.lda_model[corpus]

        if texts is not None:
            # the corpus is related to analysis parameters
            self.topic_assignment = computed_assignment

        return computed_assignment

    def compute_topic_assignment_for_query(self, text):
        corpus, _ = self.compute_corpus_single_query(text)
        if corpus is None or len(corpus) == 0:
            raise Exception(
                'The corpus is empty. Tune analysis parameters and check stopwords.')

        computed_assignment = self.lda_model[corpus]
        return computed_assignment

    #######################
    # Print functions
    #######################

    def print_topic_assignment(self, topic_assignment):
        """
        Print a topic assignment in a human readable format.
        :param topic_assignment:
        :return:
        """
        print('\tTopic importance\tTopic description')
        for i, doc in enumerate(topic_assignment):
            print('Document {0}'.format(i))
            for a in doc:
                string_topic = a[0] if self.lda_model is None \
                    else self.lda_model.print_topic(a[0])
                print('\t{1:2f}\t\t{0}'.format(string_topic, a[1]))

    def print_all_topics(self, num_topics=10, num_words=20,
                         try_to_disambiguate=False,
                         min_word_probabity_for_disambiguation=0.010):
        """
        Print topics from the underlying LdaModel.
        """
        print('Print {0} topics'.format(num_topics))
        print('------------')
        for t in self.lda_model.show_topics(num_topics=num_topics,
                                            num_words=num_words,
                                            formatted=False):
            if try_to_disambiguate:
                possible_labels = self.__class__.label_topic_by_probability(
                    self.lda_model.show_topic(t[0]),
                    min_word_probability=min_word_probabity_for_disambiguation)[:2]
                print('{0}:\t{1}\n'.format(t[0], possible_labels))
                print('{0}\n'.format(t[1]))
            else:
                print('{0}:\t{1}\n'.format(t[0], t[1]))

    def get_topic_description(self, topic_id, num_words=20):
        """
        Return the description of a topic from the underlying LdaModel.
        """
        if self.lda_model is None:
            logging.error('The model has not been computed yet.')
        else:
            return self.lda_model.show_topic(topic_id, num_words)

    #######################
    # Labeling functions
    #######################

    def compute_topic_labels(self, labeling_mode='mixed',
                             min_word_probability=0.01,
                             max_number_of_words_per_query=6,
                             n_words_to_label=3):
        """
        Label topics by querying Wikipedia with a set of representative words
        for each topic.
        The words are chosen with the parameter labeling_mode:
        - 'based_on_probability': considers all words with a weight
          (probability) greater than 0.010
        - 'based_on_top_words': considers the 3 most probable words for the topic
        - 'mixed': try 'based_on_probability' first; if there are no results,
          fall back to 'based_on_top_words'
        """
        if self.lda_model is None:
            logging.error('No LDA model loaded.')
            return

        n_labels_to_save = 3
        self.topic_labels = {}

        # label topics
        for t in self.lda_model.show_topics(
                num_topics=self.training_number_of_topics_to_extract,
                num_words=40, formatted=False):
            topic_id = t[0]
            possible_labels = []
            if labeling_mode == 'mixed' or labeling_mode == 'based_on_probability':
                possible_labels = self.__class__.label_topic_by_probability(
                    self.lda_model.show_topic(topic_id),
                    min_word_probability=min_word_probability,
                    max_words=max_number_of_words_per_query)[:n_labels_to_save]

            if len(possible_labels) == 0:
                # try to disambiguate by n_words
                possible_labels = self.__class__.label_topic_by_number_of_words(
                    self.lda_model.show_topic(topic_id),
                    n_words=n_words_to_label)[:n_labels_to_save]

            for i in range(len(possible_labels), n_labels_to_save):
                # fill empty labels
                possible_labels.append('')

            self.topic_labels[topic_id] = possible_labels
            time.sleep(0.5)

    def get_topic_labels(self):
        if self.topic_labels is None:
            self.compute_topic_labels()
        return self.topic_labels

    def get_all_topics(self):
        """
        Return a dictionary where keys are topic ids (integers) and values are
        word distributions. A word distribution is a dictionary where keys are
        words and values are word weights within the topic.
        :rtype: dict
        :return:
        """
        topics = {}
        for t in self.lda_model.show_topics(
                num_topics=self.training_number_of_topics_to_extract,
                num_words=config.max_number_of_words_per_topic,
                formatted=False):
            topic_id = t[0]
            topic_distr = self.get_word_frequencies(
                self.lda_model.show_topic(
                    topic_id, config.max_number_of_words_per_topic))
            topics[topic_id] = topic_distr
        return topics

    def _get_words_distribution(self, topic_id):
        """
        Return a dictionary where keys are words and values are word weights
        within the topic.
        :param topic_id: the topic index
        :rtype: dict
        :return:
        """
        topic_description = self.lda_model.show_topic(
            topic_id, config.max_number_of_words_per_topic)
        return self.__class__.get_word_frequencies(topic_description)

    @classmethod
    def delete_model_files(cls, folder_path, files_prefix):
        """
        Delete all files related to a model that have the specified file prefix.
        :param folder_path:
        :param files_prefix:
        :rtype:
        :return: 200 if all files have been removed, 404 if the files do not exist
        """
        if os.path.exists(os.path.join(folder_path, files_prefix)):
            files_to_remove = [
                files_prefix,
                files_prefix + ".state",
                files_prefix + ".expElogbeta.npy",
                files_prefix + ".id2word",
            ]
            for f in files_to_remove:
                os.remove(os.path.join(folder_path, f))
            return 200
        else:
            logging.error('[ERROR] Model files do not exist.')
            return 404

    #######################
    # Topic labeling
    #######################

    @classmethod
    def label_topic_by_probability(cls, topic_description,
                                   min_word_probability=0.010, max_words=6):
        """
        Try to disambiguate a topic considering all words with a weight greater
        than min_word_probability.
        :param max_words:
        :param topic_description: a list of pairs (word, word_probability)
        :param min_word_probability: the minimum probability for words
        :return: list of strings, possible Wikipedia pages
        """
        words = [w for w, p in topic_description if p >= min_word_probability]
        words = words[:max_words]
        if len(words) == 0:
            # if no words are over the threshold, return an empty result
            res = []
        else:
            res = wikipedia.search(' '.join(words))
        return res

    @classmethod
    def label_topic_by_number_of_words(cls, topic_description, n_words=5):
        """
        Try to disambiguate a topic considering the top k words in its description.
        :param n_words:
        :param topic_description: a list of pairs (word, word_probability)
        :return: list of strings, possible Wikipedia pages
        """
        words = [t[0] for i, t in enumerate(topic_description) if i < n_words]
        if len(words) == 0:
            # if no words are over the threshold, take the first one
            words = [topic_description[0][0]]
        res = wikipedia.search(' '.join(words))
        return res

    @classmethod
    def get_word_frequencies(cls, topic_description):
        """
        Given a topic description, return the corresponding dictionary with
        words as keys and weights as values.
        :param topic_description: list of pairs (word, word_weight)
        :return:
        """
        frequencies = {w: f for w, f in topic_description}
        return frequencies
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return topic_list, coherence_values


# Code starts here
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary, corpus=doc_term_matrix, texts=doc_clean,
    start=1, limit=41, step=5)
print(coherence_value_list)

max_index = coherence_value_list.index(max(coherence_value_list))
opt_topic = topic_list[max_index]
print("Optimum no of topics: ", opt_topic)

lda_model = LdaModel(corpus=doc_term_matrix, num_topics=opt_topic,
                     id2word=dictionary, iterations=10,
                     passes=30, random_state=0)
lda_model.print_topic(5)
                        'r', encoding='utf8').readlines()
stopwords = [w.strip() for w in stopwords]
fp = codecs.open('../../corpus/test.lsnp', 'r', encoding='utf8')
for line in fp:
    line = line.split()
    train.append([w for w in line if w not in stopwords])
print(train)

dictionary = Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print(corpus[0])
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# print the word distributions of the first 20 topics
print(lda.print_topics(20))
# print the word distribution of the topic with id 20
print(lda.print_topic(20))

# save / load the model
lda.save('zhwiki_lda.model')
lda = models.ldamodel.LdaModel.load('zhwiki_lda.model')

# tt = 'loss of energy , motivation and no interest in work anymore - be it time to through it all in'
# test_doc = list(i for i in tt.split())
# doc_bow = id2word.doc2bow(test_doc)  # convert the document to bag-of-words
# doc_lda = lda[doc_bow]  # topic distribution of the new document
# print(doc_lda)
# for topic in doc_lda:
#     print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))
def train_lda(period='_18_09', num_topics=4, remove_top=0, tfidf=False,
              passes=40, iterations=600, eval_every=None):
    set_path()
    create_dictionary(dictionary_path, content_18_09_path, content_08_98_path)
    dictionary = corpora.Dictionary.load(dictionary_path)
    create_corpus(dictionary, eval('corpus_path' + period),
                  eval('content' + period + '_path'), remove_top)
    corpus = corpora.MmCorpus(eval('corpus_path' + period))
    temp = dictionary[0]  # force dictionary loading so id2token is populated
    id2word = dictionary.id2token

    # build the model
    if tfidf:
        corpusTfidf = convert_tfidf(eval('corpus_path' + period))
        model = LdaModel(corpus=corpusTfidf, id2word=id2word,
                         alpha='auto', eta='auto',
                         iterations=iterations, num_topics=num_topics,
                         passes=passes, eval_every=eval_every)
    else:
        model = LdaModel(corpus=corpus, id2word=id2word,
                         alpha='auto', eta='auto',
                         iterations=iterations, num_topics=num_topics,
                         passes=passes, eval_every=eval_every)

    # serialize the model
    lda_model_file = open(eval('model_path' + period), 'wb')
    pkl.dump(model, lda_model_file)
    lda_model_file.close()

    # print the keywords of each topic
    top_topics = model.top_topics(corpus)
    print('Keywords of each topic:')
    pprint(top_topics)

    # each row contains the topic words and their weights
    print('Topic words and weights of the first two topics:')
    model.print_topic(0, 10)
    model.print_topic(1, 10)

    # output the probability of each training document belonging to each topic
    print('Topic probabilities of the first ten journals:')
    for i in list(range(10)):
        for index, score in sorted(model[corpus[i]], key=lambda tup: -1 * tup[1]):
            print(index, score)

    # calculate perplexity
    testset = []
    for i in range(corpus.num_docs):
        testset.append(corpus[i])
    perplexity(model, testset, dictionary, len(dictionary.keys()), num_topics)

    # LDA visualization ---------------------------------------------------
    vis_wrapper = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    pyLDAvis.display(vis_wrapper)
    pyLDAvis.save_html(vis_wrapper, "lda%dtopics.html" % num_topics)
    pyLDAvis.show(vis_wrapper)
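# `perplexity` is assumed by train_lda but not shown; a minimal sketch using
# gensim's built-in per-word bound (the signature below mirrors the call site;
# `dictionary` is kept only to match it):
import numpy as np

def perplexity(model, testset, dictionary, vocab_size, num_topics):
    # log_perplexity returns the per-word likelihood bound; perplexity = 2^(-bound)
    bound = model.log_perplexity(testset)
    print('num_topics=%d vocab=%d perplexity=%.2f'
          % (num_topics, vocab_size, np.exp2(-bound)))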
# 600 is an arbitrary parameter; I chose to start with it since about 700
# distinct words were found and I would like to keep as many as possible
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=600)

# ------------------------------------------ STEP 2 - BoW REPRESENTATION ---------------------------------------------------------------
# Convert each document into its Bag of Words representation using the dictionary built above
bow_corpus = [dictionary.doc2bow(doc) for doc in Corpora]

# ------------------------------------------ STEP 3 - LDA TRAINING -------------------------------------------------------------------
# Train the LDA model
lda = LdaModel(bow_corpus, id2word=dictionary, passes=92, num_topics=28)

# Print the topics with the most significant words belonging to each one
for idx, topic in lda.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

# ------------------------------------------ STEP X - CLASSIFICATION EXAMPLE ----------------------------------------------------------
# Example: take a document and use LDA to see which topic it falls into,
# based on the highest score obtained.
print("\nDocument being classified:")
print(Corpora[1])
print("\n")
for index, score in sorted(lda[bow_corpus[1]], key=lambda tup: -1 * tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda.print_topic(index, 10)))
# 1. Read and prepare the data
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)

# 2. Convert text into the bag-of-words form
from gensim.corpora import Dictionary
dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
dct.doc2bow(["this", "is", "máma"])
# [(2, 1)]
dct.doc2bow(["this", "is", "máma"], return_missing=True)
# ([(2, 1)], {u'this': 1, u'is': 1})

# 3. Apply the LDA model
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=10)
lda.print_topic(1, topn=2)
# '0.500*"9" + 0.045*"10"'
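# A minimal sketch of scoring an unseen document with the trained model
# (assumption: the document is tokenized the same way as common_texts, whose
# vocabulary contains these words).
unseen = common_dictionary.doc2bow(['human', 'computer', 'interaction'])
print(lda.get_document_topics(unseen))  # topic distribution of the new document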
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('robCSVcorpus.mm', corpus)  # store to disk, for later use

# import numpy as np
# corpusnp = np.array(corpus)
# print(len(corpusnp), len(np.delete(corpusnp, 1, axis=0)))

# Initialize the transformation: term frequency-inverse document frequency,
# i.e. trying to find the frequency on that page versus the overall frequency
lda = LdaModel(corpus, id2word=dictionary, num_topics=numTopics)

ii = 0
print('These are the topics')
for i in range(0, lda.num_topics):
    print(lda.print_topic(i, topn=20))
# sys.exit()

doc_lda = []
for i in range(len(corpus)):
    doc_lda.append(lda[corpus[i]])
# print(doc_lda)

# This will simply put the tuples in a csv file (poor format)
with open('CorpusTopicsOld.csv', 'w', newline='') as csvfile:  # 'wb' is Python 2; csv needs text mode in Python 3
    spamwriter = csv.writer(csvfile)
    for i in range(len(corpus)):
                 alpha='auto', eta='auto',
                 iterations=iterations, num_topics=num_topics,
                 passes=passes, eval_every=eval_every)

# print the keywords of each topic
top_topics = model.top_topics(corpus)
pprint(top_topics)

# serialize the model
lda_model_file = open(path_model_pkl, 'wb')
pkl.dump(model, lda_model_file)
lda_model_file.close()
model = read_pkl(path_ldamodel)

# each row contains the topic words and their weights
model.print_topic(0, 10)
model.print_topic(1, 10)

# show which topic a training document belongs to
for index, score in sorted(model[corpus[0]], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, model.print_topic(index, 10)))

# output the probability of each training document belonging to each topic
for i in list(range(10)):
    for index, score in sorted(model[corpus[i]], key=lambda tup: -1 * tup[1]):
        print(index, score)

# LDA visualization ---------------------------------------------------
class LDAModeling(BaseModel):
    def __init__(self, training_data, num_topics, alpha=0.01, passes=20):
        super().__init__(training_data)
        self.lda_model = None
        self.num_topics = num_topics
        self._corpus = []
        self._dictionary = []
        self._passes = passes
        self._alpha = alpha
        # self.tokenization()

    def build_lda_model(self):
        self._dictionary = corpora.Dictionary(self._documents)
        self._dictionary.compactify()  # reassign word ids to make them more compact
        self._corpus = [self._dictionary.doc2bow(doc) for doc in self._documents]
        self.lda_model = LdaModel(corpus=self._corpus,
                                  id2word=self._dictionary,
                                  num_topics=self.num_topics,
                                  alpha=self._alpha,
                                  passes=self._passes,
                                  minimum_probability=0)

    def set_topics(self, text_parser, emo_only_index):
        print("[*] Setting topic for each utterance...")
        emo_topics = []
        emo_list = [e[0] for e in text_parser.emotes]
        for i in range(len(text_parser.utterances)):
            if emo_only_index[i] == 1:
                text_parser.utterances[i].append(self.num_topics)
                for word in text_parser.utterances[i][0].split():
                    if word.lower() in emo_list:
                        emo_topics.append((word.lower(), 0))
            else:
                topic = self.query_topic(text_parser.utterances[i][0])  # topic: 0 ~ topic_num-1
                text_parser.utterances[i].append(topic)
        emo_topics = list(set(emo_topics))
        topics_dict = self._get_topics_and_distribution()
        topics_dict[self.num_topics] = emo_topics
        return topics_dict

    def query_topic(self, query):
        # similarity queries: return the most probable topic for the query
        query = self._dictionary.doc2bow(query.lower().split())
        topic, probability = list(sorted(self.lda_model[query], key=lambda x: x[1]))[-1]
        return topic

    def _get_topics_and_distribution(self):
        topics = {}
        for i in range(self.num_topics):
            s = self.lda_model.print_topic(i, topn=10)
            topics[i] = []
            for t in s.split('+'):
                topics[i].append((t.strip().split('*')[1],
                                  float(t.strip().split('*')[0])))
        return topics

    def print_topic(self, topic_no, top_n=5):
        if self.lda_model:
            return self.lda_model.print_topic(topic_no, top_n)  # was computed but discarded

    def save_topics(self, filename, threshold, topics_dict):
        with open(filename, "w") as f:
            for i in range(self.num_topics):
                result = ""
                for t_d in topics_dict[i]:
                    if t_d[1] >= threshold:
                        result += t_d[0] + " "
                f.write(result.rstrip() + "\n")
            f.write(" ".join([e[0] for e in topics_dict[self.num_topics]]))
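# A minimal usage sketch for LDAModeling (assumptions: BaseModel.__init__
# accepts the raw training data, and tokenized documents normally end up in
# self._documents via self.tokenization(); here we assign them directly, and
# the data is hypothetical).
docs = [['lol', 'great', 'play'], ['nice', 'shot', 'lol'], ['bad', 'game']]
model = LDAModeling(training_data=None, num_topics=2)
model._documents = docs  # normally populated by BaseModel / tokenization
model.build_lda_model()
print(model.print_topic(0))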
         ] for document in documents]

all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus, id2word=dictionary, num_topics=10)
# print(lda.print_topics(num_words=100, num_topics=10))

for i in range(0, lda.num_topics):
    print(lda.print_topic(i))
    string_topic = lda.print_topic(i)
    topic_words = re.findall(r'\b[a-z]+\b', string_topic)
    word_score = re.findall(r'0.\d+', string_topic)
    tp_number = i + 1
    index = 0
    while index < len(topic_words):
        cursor.execute(
            "insert into lda_topic(word,score,topic_number) values (?,?,?)",
            (topic_words[index], word_score[index], tp_number))
        index = index + 1
    db.commit()