import csv

from gensim import corpora
from gensim.models import LdaModel

# Relies on module-level globals: fname, runs, dictionary, no_below, reviews,
# name, eval_every.
def explore(parameters, run):
    print(parameters)
    no_above = parameters["no_above"]
    chunksize = parameters["chunksize"]
    passes = parameters["passes"]
    iterations = parameters["iterations"]
    size = parameters["size"]
    num_topics = parameters["num_topics"]
    with open(fname, 'a', newline='', encoding='utf-8') as csv_file:
        run += 1
        print("Run " + str(run) + " out of " + str(runs))
        writer = csv.writer(csv_file)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                                   keep_tokens=None)
        corpus = [dictionary.doc2bow(review) for review in reviews]
        corpora.MmCorpus.serialize(name + '.mm', corpus)
        mm = corpora.MmCorpus(name + '.mm')  # `mm` document stream now has random access
        mm_used = mm[:size]
        writer.writerows([
            ["Data size", "Topics", "no_above", "Chunksize", "Passes", "Iteration"],
            [size, num_topics, no_above, chunksize, passes, iterations],
            []
        ])
        lda = LdaModel(mm_used,
                       num_topics=num_topics,
                       chunksize=chunksize,
                       id2word=dictionary,
                       passes=passes,
                       iterations=iterations,
                       eval_every=eval_every)
        # Collect the top 10 terms of every topic, one column per topic.
        lst = []
        for topic in lda.print_topics(-1, 10):
            terms = [x[0] for x in lda.get_topic_terms(topic[0], topn=10)]
            term_strings = [str(dictionary[term]) for term in terms]
            str_topic = ["Topic " + str(topic[0] + 1)]
            str_topic.extend(term_strings)
            lst.append(str_topic)
        writer.writerows(zip(*lst))
        writer.writerow([])
    return run
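# --- Hypothetical driver (not part of the original snippet): sweeps a small
# parameter grid and calls explore() once per combination. The grid values are
# illustrative assumptions; fname, dictionary, no_below, reviews, name and
# eval_every are the module-level globals explore() relies on.
from itertools import product

param_grid = {
    "no_above": [0.5, 0.9],
    "chunksize": [1000, 2000],
    "passes": [1, 5],
    "iterations": [50, 100],
    "size": [10000],
    "num_topics": [10, 20],
}

combos = list(product(*param_grid.values()))
runs = len(combos)  # read by explore() for its progress message
run = 0
for values in combos:
    run = explore(dict(zip(param_grid.keys(), values)), run)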
from collections import defaultdict

from gensim.models import CoherenceModel, LdaModel

class MyLda:
    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(self.myDictionary.doc2bows,
                              id2word=self.myDictionary.dictionary,
                              num_topics=num_topics)
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        # Map each topic to the documents it covers and vice versa; a document
        # always keeps its first (most probable) topic, even below the threshold.
        topic2ids, id2topics = defaultdict(list), defaultdict(list)
        for i, doc2bow in enumerate(self.myDictionary.doc2bows):
            topic_pairs = self.model.get_document_topics(doc2bow)
            for j, (topic, prob) in enumerate(topic_pairs):
                if prob >= self.topic_threshold or j == 0:
                    topic2ids[topic].append(i)
                    id2topics[i].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        return self.model.get_topic_terms(topic)

    def get_top_topic(self):
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        average = sum([t[1] for t in top_topics]) / self.num_topics
        return top_topics, average

    def get_perplexity(self):
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(model=self.model,
                                                 corpus=self.myDictionary.doc2bows,
                                                 dictionary=self.myDictionary.dictionary,
                                                 coherence='u_mass')
        return self.coherenceModel.get_coherence()
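# --- Usage sketch (an assumption, not from the original code base): MyLda
# expects a wrapper object exposing `dictionary` (a gensim Dictionary) and
# `doc2bows` (the corpus in bag-of-words form). A minimal stand-in:
from gensim.corpora import Dictionary

class MyDictionary:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.doc2bows = [self.dictionary.doc2bow(text) for text in texts]

texts = [["human", "computer", "interface"],
         ["graph", "trees", "minors"],
         ["computer", "graph", "system"]]
my_lda = MyLda(MyDictionary(texts), num_topics=2)
print(my_lda.id2topics[0])     # topics assigned to document 0
print(my_lda.get_coherence())  # u_mass coherence of the fitted model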
from typing import Tuple

from gensim import corpora
from gensim.models import LdaModel

def lda_topics(processed_data: list,
               n_topics: int = 10,
               learning_decay: float = 0.5,
               learning_offset: float = 1.0,
               max_iter: int = 50,
               n_words: int = 10) -> Tuple[list, list]:
    """
    lda_topics performs LDA topic modeling on the input data

    :param processed_data: list of preprocessed segments
    :param n_topics: number of topics to extract from the corpus
    :param learning_decay: learning decay parameter for LDA
    :param learning_offset: learning offset parameter for LDA
    :param max_iter: max. number of iterations
    :param n_words: number of topic representatives
    :return:
        - topics - list of topics (and their representatives)
        - doc_topics - list of predicted topics, one for each segment
    """
    dictionary = corpora.Dictionary(processed_data)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_data]
    lda_model = LdaModel(doc_term_matrix,
                         id2word=dictionary,
                         num_topics=n_topics,
                         offset=learning_offset,
                         random_state=42,
                         update_every=1,
                         iterations=max_iter,
                         passes=10,
                         alpha='auto',
                         eta="auto",
                         decay=learning_decay,
                         per_word_topics=True)

    topics = []
    for i_t, topic_word_dist in enumerate(lda_model.get_topics()):
        topic = [lda_model.id2word[w_id]
                 for w_id, _ in lda_model.get_topic_terms(i_t, topn=n_words)]
        topics.append(topic)

    # getting documents' topic labels
    doc_topics = []
    for doc in doc_term_matrix:
        doc_t_dist = sorted(lda_model.get_document_topics(doc),
                            key=lambda item: item[1], reverse=True)
        t, _ = doc_t_dist[0]
        doc_topics.append(t)

    assert len(doc_topics) == len(processed_data)
    return topics, doc_topics
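# --- Illustrative call (toy segments made up for this example):
docs = [["cat", "dog", "pet"], ["stock", "market", "trade"],
        ["dog", "leash", "walk"], ["trade", "price", "market"]]
topics, doc_topics = lda_topics(docs, n_topics=2, n_words=3)
print(topics)      # two lists of three representative words each
print(doc_topics)  # one predicted topic id per input segment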
lda_model = LdaModel(corpus=corpus,      # call head restored from context;
                     id2word=dictionary, # corpus/dictionary match the pyLDAvis call below
                     random_state=100,
                     num_topics=10,
                     passes=5,
                     chunksize=10000,
                     alpha='asymmetric',
                     decay=0.5,
                     offset=64,
                     eta=None,
                     eval_every=0,
                     iterations=100,
                     gamma_threshold=0.001,
                     per_word_topics=True)

## See the topics
lda_model.print_topics(-1)  # this allows one to observe the topics
lda_model.get_topic_terms(0, topn=10)  # this provides the top 10 words in topic 0
lda_model.log_perplexity(corpus)  # this computes the log perplexity
# This provides the document's topic distribution. Note that by default, topics
# with a low probability for the document are not displayed.
lda_model.get_document_topics(corpus[0])
# Same, but with minimum_probability=0 every topic and its associated
# probability is printed.
lda_model.get_document_topics(corpus[0], minimum_probability=0)

### Document topics ###
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)
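# If no interactive browser session is wanted, the same visualization can be
# written to a standalone HTML file instead (the filename is an assumption):
pyLDAvis.save_html(vis, 'lda_visualization.html')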
import os
import time

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel

import data_proc  # project-local preprocessing helpers

def build_lda_model(tokens_tags, pos_tags, use_nouns=True, use_verbs=True,
                    use_all=False, num_of_topics=8, passes=25, verbose=True):
    path = os.getcwd()[:os.getcwd().rfind('/')]
    topics_filename = str(num_of_topics) + "topics"
    if use_nouns:
        topics_filename += "_nouns"
    if use_verbs:
        topics_filename += "_verbs"
    if use_all:
        topics_filename += "_all"

    # Set the LDA, Dictionary and Corpus filenames
    lda_filename = path + "/models/topic_models/lda_" + topics_filename + ".model"
    dict_filename = path + "/res/topic_data/dict/dict_" + topics_filename + ".dict"
    corpus_filename = path + "/res/topic_data/corpus/corpus_" + topics_filename + ".mm"

    # Build a topic model if it wasn't created yet
    if not os.path.exists(lda_filename):
        # Extract the lemmatized documents
        docs = []
        for index in range(len(tokens_tags)):
            tokens = tokens_tags[index].split()
            pos = pos_tags[index].split()
            docs.append(data_proc.extract_lemmatized_tweet(
                tokens, pos, use_verbs, use_nouns, use_all))

        # Compute the dictionary and save it
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(keep_n=40000)
        dictionary.compactify()
        dictionary.save(dict_filename)

        # Compute the bow corpus and save it
        corpus = [dictionary.doc2bow(d) for d in docs]
        MmCorpus.serialize(corpus_filename, corpus)

        if verbose:
            print("\nCleaned documents:", docs)
            print("\nDictionary:", dictionary)
            print("\nCorpus in BoW form:", corpus)

        # Start training an LDA Model
        start = time.time()
        print("\nBuilding the LDA topic model...")
        lda_model = LdaModel(corpus=corpus, num_topics=num_of_topics,
                             passes=passes, id2word=dictionary)
        lda_model.save(lda_filename)
        end = time.time()
        print("Completion time for building LDA model: %.3f s = %.3f min"
              % ((end - start), (end - start) / 60.0))

        if verbose:
            print("\nList of words associated with each topic:")
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics_list = [[word for word, prob in topic]
                               for topic_id, topic in lda_topics]
            print(lda_topics_list)

    # Load the previously saved dictionary
    dictionary = Dictionary.load(dict_filename)

    # Load the previously saved corpus
    mm_corpus = MmCorpus(corpus_filename)

    # Load the previously saved LDA model
    lda_model = LdaModel.load(lda_filename)

    # Print the top 10 words for each topic
    if verbose:
        for topic_id in range(num_of_topics):
            print("\nTop 10 words for topic ", topic_id)
            print([dictionary[word_id]
                   for (word_id, prob) in lda_model.get_topic_terms(topic_id, topn=10)])

    index = 0
    if verbose:
        for doc_topics, word_topics, word_phis in lda_model.get_document_topics(
                mm_corpus, per_word_topics=True):
            print('Index ', index)
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', word_phis)
            print('-------------- \n')
            index += 1

    return dictionary, mm_corpus, lda_model
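# --- Hypothetical invocation (the token/POS strings are made up to show the
# expected input shape: index-aligned, whitespace-separated sequences; it also
# assumes the repository's data_proc helper and its models/res directory
# layout are in place):
tokens_tags = ["i love my dog", "the market crashed today"]
pos_tags = ["PRON VERB PRON NOUN", "DET NOUN VERB NOUN"]
dictionary, mm_corpus, lda_model = build_lda_model(tokens_tags, pos_tags,
                                                   use_nouns=True, use_verbs=False,
                                                   num_of_topics=2, passes=5,
                                                   verbose=False)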
import math
from collections import defaultdict

import numpy as np
from gensim import corpora
from gensim.models import LdaModel

# cal_sim: project-local similarity function used below.
def run_lda_with_entropy(industry_lda, token_dict, max_k=5):
    common_dictionary = corpora.Dictionary(industry_lda)
    common_corpus = [common_dictionary.doc2bow(text) for text in industry_lda]
    ldamodel = LdaModel(corpus=common_corpus, num_topics=max_k + 1,
                        id2word=common_dictionary)
    result = ldamodel.print_topics(num_topics=max_k + 1, num_words=10)

    # One "center" vector per topic: the mean of the word vectors (taken from
    # token_dict) of the topic's top terms.
    center_lst = []
    for i in range(max_k + 1):
        result2 = ldamodel.get_topic_terms(topicid=i)
        center = 0
        length = len(result2)
        for v in result2:
            if common_dictionary[v[0]] in token_dict.keys():
                center += token_dict[common_dictionary[v[0]]]
        center_lst.append(center / length)

    # For every document: the mean similarity of its words to each topic center,
    # and the total similarity mass used for normalization below.
    industry_with_center_distance = []
    sum_temp5_lst = []
    for i in industry_lda:
        temp2 = []
        for k in i:
            temp = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp.append(cal_sim(np.array(token_dict[k]), j))
            if len(temp) > 0:
                temp2.append(temp)
        if len(temp2) > 0:
            temp3 = np.array(temp2)
            temp4 = np.mean(temp3, axis=0)
            temp5 = np.sum(temp3)
        else:
            temp4 = [0.0] * (max_k + 1)
            temp5 = temp4
        industry_with_center_distance.append(temp4)
        sum_temp5_lst.append(temp5)

    # Entropy-style score per topic; each document is assigned the topic with
    # the maximal accumulated score.
    entro_result_final = {}
    for number, i in enumerate(industry_lda):
        entro_result_2 = []
        for k in i:
            entro_result = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp = cal_sim(np.array(token_dict[k]), j)
                    temp_value = temp / sum_temp5_lst[number]
                    entro_result.append(temp_value * math.log(temp_value))
            entro_result_2.append(entro_result)
        if len(entro_result_2) > 0:
            temp5 = np.zeros(shape=(1, max_k + 1), dtype=float)
            for w in entro_result_2:
                if len(w) > 0:
                    temp5 += np.array(w)
            list_temp5 = list(temp5[0])
            entro_result_final[number] = list_temp5.index(max(list_temp5))

    # Group the documents by their assigned topic.
    final_result = defaultdict(list)
    for i in range(0, max_k + 1):
        for key, value in entro_result_final.items():
            if value == i:
                final_result[i].append(industry_lda[key])
    return final_result
model = LdaModel(
    corpus=corpus,
    id2word=dictionary.id2token,  # note: id2token is built lazily; access the
                                  # dictionary (e.g. dictionary[0]) beforehand
    chunksize=1000,
    alpha='asymmetric',
    eta='auto',
    iterations=iterations,
    num_topics=args.num_topics,
    passes=passes,
    eval_every=None
)

# Keep at least the top 2 tokens per topic, plus any further token with
# probability above 0.025 (up to 4 in total).
topic_tokens = []
for topicid in range(args.num_topics):
    topic_tokens.append([dictionary.id2token[k[0]]
                         for i, k in enumerate(model.get_topic_terms(topicid, topn=4))
                         if i < 2 or k[1] > 0.025])

paper_topic_data = []
for paper, paper_bow in zip(data, corpus):
    topic_distr = model.get_document_topics(paper_bow, minimum_probability=0)
    paper_topic_data.append({
        "key": paper["key"],
        "year": paper["year"],
        "title": paper["title"],
        "topic_distr": {t: float(p) for t, p in topic_distr}
    })

with open(args.outpath, 'w') as f:
    json.dump({
        "topics": topic_tokens,
        "paper_data": paper_topic_data
    }, f)
# minimum_phi_value (float) – if per_word_topics is True, this represents a lower
# bound on the term probabilities that are included (None by default). If set to
# None, a value of 1e-8 is used to prevent 0s.
# Returns:
#   topic distribution for the given document bow, as a list of
#   (topic_id, topic_probability) 2-tuples.
test = dct.doc2bow("I love Kitten".lower().strip().split())
print(lda.get_document_topics(test))
print(lda[test])

# Parameters: (word_id, minimum_probability=None)
# Topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- Print the composition of a given topic -----
# Parameters: (topicid, topn=10)
# Output: a list of (word_id, probability) pairs.
print(lda.get_topic_terms(0))

# Parameters: (topicno, topn=10)
# Output: a list of (word, probability) pairs.
print(lda.show_topic(0))

# Parameters: (topicno, topn=10)
# Output: a string, format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'.
print(lda.print_topic(0))

# ----- Print the composition of all topics -----
# Default parameters: (num_topics=10, num_words=10, log=False, formatted=True)
# Output: a list of (topic_id, string) pairs, format:
#   [(0, '-0.340 * "category" + 0.298 * "$M$" + ...'), ...]
print(lda.show_topics())

# A [num_topics, vocabulary_size] array of floats (self.dtype)
# which represents the term-topic matrix learned during inference.
print(lda.get_topics())

# ----- save and load model -----
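# A minimal sketch of the standard gensim calls that belong under the heading
# above (the file name is an assumption):
lda.save("lda.model")             # persists the model plus auxiliary files
lda = LdaModel.load("lda.model")  # restores it later (assumes LdaModel is imported)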
# -*- coding: utf-8 -*-
import pickle

from gensim.corpora import Dictionary
from gensim.models import LdaModel

with open("../data/corpus_test.pkl", "rb") as f:
    corpus = pickle.load(f)

corpus_dictionary = Dictionary(corpus)
corpus = [corpus_dictionary.doc2bow(text) for text in corpus]

CORPUS = corpus
TOPIC_NUM = 10

lda = LdaModel(corpus=CORPUS, num_topics=TOPIC_NUM)

doc_topic_matrix = lda.get_document_topics([(0, 1), (1, 1)])
term_topic_matrix = lda.get_term_topics(1)
topic_term_matrix = lda.get_topic_terms(1)
raw_texts = methods.load_data(data_file)
processed_texts = methods.clean_data(raw_texts)
dictionary, corpus = methods.get_dict(processed_texts)

## create LDA model
ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=NUMTOPICS, passes=10)

# examine learned topics
topics_list = []
for topic_ind in range(NUMTOPICS):
    topic = ldamodel.get_topic_terms(topic_ind, NUMTERMS)
    topics_list.append([dictionary[pair[0]] for pair in topic])
    print("Topic", topic_ind, ":", topics_list[topic_ind])

# average coherence of the learned topics
# Since we filtered the dictionary, some words in the processed texts are not in
# the dictionary. We will create a new dictionary for coherence use only.
dictionary_coh = Dictionary(processed_texts)
coh = CoherenceModel(topics=topics_list,
                     texts=processed_texts,
                     dictionary=dictionary_coh,
                     coherence=coh_metric).get_coherence()
print("-" * 10)
# Coherence will be small since the data we are using here is small and will not
# produce representative topics.
print("(Ranked using Rank_orig) Topics Coherence Score %r %r \n" % (coh_metric, coh))
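# --- Optional extension (not in the original script): the same coherence
# measure can compare different topic counts; the candidate values of k below
# are assumptions.
for k in (5, 10, 20):
    model_k = LdaModel(corpus, id2word=dictionary, num_topics=k, passes=10)
    topics_k = [[dictionary[pair[0]] for pair in model_k.get_topic_terms(t, NUMTERMS)]
                for t in range(k)]
    coh_k = CoherenceModel(topics=topics_k, texts=processed_texts,
                           dictionary=dictionary_coh,
                           coherence=coh_metric).get_coherence()
    print(k, "topics ->", coh_k)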
import networkx as nx
import numpy as np
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

g = nx.Graph()
g.add_edges_from([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3],
                  [4, 5], [4, 6], [4, 7], [5, 6], [5, 7], [6, 7],
                  [3, 4]])

# One "sentence" per node: the labels of its neighbors.
sentences = []
for node in g.nodes():
    sentences.append([str(nb) for nb in g.neighbors(node)])

# Create a corpus from a list of texts
common_dictionary = Dictionary(sentences)
common_corpus = [common_dictionary.doc2bow(text) for text in sentences]

lda = LdaModel(common_corpus, num_topics=2, eta=0.001, alpha=[0.001, 0.001])
s = lda.get_topic_terms(topicid=0, topn=g.number_of_nodes())

token2id = common_dictionary.token2id
id2node = {token2id[token]: token for token in token2id}
print(s)
print([(id2node[p[0]], p[1]) for p in s])
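# --- Follow-up sketch (an addition, not in the original): read the two topics
# as a soft graph partition by assigning each node its most probable topic.
for node, bow in zip(g.nodes(), common_corpus):
    doc_topics = lda.get_document_topics(bow, minimum_probability=0)
    best_topic = max(doc_topics, key=lambda tp: tp[1])[0]
    print(node, "-> community", best_topic)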
def lda_train(p_generate, theta_generate, phi_generate, num_topics, num_docs):
    import matplotlib.pyplot as plt
    from gensim.models import LdaModel, LdaMulticore
    import gensim.downloader as api
    from gensim.utils import simple_preprocess, lemmatize
    import nltk
    from nltk.corpus import stopwords
    from gensim import corpora
    import re
    import pyLDAvis
    import logging
    import numpy as np
    import scipy.special
    import scipy.spatial.distance
    import scipy.stats
    import sys
    from itertools import permutations
    from gensim.models import CoherenceModel

    np.set_printoptions(threshold=sys.maxsize)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    if __name__ == '__main__':
        __spec__ = None

    # Load dictionary and corpus
    dct = corpora.Dictionary.load('dct.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    num_words = len(dct)

    # Step 4: Train the LDA model
    lda_model = LdaModel(corpus=corpus,
                         id2word=None,
                         num_topics=num_topics,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True,
                         minimum_probability=0)

    # save the model
    lda_model.save('lda_model.model')

    # See the topics
    i = 0
    theta_matrix = np.zeros((num_docs, num_topics))
    for c in lda_model[corpus]:
        print(i)
        print("Document Topics : ", c[0])  # [(Topics, Perc Contrib)]
        for j in range(theta_matrix.shape[1]):
            theta_matrix[i, j] = c[0][j][1]
        i = i + 1
        # print("Word id, Topics : ", c[1][:])  # [(Word id, [Topics])]
        # print("Phi Values (word id) : ", c[2][:])  # [(Word id, [(Topic, Phi Value)])]
        # print("Word, Topics : ", [(dct[wd], topic) for wd, topic in c[1][:]])  # [(Word, [Topics])]
        # print("Phi Values (word) : ", [(dct[wd], topic) for wd, topic in c[2][:]])  # [(Word, [(Topic, Phi Value)])]
        # print("------------------------------------------------------\n")

    for j in range(num_topics):
        print("Topic", j)
        for i in range(len(lda_model.get_topic_terms(j, 10))):
            print(dct[lda_model.get_topic_terms(j, 10)[i][0]],
                  lda_model.get_topic_terms(j, 10)[i][1])

    phi_matrix = lda_model.get_topics()
    row_sums = theta_matrix.sum(axis=1)
    theta_matrix_new = theta_matrix / row_sums[:, np.newaxis]
    p = np.matmul(theta_matrix_new, phi_matrix)
    p_logit = scipy.special.logit(p)
    for i in range(p_logit.shape[0]):
        print(i)
        print(p_logit[i, ])

    # Note: these np.load calls overwrite the function arguments of the same
    # names (kept from the original code).
    p_logit_generate = np.load('p_logit_generate.npy')
    p_generate = np.load('p_generate.npy')
    theta_generate = np.load('theta_generate.npy')
    phi_generate = np.load('phi_generate.npy')

    corr_p = np.zeros((1, num_docs))
    corr_p_logit = np.zeros((1, num_docs))
    cosine_p = np.zeros((1, num_docs))
    for i in range(p_logit.shape[0]):
        corr_p_logit[0, i] = np.corrcoef(p_logit[i, ], p_logit_generate[i, ])[1, 0]
        corr_p[0, i] = np.corrcoef(p[i, ], p_generate[i, ])[1, 0]
        cosine_p[0, i] = scipy.spatial.distance.cosine(p[i, ], p_generate[i, ])

    corr_avg_p_inter = np.mean(corr_p)
    cosine_avg_p_inter = np.mean(cosine_p)
    corr_avg_p_logit_inter = np.mean(corr_p_logit)
    # Average of the correlation matrix for the word distributions of each
    # document (shape num_docs x num_docs)
    corr_avg_p_wordDist = np.mean(np.corrcoef(p))
    # Average of the correlation matrix for the document distributions of each
    # word (shape num_words x num_words)
    corr_avg_p_docDist = np.mean(np.corrcoef(np.transpose(p)))
    corr_avg_pgenerate_wordDist = np.mean(np.corrcoef(p_generate))
    corr_avg_pgenerate_docDist = np.mean(np.corrcoef(np.transpose(p_generate)))

    theta = theta_matrix_new
    phi = phi_matrix

    # This section compiles the correlation and cosine of each column-arrangement
    # combination of the topic model (theta_matrix), to align the learned topics
    # with the generating ones.
    compilation_corr_theta = []
    compilation_cosine_theta = []
    compilation_corr_phi = []
    compilation_cosine_phi = []
    compilation_KL_theta = []
    compilation_KL_phi = []
    l = list(permutations(range(1, num_topics + 1)))
    for combi in range(len(l)):
        v_theta = np.zeros([num_docs, num_topics])
        v_phi = np.zeros([num_topics, num_words])
        for tid in range(num_topics):
            v_theta[:, tid] = theta[:, l[combi][tid] - 1]
            v_phi[tid, :] = phi[l[combi][tid] - 1, :]
        corr_theta = np.zeros((1, num_docs))
        cosine_theta = np.zeros((1, num_docs))
        KL_theta = np.zeros((1, num_docs))
        corr_phi = np.zeros((1, num_topics))
        cosine_phi = np.zeros((1, num_topics))
        KL_phi = np.zeros((1, num_topics))
        for i in range(theta_generate.shape[0]):
            corr_theta[0, i] = np.corrcoef(v_theta[i, :], theta_generate[i, :])[1, 0]
            cosine_theta[0, i] = scipy.spatial.distance.cosine(v_theta[i, :],
                                                               theta_generate[i, :])
            KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :], v_theta[i, :])
        compilation_corr_theta.append(corr_theta.mean())
        compilation_cosine_theta.append(cosine_theta.mean())
        compilation_KL_theta.append(KL_theta.mean())
        for i in range(phi_generate.shape[0]):
            corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
            cosine_phi[0, i] = scipy.spatial.distance.cosine(v_phi[i, :],
                                                             phi_generate[i, :])
            KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])
        compilation_corr_phi.append(corr_phi.mean())
        compilation_cosine_phi.append(cosine_phi.mean())
        compilation_KL_phi.append(KL_phi.mean())

    compilation_cosine_phi = np.array(compilation_cosine_phi)
    compilation_corr_phi = np.array(compilation_corr_phi)
    compilation_KL_phi = np.array(compilation_KL_phi)
    compilation_cosine_theta = np.array(compilation_cosine_theta)
    compilation_corr_theta = np.array(compilation_corr_theta)
    compilation_KL_theta = np.array(compilation_KL_theta)

    alignment = compilation_KL_phi.argmin()
    if (alignment != compilation_cosine_phi.argmin()
            or alignment != compilation_cosine_theta.argmin()
            or alignment != compilation_corr_theta.argmax()
            or alignment != compilation_corr_phi.argmax()
            or alignment != compilation_KL_theta.argmin()):
        print('Warning!!! The alignments are not coherent.')

    # Determining the final correlation and cosine values
    v_theta = np.zeros([num_docs, num_topics])
    v_phi = np.zeros([num_topics, num_words])
    for tid in range(num_topics):
        v_theta[:, tid] = theta[:, l[alignment][tid] - 1]
        v_phi[tid, :] = phi[l[alignment][tid] - 1, :]
    corr_theta = np.zeros((1, num_docs))
    cosine_theta = np.zeros((1, num_docs))
    KL_theta = np.zeros((1, num_docs))
    corr_phi = np.zeros((1, num_topics))
    cosine_phi = np.zeros((1, num_topics))
    KL_phi = np.zeros((1, num_topics))
    for i in range(theta_generate.shape[0]):
        corr_theta[0, i] = np.corrcoef(v_theta[i, :], theta_generate[i, :])[1, 0]
        cosine_theta[0, i] = scipy.spatial.distance.cosine(v_theta[i, :],
                                                           theta_generate[i, :])
        KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :], v_theta[i, :])
    for i in range(phi_generate.shape[0]):
        corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
        cosine_phi[0, i] = scipy.spatial.distance.cosine(v_phi[i, :],
                                                         phi_generate[i, :])
        KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])

    corr_theta = corr_theta.mean()
    cosine_theta = cosine_theta.mean()
    KL_theta = KL_theta.mean()
    corr_phi = corr_phi.mean()
    cosine_phi = cosine_phi.mean()
    KL_phi = KL_phi.mean()
    words_id = np.arange(num_words)

    # coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus,
    #                                      dictionary=dct, coherence='c_v')
    # coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence Score: ', coherence_lda)
    return (v_phi, corr_theta, corr_phi, cosine_theta, cosine_phi,
            KL_theta, KL_phi)
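# --- Hypothetical call (not in the original source). lda_train() reads
# 'dct.dict', 'corpus.mm' and the four *_generate.npy files from the working
# directory, so those must already exist, and num_docs must equal the number
# of documents in corpus.mm. Note that the first three arguments are
# immediately overwritten by np.load inside the function, so their values do
# not matter here.
results = lda_train(p_generate=None, theta_generate=None, phi_generate=None,
                    num_topics=3, num_docs=500)
v_phi, corr_theta, corr_phi, cosine_theta, cosine_phi, KL_theta, KL_phi = results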