top_words_per_topic = []
for t in range(lda_model.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn=5)])

######
import pyLDAvis.gensim

# print(pyLDAvis.gensim.prepare(lda_model, train_corpus, train_dictionary))
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, train_corpus, train_dictionary)
# pyLDAvis.show(LDAvis_prepared)

# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model, texts=train_doc_list,
                                     dictionary=train_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model=lda_model, texts=train_doc_list,
                                     dictionary=train_dictionary, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nUMass Coherence Score: ', coherence_lda)


def compute_coherence_values(dictionary, corpus,
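                             # the snippet breaks off at the signature above; the
                             # remaining parameters and body are a sketch of the
                             # usual coherence-sweep helper. `texts`, `limit`,
                             # `start`, and `step` are assumptions, and LdaModel is
                             # assumed to be imported from gensim.models.
                             texts, limit, start=2, step=3):
    """Train LDA models over a range of topic counts and score each with c_v coherence."""
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values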
import pickle
import gensim
from gensim import corpora, models
from multiprocessing import freeze_support

with open('./pickle/dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)

import pandas as pd
processed_docs = pd.read_pickle("./pickle/processed_docs.pkl")

from gensim.models.coherencemodel import CoherenceModel

lda_model_mallet = gensim.models.wrappers.LdaMallet.load(
    './pickle/lda_model_mallet')

# Compute Perplexity
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
if __name__ == '__main__':
    freeze_support()
    coherence_model_lda_mallet = CoherenceModel(model=lda_model_mallet,
                                                texts=processed_docs,
                                                dictionary=dictionary,
                                                coherence='c_v')
    coherence_lda_mallet = coherence_model_lda_mallet.get_coherence()
    print('\nCoherence Score: ', coherence_lda_mallet)
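    # The commented-out perplexity line above refers to `lda_model` and `corpus`,
    # neither of which this snippet defines. A minimal sketch of how they could
    # be derived here, assuming gensim < 4 (where the Mallet wrapper lives):
    from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
    lda_model = malletmodel2ldamodel(lda_model_mallet)  # convert to a native LdaModel
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]  # rebuild the bow corpus
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # lower is better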
def main():
    # adjust the path below to wherever you have the transcripts2018 folder
    document_list, file_name = load_data_from_dir("ted-transcripts/transcripts/")
    print(len(document_list))

    # I've added extra stopwords here in addition to NLTK's stopword list - you could look at adding others.
    doc_clean = preprocess_data(document_list, {'laughter', 'applause'})
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)

    number_of_topics = 0  # adjust this to alter the number of topics
    # words=20  # adjust this to alter the number of words output for the topic below

    # runs LDA using Mallet from gensim using the number_of_topics specified above - this might take a couple of minutes
    # you can create additional variables eg ldamallet to store models with different numbers of topics
    # ldamallet = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
    # gensimmodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
    # coherencemodel = CoherenceModel(model=gensimmodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    # coherence_lda = coherencemodel.get_coherence()
    # print('\nCoherence Score: ', coherence_lda)

    min_k = 5
    max_k = 15
    intervals = 5
    coherences = {}
    coherence_lda = {}
    max_coherence = 0
    for i in range(min_k, max_k, intervals):
        ldamallet = LdaMallet(mallet_path, corpus=doc_term_matrix,
                              num_topics=i, id2word=dictionary)
        gensimmodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
        coherences[i] = CoherenceModel(model=gensimmodel, texts=doc_clean,
                                       dictionary=dictionary, coherence='c_v')
        coherence_lda[i] = coherences[i].get_coherence()
        # identify the best coherence score and keep that model
        if coherence_lda[i] > max_coherence:
            max_coherence = coherence_lda[i]
            ldamalletbest = ldamallet
            gensimmodelbest = gensimmodel
            coherencebest = coherences[i]
            number_of_topics = i

    for k in coherence_lda:
        print('\nCoherence Score for topic count ', k, ':', coherence_lda[k])
    print('best coherence:', max_coherence)

    ldamalletbest.show_topics(num_topics=number_of_topics, num_words=20)
    ldamalletbest.print_topics()

    # convert the coherence scores to a pandas dataframe
    df = pd.DataFrame.from_dict(coherence_lda, orient='index', columns=['Coherence'])
    df['Topics'] = df.index

    # plot the result
    df.plot(kind='line', x='Topics', y='Coherence')
    plt.show()

    text_name = '2018-03-03-kriti_sharma_how_to_keep_human_biases_out_of_ai.txt'  # name of file needs to be checked
    # text_name = '2012-09-14-timothy_bartik_the_economic_case_for_preschool.txt'  # name of file needs to be checked
    doc_id = file_name.index(text_name)  # index of document to explore
    print(file_name[doc_id])

    document_topics = gensimmodelbest.get_document_topics(doc_term_matrix[doc_id])  # substitute other models here
    document_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)  # sorts document topics

    model_doc_topics = gensimmodelbest.get_document_topics(doc_term_matrix)  # substitute other models here
    lda_index = similarities.MatrixSimilarity(model_doc_topics.corpus)

    # query for our doc_id from above
    similarity_index = lda_index[doc_term_matrix[doc_id]]
    # Sort the similarity index
    similarity_index = sorted(enumerate(similarity_index), key=lambda item: -item[1])

    for i in range(1, 6):
        document_id, similarity_score = similarity_index[i]
        print('Document Index: ', document_id)
        print('Document Name: ', file_name[document_id])
        print('Similarity Score', similarity_score)
        print(re.sub(r'\s+', ' ', document_list[document_id][:500]), '...')  # preview first 500 characters
        print()
def job_comp_sims():
    texts = read_comp_doc()

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts.values())

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts.values()]

    # generate LDA model
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=40, id2word=dictionary, passes=50)
    # print(ldamodel.print_topics(num_topics=5))

    cm = CoherenceModel(model=ldamodel, texts=texts.values(),
                        dictionary=dictionary, coherence='c_v')
    print(cm.get_coherence())

    res_path = os.getcwd() + '/../results/lda/jobcomp/'
    if not os.path.exists(res_path):
        os.makedirs(res_path)
    res_file_path = res_path + "jobcomp.csv"
    res_file = Path(res_file_path)
    if res_file.is_file():
        os.remove(res_file_path)

    # build the similarity index once: it depends only on the model and corpus,
    # so there is no need to rebuild it inside the loops below
    index = similarities.MatrixSimilarity(ldamodel[corpus])
    comps = list(texts.keys())  # list() so the keys can be indexed by position

    job_count = 0

    ### compute similarities for a few jobs from old jobs
    for filename in glob.glob(JOB_PATH + '*.json'):
        if job_count < 5:
            job_text = tokenize_clean(filename, 'json')
            job_bow = dictionary.doc2bow(job_text)
            job_lda = ldamodel[job_bow]
            sims = index[job_lda]
            comp_count = 0
            with open(res_file_path, mode="a") as text_file:
                for doc, sim in enumerate(sims):
                    text_file.write(str(job_count) + "," + str(comp_count) + "," +
                                    comps[doc] + "," + str(sim) + "\n")
                    comp_count += 1
            job_count += 1

    ### compute similarities for new jobs
    for filename in glob.glob(NEWJOB_PATH + '*.json'):
        job_text = tokenize_clean(filename, 'json')
        job_bow = dictionary.doc2bow(job_text)
        job_lda = ldamodel[job_bow]
        sims = index[job_lda]
        comp_count = 0
        with open(res_file_path, mode="a") as text_file:
            for doc, sim in enumerate(sims):
                text_file.write(str(job_count) + "," + str(comp_count) + "," +
                                comps[doc] + "," + str(sim) + "\n")
                comp_count += 1
        job_count += 1
def find_best_model_cv(n_topic_range, texts, id2word, corpus, threshold=None,
                       random_state=42, plot=True, verbose=False):
    """
    Searches for the best model in a given range by C_v coherence value

    Parameters:
    - `n_topic_range` a range of values for the `num_topics` parameter of a gensim LDA model to try
    - `texts` a list of documents broken into words
    - `id2word` a dictionary containing word encodings
    - `corpus` the result of mapping each word in `texts` to its value in `id2word`
    - `random_state` a random state for use in a gensim LDA model
    - `threshold` a float that specifies a coherence value that, if reached, will cause the function to return early
    - `plot` a boolean specifying whether or not to plot coherence values against each `num_topics` value
    - `verbose` a boolean specifying whether or not to print updates

    Returns: a tuple containing the best model, the list of all models attempted,
    and a list of all coherence values obtained, respectively.
    """
    models = []
    coherence_vals = []
    for n_topics in n_topic_range:
        # Print percentage progress
        if verbose:
            diff = max(n_topic_range) - n_topic_range.start
            print(str(round(100 * (n_topics - n_topic_range.start) / diff, 1)) + "% done")

        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=n_topics,
                             random_state=random_state,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)
        co_model = CoherenceModel(lda_model, texts=texts, dictionary=id2word, coherence="c_v")
        coherence = co_model.get_coherence()

        models.append(lda_model)
        coherence_vals.append(coherence)

        if threshold is not None and coherence > threshold:
            if verbose:
                print('Returning early with a coherence value of ' + str(coherence))
            if plot:
                # The portion of the range that was actually iterated through
                actual_range = range(n_topic_range.start, n_topics + n_topic_range.step,
                                     n_topic_range.step)
                plt.plot(actual_range, coherence_vals, 'b')
                plt.show()
            return lda_model, models, coherence_vals

    if plot:
        plt.plot(n_topic_range, coherence_vals, 'b')
        plt.show()

    return models[np.argmax(coherence_vals)], models, coherence_vals
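# A hypothetical invocation of find_best_model_cv; `tokenized_docs` stands in for
# a pre-tokenized corpus and is not defined in the original snippet:
# id2word = corpora.Dictionary(tokenized_docs)
# bow_corpus = [id2word.doc2bow(doc) for doc in tokenized_docs]
# best_model, all_models, scores = find_best_model_cv(
#     range(5, 30, 5), tokenized_docs, id2word, bow_corpus,
#     threshold=0.55, verbose=True)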
#%% Topic Modeling
from sklearn.decomposition import TruncatedSVD
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import sent_tokenize, word_tokenize

# SVD represents documents and terms in vectors
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)
len(svd_model.components_)

# CoherenceModel cannot score an sklearn TruncatedSVD via `model=`; it expects a
# gensim topic model. Instead, pass each component's top terms through `topics=`.
# `vectorizer` is assumed to be the fitted vectorizer that produced X (not shown here).
terms = vectorizer.get_feature_names_out()
topics = [[terms[i] for i in comp.argsort()[:-11:-1]] for comp in svd_model.components_]

# c_v coherence also needs word-tokenized texts and a dictionary, not raw sentences
texts = [word_tokenize(doc) for doc in data['processed_data'].values]
dictionary = corpora.Dictionary(texts)
coherencemodel = CoherenceModel(topics=topics, texts=texts,
                                dictionary=dictionary, coherence='c_v')

#%%
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

m1 = LdaModel(common_corpus, 3, common_dictionary)
m2 = LdaModel(common_corpus, 5, common_dictionary)

cm = CoherenceModel.for_models([m1, m2], common_dictionary, corpus=common_corpus, coherence='u_mass')
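# A sketch of a follow-up step: compare_models() scores each model with the
# configured coherence measure (here u_mass) and should return per-topic
# coherences alongside an aggregate score for each model.
print(cm.compare_models([m1, m2]))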
# read topics for evaluating
Weeks = "W10 W10W11 W11W12 W12W13 W13W14 W14W15 W15W16 W16W17 W17W18 W18W19 W19W20 W20W21 W21W22 W22W23 W23W24 W24W25 W25W26".split(" ")

topics = []
for week in Weeks:
    summary_file = f"{model_path}{week}{summary_suffix}"
    df = pd.read_csv(summary_file)
    list_relevant_words = df.relevant_words.to_list()
    for each_relevant_words in list_relevant_words:
        topics.append(each_relevant_words.split(" "))

# test block
# coherences = []
# for week in Weeks:
#     coherences += [week[-2:]]*num_topics

r_corpus = MmCorpus(output_fname)

with Timer():
    cm = CoherenceModel(topics=topics, corpus=r_corpus,
                        dictionary=dct, coherence='u_mass')
    coherences = cm.get_coherence_per_topic()

# write results
wf = open("./coherence-evals.txt", "w")
for i, week in enumerate(Weeks):
    wf.write(f"{week}\n")
    cs = coherences[i * num_topics:(i + 1) * num_topics]
    for c in cs:
        wf.write(f"{c}\n")
wf.close()
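# A possible follow-up (sketch, not in the original): average the per-topic
# u_mass scores within each week to get a single score per window.
week_avgs = {week: sum(coherences[i * num_topics:(i + 1) * num_topics]) / num_topics
             for i, week in enumerate(Weeks)}
print(week_avgs)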
# this is where you will start processing the data and apply the LDA technique
documents = data_df['hsp_account_id'].tolist()

dictionary = gensim.corpora.Dictionary(documents)
dictionary.filter_extremes(no_below=190, no_above=0.7)
dictionary.save("saveCorporaDicV4_20_250")

corpus = [dictionary.doc2bow(text) for text in documents]

# this is where you will apply the actual model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=250)
ldamodel.save('OpioidLdamodelV4_20_250.lda')

lda = gensim.models.ldamodel.LdaModel.load('SUD_20_250_70.lda')
cm = CoherenceModel(model=lda, texts=documents, corpus=corpus, coherence='c_v')
coherence = cm.get_coherence()
ldaP = lda.print_topics(num_topics=20, num_words=25)
print(coherence)
######################### Prepared ####################
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# to have logs
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

################# LDA #####################
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, update_every=1, chunksize=100, passes=1)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, minimum_probability=0.01)
ldamodel.print_topics(num_topics=2, num_words=20)

cm = CoherenceModel(model=ldamodel, corpus=corpus, dictionary=dictionary, coherence='u_mass')  # note that a dictionary has to be provided.
print(cm.get_coherence())

# print("...........................get_document_topics.....................")
# print(ldamodel.get_document_topics(corpus[100], minimum_probability=None))

# ################ NMF #####################
# model = NMF(n_components=2, init='random', random_state=0)
# model.fit(dictionary)
# NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200,
#     n_components=2, nls_max_iter=2000, random_state=0, shuffle=False,
#     solver='cd', sparseness=None, tol=0.0001, verbose=0)
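# The commented-out NMF block above fits the model on `dictionary`, which is a
# gensim Dictionary rather than a matrix. A minimal sketch of how sklearn's NMF
# is usually applied instead, assuming the raw documents are available as
# strings (`raw_docs` is hypothetical, not part of the original):
# from sklearn.decomposition import NMF
# from sklearn.feature_extraction.text import TfidfVectorizer
#
# vectorizer = TfidfVectorizer(max_features=5000)
# dtm = vectorizer.fit_transform(raw_docs)  # documents x terms matrix
# nmf = NMF(n_components=2, init='random', random_state=0)
# doc_topic = nmf.fit_transform(dtm)        # document-topic weights
# topic_term = nmf.components_              # topic-term weights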
corpus.append(dictionary.doc2bow(tokens))

print("Corpus and dictionary created!")

for num_topic in num_topics:
    print("Number of topics: ", num_topic)
    ldamodel = LdaModel(corpus=corpus, num_topics=num_topic, id2word=dictionary, iterations=20)
    topics = ldamodel.print_topics(num_words=10)
    num_topic_to_topics[num_topic] = topics
    num_topic_to_models[num_topic] = ldamodel
    coherence_model_lda = CoherenceModel(model=ldamodel, corpus=corpus,
                                         dictionary=dictionary, coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score (U_Mass): ', coherence_lda)

pickle.dump(num_topic_to_models, open("num_topic_to_models.dict", "wb"))

"""
Number of topics:  20
Coherence Score (U_Mass):  -0.42236587843071566
Number of topics:  50
Coherence Score (U_Mass):  -0.5597070191943092
Number of topics:  100
Coherence Score (U_Mass):  -0.6293096290358676
--------------------------------------------------------------------------
"""
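# A sketch of a follow-up step (not in the original): with u_mass, scores are
# negative and higher (closer to zero) is better, so the best topic count can be
# read off the recorded scores. `num_topic_to_coherence` is hypothetical; the
# loop above would need to store each score in it.
# best_k = max(num_topic_to_coherence, key=num_topic_to_coherence.get)
# best_model = num_topic_to_models[best_k]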
def coherence(self, corpus):
    coherence_model = CoherenceModel(model=self.lda,
                                     texts=corpus.tokens,
                                     dictionary=corpus.dictionary,
                                     coherence='c_uci')
    return coherence_model.get_coherence()
print()
print("[ %s ]Showing Topics:" % (time.asctime(time.localtime(time.time()))))
for topic in good_lda.show_topics():
    print(topic)
print()
for topic in bad_lda.show_topics():
    print(topic)
print()

# pyLDAvis.enable_notebook()
good_vis = pyLDAvis.gensim.prepare(good_lda, corpus, dictionary)
pyLDAvis.save_html(good_vis, './html/good_lda_' + str(args.topics) + '.html')

good_cm = CoherenceModel(model=good_lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print("good_lda u_mass : ", good_cm.get_coherence())
good_cm = CoherenceModel(model=good_lda, texts=texts, dictionary=dictionary, coherence='c_v')
print("good_lda c_v : ", good_cm.get_coherence())

bad_vis = pyLDAvis.gensim.prepare(bad_lda, corpus, dictionary)
pyLDAvis.save_html(bad_vis, './html/bad_lda_' + str(args.topics) + '.html')

bad_cm = CoherenceModel(model=bad_lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print("bad_lda u_mass : ", bad_cm.get_coherence())
def train_model(self, topic_docs, num_topics, model_name, blnSaveinDB=False,
                blnSaveTrainedModelFiles=False, txtFileName=None,
                model_type='both', lda_num_of_iterations=150,
                delete_stop_words=True, lemmatize_words=True, delete_numbers=True):

    # starttime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # print("Executing train_model... Started at: " + starttime)

    doc_clean = [self.clean_docs(doc, delete_numbers, delete_stop_words, lemmatize_words).split()
                 for doc in topic_docs]

    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    self.dictionary = corpora.Dictionary(doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    self.doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in doc_clean]

    file_data = []
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if model_type in ('lda', 'both'):
        # Build the LDA model
        self.lda_model = gensim.models.LdaModel(corpus=self.doc_term_matrix,
                                                num_topics=num_topics,
                                                id2word=self.dictionary,
                                                iterations=lda_num_of_iterations)

        # get LDA coherence (computed once, not once per topic)
        self.lda_coh_u_mass = CoherenceModel(model=self.lda_model, corpus=self.doc_term_matrix,
                                             dictionary=self.dictionary, coherence='u_mass')
        self.lda_coh_c_v = CoherenceModel(model=self.lda_model, texts=doc_clean,
                                          dictionary=self.dictionary, coherence='c_v')
        lda_u_mass = str(self.lda_coh_u_mass.get_coherence())
        lda_c_v = str(self.lda_coh_c_v.get_coherence())

        # create one record per topic with the lda results
        for idx in range(num_topics):
            file_data.append({
                "model_name": model_name,
                "model_type": "lda",
                "timestamp": timestamp,
                "no_tweets": str(len(topic_docs)),
                "coh_u_mass": lda_u_mass,
                "coh_c_v": lda_c_v,
                "topic_no": str(idx + 1),
                # note: print_topic's second argument is the number of words to show
                "topic": str(self.lda_model.print_topic(idx, num_topics)).replace('"', "-"),
            })

    if model_type in ('lsi', 'both'):
        # Build the LSI model
        self.lsi_model = gensim.models.LsiModel(corpus=self.doc_term_matrix,
                                                num_topics=num_topics,
                                                id2word=self.dictionary)

        # get LSI coherence
        self.lsi_coh_u_mass = CoherenceModel(model=self.lsi_model, corpus=self.doc_term_matrix,
                                             dictionary=self.dictionary, coherence='u_mass')
        self.lsi_coh_c_v = CoherenceModel(model=self.lsi_model, texts=doc_clean,
                                          dictionary=self.dictionary, coherence='c_v')
        lsi_u_mass = str(self.lsi_coh_u_mass.get_coherence())
        lsi_c_v = str(self.lsi_coh_c_v.get_coherence())

        # create one record per topic with the lsi results
        for idx in range(num_topics):
            file_data.append({
                "model_name": model_name,
                "model_type": "lsi",
                "timestamp": timestamp,
                "no_tweets": str(len(topic_docs)),
                "coh_u_mass": lsi_u_mass,
                "coh_c_v": lsi_c_v,
                "topic_no": str(idx + 1),
                "topic": str(self.lsi_model.print_topic(idx, num_topics)).replace('"', "-"),
            })

    # Save in the mongoDB collection if asked
    if blnSaveinDB:
        if self.db is not None:
            self.c_topics.insert_many(file_data)
        else:
            print("Can't save topics in db. No mongoDB connection was set up.")

    # Save results in a text file
    if txtFileName is not None:
        with open(txtFileName, 'w', encoding="utf-8") as outfile:
            json.dump(file_data, outfile)

    # Save models into file
    if blnSaveTrainedModelFiles:
        # creates the path if it does not exist
        if not os.path.exists(self.folder_path + "/trained_models/"):
            os.makedirs(self.folder_path + "/trained_models/")
        self.lda_model.save(self.folder_path + "/trained_models/" + model_name + "_lda_model.model")
        self.dictionary.save(self.folder_path + "/trained_models/" + model_name + "_dictionary.dict")
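# A hypothetical call, assuming `analyzer` is an instance of the class that owns
# train_model and `tweets` is a list of raw documents (neither is defined above):
# analyzer.train_model(tweets, num_topics=10, model_name="opioid_tweets",
#                      model_type='lda', blnSaveTrainedModelFiles=True,
#                      txtFileName="./topics.json")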