def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
               coherence='u_mass', topn=20, processes=-1):
    """Rank the model's topics by coherence, most coherent first.

    Parameters
    ----------
    corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
        Corpus forwarded to the coherence model. Either an iterable of
        documents, each a list of `(word_id, word_count)`, or a sparse csc
        matrix of BOWs for each document.
    texts : list of list of str, optional
        Tokenized texts, needed for coherence measures that use a sliding
        window based probability estimator (i.e. coherence=`c_something`).
    dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Mapping from word id to word. Not needed when `model.id2word` is
        present; if both are available the passed `dictionary` is used.
    window_size : int, optional
        Window size for coherence measures that use a boolean sliding window
        as their probability estimator. Irrelevant for 'u_mass'. If None, the
        defaults are used: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
    coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
        Coherence measure to be used. 'u_mass' is the fastest; 'c_uci' is
        also known as `c_pmi`. 'u_mass' needs `corpus` (if only `texts` is
        given it is converted using the dictionary); 'c_v', 'c_uci' and
        'c_npmi' need `texts`.
    topn : int, optional
        Number of top words extracted from each topic.
    processes : int, optional
        Number of processes for the probability estimation phase; any value
        less than 1 is interpreted as num_cpus - 1.

    Returns
    -------
    list of (list of (float, str), float)
        One pair per topic, sorted by descending coherence: the topic's
        `topn` strongest terms as `(weight, token)` tuples, and the topic's
        coherence score.

    """
    coherence_model = CoherenceModel(
        model=self,
        corpus=corpus,
        texts=texts,
        dictionary=dictionary,
        window_size=window_size,
        coherence=coherence,
        topn=topn,
        processes=processes,
    )
    per_topic_scores = coherence_model.get_coherence_per_topic()

    def strongest_terms(term_weights):
        # term_weights holds one float per vocabulary term for a single topic.
        top_ids = matutils.argsort(term_weights, topn=topn, reverse=True)
        return [(term_weights[term_id], self.id2word[term_id]) for term_id in top_ids]

    topic_summaries = [strongest_terms(weights) for weights in self.get_topics()]

    ranked = list(zip(topic_summaries, per_topic_scores))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
#print(i,row) for j,(topic_num,prop_topic) in enumerate(row): w.write(str(prop_topic)) w.write(',') w.write('\b\n') print('\nNumer of topic : ', num_topic ) print('\nCoherence Score: ', np.round(coherence,4)) ''' # RUNNING MODEL WITH MALLET mallet_path = 'mallet-2.0.8/bin/mallet' model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=15, id2word=id2word, workers = 2, random_seed = 0) # Compute Coherence Score coherence_model_lda = CoherenceModel(model=model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) print('--------------------3') #optimal_model = model #model_topics = optimal_model.show_topics(formatted=False) ldamodel=model with open ('am_LDA_wmallet_metadata.csv', 'w') as w: for i,row in enumerate(ldamodel[corpus]): w.write(asin[i]) w.write(',') #print(i,row) for j,(topic_num,prop_topic) in enumerate(row): w.write(str(prop_topic))
doc_lda = lda_model[corpus] # In[44]: # Model perplexity and topic coherence provide a convenient # measure to judge how good a given topic model is. # Compute Perplexity print('\nPerplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better. # In[46]: # Compute Coherence Score coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) # In[48]: # Visualize the topics pyLDAvis.enable_notebook(sort=True) vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word) # In[49]: pyLDAvis.display(vis) # In[109]:
# Check resulting topics. listOfTopics = ldaModel.print_topics(num_topics=numberOfTopics, num_words=15) for index, i in enumerate(listOfTopics): string = str(i[1]) for c in "0123456789+*\".": string = string.replace(c, "") string = string.replace(" ", " ") print(string) # calculate & display perplexity print('\nPerplexity: ', ldaModel.log_perplexity( corpus)) # a measure of how good the model is. lower the better. # calculate & display coherence coherenceModel = CoherenceModel(model=ldaModel, texts=document, dictionary=dictionary, coherence='c_v') ldaCoherence = coherenceModel.get_coherence() print('\nCoherence Score: ', ldaCoherence) # assign a file name based on the loop number so that models aren't overridden during successive iterations. path = './models/both/nouns_only' if not os.path.exists(path): os.makedirs(path) ldaModel.save(f'./models/both/nouns_only/model1-{loopNum}.model') # save the model dataframe for use in later sections. modelDataframe.to_csv('./dataframes/model_df.csv')
def most_similar_texts(self, X, num_examples, text_column_name, num_topics=None):
    """
    Uses NMF clustering to create n topics based on adjusted word frequencies

    Parameters
    --------
    X: DataFrame
        Must contain `text_column_name`. Rows whose text is missing or is one
        of the placeholders "", " ", "NA", "n/a", "N/A", "na" are ignored.
        The caller's frame is not modified.
    num_examples: int
        Number of top words/phrases and of sample texts reported per topic.
    text_column_name: str
        Name of the column holding the raw text.
    num_topics: int
        Optional - if none, a small grid of topic counts is scored with
        gensim NMF models and the count with the best u_mass coherence wins.

    Returns
    --------
    str
        "Too few examples to categorize." when fewer than 20 usable rows
        remain after filtering. Otherwise the tuple below.
    topic_words_df: DataFrame
        One row per topic: topic number, number of texts assigned, top
        words/phrases and `example0..exampleN` sample texts (the texts
        with the highest loading on that topic).
    combined_df: DataFrame
        Columns "index" (original index), `text_column_name` and
        "top_topic_num", in the filtered input order.
    """
    # Drop rows with missing / placeholder text in a single pass.
    placeholders = ["", " ", "NA", "n/a", "N/A", "na"]
    text_col = X[text_column_name]
    X = X[~text_col.isna() & ~text_col.isin(placeholders)]

    # Guard BEFORE vectorizing: with min_df=5 and max_df=0.4 the vectorizer
    # itself raises on very small inputs, which made this check unreachable
    # for exactly the inputs it is meant to catch.
    if X.shape[0] < 20:
        return "Too few examples to categorize."

    all_stop_words = (set(ENGLISH_STOP_WORDS)
                      | {"-PRON-", " "}
                      | set(string.punctuation))

    ct = CleanText()
    vectorizer = TfidfVectorizer(
        tokenizer=ct.lematize,
        ngram_range=(1, 3),
        # A list is accepted by every scikit-learn version; recent versions
        # reject a set during parameter validation.
        stop_words=sorted(all_stop_words),
        min_df=5,
        max_df=0.4,
    )
    # toarray() yields a plain ndarray; todense() yields np.matrix, which
    # recent scikit-learn estimators refuse.
    vectors = vectorizer.fit_transform(X[text_column_name]).toarray()

    # Adding words/phrases used in text data frequencies back into the dataset
    # (so we can see feature importances later)
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = list(vectorizer.get_feature_names_out())
    else:  # older scikit-learn
        vocab = vectorizer.get_feature_names()
    vector_df = pd.DataFrame(vectors, columns=vocab, index=X.index)

    if not num_topics:
        # In case 1, add 1 to get at least 2
        # The rest are based on eyeballing numbers
        n_docs = X.shape[0]
        min_topics = ceil(n_docs * 0.01) + 1
        max_topics = ceil(n_docs * 0.2)
        step = ceil((max_topics - min_topics) / 5)
        topic_nums = list(range(min_topics, max_topics, step))

        texts = X[text_column_name].apply(ct.lematize).tolist()

        # In gensim a dictionary is a mapping between words and their integer id
        dictionary = Dictionary(texts)

        # Filter out extremes to limit the number of features
        dictionary.filter_extremes(no_below=2, no_above=0.85, keep_n=5000)

        # Create the bag-of-words format (list of (token_id, token_count))
        corpus = [dictionary.doc2bow(text) for text in texts]

        coherence_scores = []
        for num in topic_nums:
            model = nmf.Nmf(
                corpus=corpus,
                num_topics=num,
                id2word=dictionary,
                chunksize=2000,
                passes=5,
                kappa=0.1,
                minimum_probability=0.01,
                w_max_iter=300,
                w_stop_condition=0.0001,
                h_max_iter=100,
                h_stop_condition=0.001,
                eval_every=10,
                normalize=True,
                random_state=42,
            )
            cm = CoherenceModel(model=model,
                                texts=texts,
                                dictionary=dictionary,
                                coherence="u_mass")
            coherence_scores.append(round(cm.get_coherence(), 5))

        # First candidate with the highest coherence (same tie-breaking as a
        # stable descending sort).
        chosen_num_topics = max(zip(topic_nums, coherence_scores),
                                key=itemgetter(1))[0]
    else:
        chosen_num_topics = num_topics

    model = NMF(n_components=chosen_num_topics, random_state=42)
    model.fit(vectors)
    component_loadings = model.transform(vectors)

    top_topics = pd.DataFrame(np.argmax(component_loadings, axis=1),
                              columns=["top_topic_num"])
    top_topic_loading = pd.DataFrame(np.max(component_loadings, axis=1),
                                     columns=["top_topic_loading"])

    # Non-inplace resets: X is a filtered slice, so in-place mutation would
    # trigger pandas' chained-assignment warning.
    X = X.reset_index(drop=False)
    vector_df = vector_df.reset_index(drop=True)

    # Fix for duplicate text_column_name
    vector_df.columns = [str(col) + "_vector" for col in vector_df.columns]

    # Returned frame keeps the filtered input order ...
    combined_df = pd.concat([X, vector_df, top_topics], axis=1)
    # ... while sample texts are drawn from a copy ranked by loading. The
    # original sorted frame was overwritten before use, so the samples were
    # just the first rows of each topic.
    ranked_df = pd.concat([combined_df, top_topic_loading], axis=1).sort_values(
        by="top_topic_loading", ascending=False)

    topic_words = {}
    sample_texts_lst = []
    for topic, comp in enumerate(model.components_):
        word_idx = np.argsort(comp)[::-1][:num_examples]
        topic_words[topic] = [vocab[i] for i in word_idx]
        in_topic = ranked_df[ranked_df["top_topic_num"] == topic]
        sample_texts_lst.append(
            list(in_topic[text_column_name].values[:num_examples]))

    topic_words_df = pd.DataFrame(columns=[
        "topic_num",
        "num_in_category",
        "top_words_and_phrases",
        "sample_texts",
    ])
    topic_words_df["topic_num"] = list(topic_words)
    # Aligned on the topic number; topics with no assigned text stay NaN.
    topic_words_df["num_in_category"] = (
        combined_df.groupby("top_topic_num").count().iloc[:, 0])
    topic_words_df["top_words_and_phrases"] = list(topic_words.values())
    topic_words_df["sample_texts"] = sample_texts_lst

    # One column per sample text: example0, example1, ...
    topic_words_explode = pd.DataFrame(
        topic_words_df["sample_texts"].tolist(),
        index=topic_words_df.index,
    )
    topic_words_explode.columns = [
        "example{}".format(num)
        for num in range(len(topic_words_explode.columns))
    ]

    concated_topics = pd.concat(
        [
            topic_words_df[[
                "topic_num", "num_in_category", "top_words_and_phrases"
            ]],
            topic_words_explode,
        ],
        axis=1,
    )

    print("Topics created with top words & example texts:")
    print(concated_topics)

    return (
        concated_topics,
        combined_df[["index", text_column_name, "top_topic_num"]],
    )
# ### Evaluate - model #1 # In[91]: # calculate perplexity metrics perplexity = model_lda.log_perplexity(corpus_train) perplexity # In[92]: # TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param # calculate coherence metric coherence = CoherenceModel(model=model_lda, texts=filtered_data, dictionary=id_to_word, coherence='c_v') coherence_1 = coherence.get_coherence() coherence_1 # In[94]: # calculate coherence metric or each of the n topicss coherence_1 = coherence.get_coherence_per_topic() coherence_1 # In[97]:
# Running and Trainign LDA model on the document term matrix. lda_model = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary, random_state=500, passes=passes) #pprint(lda_model.print_topics(num_words=30)) print("--------------- TOPICs Using LDA------------------------------") for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=30): print(str(i) + ": " + topic) print() print( "----------------Perplexity and Coherence Score for LDA----------------------------" ) print('\nPerplexity: ', lda_model.log_perplexity( doc_term_matrix)) # a measure of how good the model is. lower the better. # Compute Coherence Score coherence_model_lda = CoherenceModel(model=lda_model, texts=all_data, dictionary=dictionary, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda)
passes = 10, eval_every=1, workers = None) lda_model.print_topics() for idx, topic in lda_model.print_topics(-1): print("Category: {} \nWords: {}".format(idx, topic )) print("\n") # Compute Perplexity print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus_dict)) #Perplexity: -6.83 # Compute Coherence Score coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dict_corpus, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) #Coherence Score: 0.4264283394676994 def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3): """ Compute c_v coherence for various number of topics Parameters: ---------- dictionary : Gensim dictionary corpus : Gensim corpus texts : List of input texts limit : Max num of topics
def getCoherence(self):
    """Return the aggregate c_v coherence of `self.model`.

    The score is estimated from `self.data` (passed as the coherence model's
    texts) using `self.id2word` as the dictionary.
    """
    return CoherenceModel(
        model=self.model,
        texts=self.data,
        dictionary=self.id2word,
        coherence='c_v',
    ).get_coherence()
for n, topic in lm.show_topics(num_topics=-1, formatted=False): topic = [word for word, _ in topic] cm = CoherenceModel(topics=[topic], texts=texts, dictionary=dictionary, window_size=10) coherence_values[n] = cm.get_coherence() top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True) return lm, top_topics lm, top_topics = ret_top_model() print(top_topics[:5]) pprint([lm.show_topic(topicid) for topicid, c_v in top_topics[:10]]) ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)] lda_coherence = CoherenceModel(topics=ldatopics, texts=texts, dictionary=dictionary, window_size=10).get_coherence() def evaluate_bar_graph(coherences, indices): """ Function to plot bar graph. coherences: list of coherence values indices: Indices to be used to mark bars. Length of this and coherences should be equal. """ assert len(coherences) == len(indices) n = len(coherences) x = np.arange(n) plt.bar(x, coherences, width=0.2, tick_label=indices, align='center') plt.xlabel('Models') plt.ylabel('Coherence Value')
def main():
    """Fit an LDA model on one day of preprocessed tweets and report on it.

    Reads the `text_preprocessed` field of every document dated 2020-12-08
    from the module-level `collection`, builds a gensim dictionary and
    bag-of-words corpus, trains an LDA model with `NUM_TOPICS` topics, prints
    the topics, perplexity and c_v coherence, and saves a pyLDAvis
    visualization to `LDA_Visualization.html`.
    """
    # ---- Load tweet texts from MongoDB --------------------------------
    print('\nLoading from MongoDB..')
    cursor = collection.find({"tweet_date":"2020-12-08"})
    raw_texts = [doc["text_preprocessed"] for doc in cursor]

    # ---- Dictionary ---------------------------------------------------
    print('\nCreating dictionary..')
    data = [text.split() for text in raw_texts]
    dictionary = gensim.corpora.Dictionary(data)
    # Drop tokens seen in fewer than 2 documents or in more than 99% of them.
    dictionary.filter_extremes(no_below=2, no_above=.99)

    # ---- Bag-of-words corpus ------------------------------------------
    print('\nCreating corpus..')
    corpus = [dictionary.doc2bow(tokens) for tokens in data]

    # ---- Base LDA model -----------------------------------------------
    print('\nBuilding LDA model..')
    LDA_model = gensim.models.LdaModel(
        corpus=corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        passes=5,
    )

    # ---- Topics as plain word lists -----------------------------------
    print('\nTopics:')
    # print_topics() yields (topic_id, formatted_string); the words are the
    # double-quoted parts of the formatted string.
    quoted_words = [re.findall(r'"([^"]*)"', entry[1])
                    for entry in LDA_model.print_topics()]
    for topic_idx, word_list in enumerate(quoted_words):
        print(f"------ Topic {topic_idx} ------")
        print(' '.join(word_list[0:10]), end="\n\n")

    # ---- Topics with probabilities ------------------------------------
    print('\nTopics with propabilities:')
    for entry in LDA_model.print_topics():
        for part in entry:
            print(part)

    # ---- Top 20 terms of each topic -----------------------------------
    print('\nMost frequent words by topic:')
    topic_words = [
        [dictionary[pair[0]] for pair in LDA_model.get_topic_terms(topic_idx, 20)]
        for topic_idx in range(NUM_TOPICS)
    ]
    for topic_idx in range(NUM_TOPICS):
        print(f"\n------ Topic {topic_idx} ------")
        print(topic_words[topic_idx])

    # ---- Coherence and perplexity -------------------------------------
    print('\nComputing Coherence and Perplexity..')
    base_perplexity = LDA_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    coherence_lda_model_base = CoherenceModel(
        model=LDA_model,
        texts=data,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)

    # ---- Topic distance visualization ---------------------------------
    print('\nCreating visualization..')
    visualisation = pyLDAvis.gensim.prepare(LDA_model, corpus, dictionary)
    pyLDAvis.save_html(visualisation, 'LDA_Visualization.html')
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data): # Init output sent_topics_df = pd.DataFrame() # Get main topic in each document for i, row in enumerate(ldamodel[corpus]): row = sorted(row, key=lambda x: (x[1]), reverse=True) # Get the Dominant topic, Perc Contribution and Keywords for each document for j, (topic_num, prop_topic) in enumerate(row): if j == 0: # => dominant topic wp = ldamodel.show_topic(topic_num) topic_keywords = ", ".join([word for word, prop in wp]) sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True) else: break sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'] # Add original text to the end of the output contents = pd.Series(texts) sent_topics_df = pd.concat([sent_topics_df, contents], axis=1) return(sent_topics_df) def sent_to_words(sentences): for sentence in sentences: yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations # Define functions for stopwords, bigrams, trigrams and lemmatization def remove_stopwords(texts): return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts] def make_bigrams(texts): return [bigram_mod[doc] for doc in texts] def make_trigrams(texts): return [trigram_mod[bigram_mod[doc]] for doc in texts] def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): """https://spacy.io/api/annotation""" texts_out = [] for sent in texts: doc = nlp(" ".join(sent)) texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) return texts_out bfinputdir = '/home/mike/QA_Project/lda_Data/Bug_Fix_sentiment_Data/' biinputdir = '/home/mike/QA_Project/lda_Data/Bug_intro_sentiment_Data/' bfout = '/home/mike/QA_Project/lda_Data/Bug_Fix_topic_output/' biout = '/home/mike/QA_Project/lda_Data/Bug_intro_topic_output/' bf_list= [] bi_list= [] total_list= [] for file in os.listdir(bfinputdir): 
file with open (bfinputdir+file, "r", encoding="utf8") as f: list1= [] name= file.split('.')[0] new_file = open(bfout+name+'_Lda1.txt','w') for line in f: line=line.rstrip() list1.append(line) bf_list.append(line) total_list.append(line) # Remove Emails list1 = [re.sub('\S*@\S*\s?', '', sent) for sent in list1] # Remove new line characters list1 = [re.sub('\s+', ' ', sent) for sent in list1] # Remove distracting single quotes list1 = [re.sub("\'", "", sent) for sent in list1] pprint(list1[:1]) data_words = list(sent_to_words(list1)) print(data_words[:1]) # Build the bigram and trigram models bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases. trigram = gensim.models.Phrases(bigram[data_words], threshold=100) # Faster way to get a sentence clubbed as a trigram/bigram bigram_mod = gensim.models.phrases.Phraser(bigram) trigram_mod = gensim.models.phrases.Phraser(trigram) # See trigram example print(trigram_mod[bigram_mod[data_words[0]]]) # Remove Stop Words data_words_nostops = remove_stopwords(data_words) # Form Bigrams data_words_bigrams = make_bigrams(data_words_nostops) # Initialize spacy 'en' model, keeping only tagger component (for efficiency) # python3 -m spacy download en nlp = spacy.load('en', disable=['parser', 'ner']) # Do lemmatization keeping only noun, adj, vb, adv data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) print(data_lemmatized[:1]) # Create Dictionary id2word = corpora.Dictionary(data_lemmatized) # Create Corpus texts = data_lemmatized # Term Document Frequency corpus = [id2word.doc2bow(text) for text in texts] # View print(corpus[:1]) # Build LDA model lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=20, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True) # Print the Keyword in the 10 topics pprint(lda_model.print_topics()) doc_lda = lda_model[corpus] # Compute 
Perplexity print('\nPerplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better. # Compute Coherence Score coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) # Visualize the topics pyLDAvis.enable_notebook() vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word) vis df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data) # Format df_dominant_topic = df_topic_sents_keywords.reset_index() df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'] # Show df_dominant_topic.head(10) # Number of Documents for Each Topic topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts() # Percentage of Documents for Each Topic topic_contribution = round(topic_counts/topic_counts.sum(), 4) # Topic Number and Keywords topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']] # Concatenate Column wise df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1) # Change Column names df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents'] # Show df_dominant_topics
from model.util.file_parser import parse_dir_json

# Script entry point: score a previously trained LDA model with c_v coherence
# and append "<topics>;<coherence>" to the configured CSV file.
if __name__ == '__main__':
    init_logger()
    log = logging.getLogger('lda_model')
    config = LdaConfig(sys.argv[1], 'lda_model').get_current_config()

    # parse_dir_json yields pairs; only the second element (the document) is used.
    _, docs = zip(*parse_dir_json(config['data_path']))
    preprocessed_docs = Preprocessor(
        max_workers=config['max_workers']).process_docs(docs)

    log.info("Loading model from %s", config['model_path'])
    lda_model = LdaMulticore.load(config['model_path'])

    log.info("Loading dictionary from %s", config['dict_path'])
    dictionary = Dictionary.load(config['dict_path'])

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=preprocessed_docs,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    import csv

    # newline='' is what the csv module requires of files it writes to;
    # without it, platforms that translate line endings emit blank rows.
    with open(config['coherence_path'], "a", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=';')
        writer.writerow([config['topics'], coherence_lda])
def abandon():
    """Tokenize the IRA tweets, dump word counts and sweep LDA topic counts.

    Reads `tweet_text` from data/ira-tweets-ele.csv, removes stop words,
    writes the token frequencies to data/word_cloud.json, then for every
    topic count from 2 to 11 trains an LDA model, saves it to
    model/lda-ira-{n}.mod and appends perplexity, c_v coherence, the average
    pairwise cosine distance between topics and the top 20 words of each
    topic to data/IRA_topics.txt.
    """
    stopWords = set(stopwords.words('english'))
    stopWords.update(string.punctuation)
    stops_words = [
        "rt", "…", "...", "URL", "http", "https", "“", "”", "‘", "’", "get",
        "2", "new", "one", "i'm", "make", "go", "good", "say", "says", "know",
        "day", "..", "take", "got", "1", "going", "4", "3", "two", "n",
        "like", "via", "u", "would", "still", "first", "really", "watch",
        "see", "even", "that's", "look", "way", "last", "said", "let",
        "twitter", "ever", "always", "another", "many", "things", "may",
        "big", "come", "keep", "5", "time", "much", "want", "think", "us",
        "love", "people", "need"
    ]
    stopWords.update(stops_words)

    tokenizer = CustomTweetTokenizer(preserve_case=False,
                                     reduce_len=True,
                                     strip_handles=False,
                                     normalize_usernames=False,
                                     normalize_urls=True,
                                     keep_allupper=False)
    cnt = Counter()
    texts = []

    # loading data
    data = pd.read_csv("data/ira-tweets-ele.csv",
                       usecols=["tweet_text", "userid"])
    for _, row in tqdm(data.iterrows()):
        words = tokenizer.tokenize(row["tweet_text"])
        words = [w for w in words if w not in stopWords and w]
        cnt.update(words)
        texts.append(words)

    print(len(texts))
    # Context manager so the JSON file is flushed and closed deterministically.
    with open("data/word_cloud.json", "w") as word_cloud_file:
        json.dump(cnt.most_common(), word_cloud_file, indent=2)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    # The model is trained without id2word, so its topic "words" are token
    # ids rendered as strings; this maps them back. Built once, not per model.
    id2token = {v: k for k, v in dictionary.token2id.items()}

    def average_distance(v_tops):
        """Mean cosine distance over all unordered pairs of topic vectors."""
        _sum = 0
        _cnt = 0
        for i in range(len(v_tops)):
            for j in range(i + 1, len(v_tops)):
                _sum += scipy.spatial.distance.cosine(v_tops[i], v_tops[j])
                _cnt += 1
        return _sum / _cnt

    with open("data/IRA_topics.txt", "w") as f:
        for n in range(2, 12):
            print(f"N = {n}")
            lda = LdaModel(corpus, num_topics=n, random_state=42)
            v_topics = lda.get_topics()
            lda.save(f"model/lda-ira-{n}.mod")

            # NOTE(review): gensim documents log_perplexity() as a per-word
            # likelihood bound, not the perplexity itself - confirm how the
            # report is read.
            f.write(f"Perplexity: {lda.log_perplexity(corpus)}")

            # Compute Coherence Score.
            # c_v is estimated from tokenized texts; the original passed the
            # bag-of-words corpus as `texts` and no dictionary, so the sliding
            # window never saw real tokens.
            coherence_model_lda = CoherenceModel(model=lda,
                                                 texts=texts,
                                                 dictionary=dictionary,
                                                 coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            f.write(f"Coherence Score: {coherence_lda}")
            f.write(f"~Average distance: {average_distance(v_topics)}\n")

            # show
            x = lda.show_topics(num_topics=n, num_words=20, formatted=False)
            topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

            # Below Code Prints Topics and Words
            for topic, words in topics_words:
                f.write(
                    str(topic) + " :: " +
                    str([id2token[int(w)] for w in words]) + "\n")
            f.write("\n")
import joblib
from gensim.models import CoherenceModel

# Artifacts are loaded at import time, outside the __main__ guard.
# load the model
lda_model = joblib.load('62topiclda.pkl')
# load the dictionary
dictionary = joblib.load('dictionary.pkl')
# load the corpus
bow_corpus = joblib.load('bow_corpus.pkl')

if __name__ == "__main__":
    # Compute Coherence Score
    # NOTE(review): 'c_v' is estimated from tokenized texts (lists of str).
    # The name `bow_corpus` suggests bag-of-words data is passed as `texts`
    # here - confirm what the pickle actually contains.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=bow_corpus.tolist(), dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
# MANUAL APPROACH, CoherenceModel below does the same, but only provides the aggregated values output = model.top_topics(corpus=transtfidf, texts=minutes, coherence='u_mass', topn=20) topicScores = [item[1] for item in output] avgScore = 0 for score in topicScores: avgScore += score avgScoreArr.append(avgScore/num_topics) topicScoreArr.append(topicScores) print(avgScore, avgScore/num_topics) print("Starting to apply coherence model") cm = CoherenceModel( model=model, corpus=transtfidf, texts=minutes, dictionary=dct, coherence='u_mass' ) coherenceScoreAlt.append(round(cm.get_coherence(), 5)) # print("Finished using coherence model, next iteration") pickle.dump(topicScoreArr, open("coherenceDump", "wb")) pickle.dump(coherenceScoreAlt, open("coherenceDumpAlt", "wb")) # exit() coherenceScore = pickle.load(open("coherenceDump", "rb")) coherenceScoreAlt = pickle.load(open("coherenceDumpAlt", "rb"))
# In[31]: pprint(ldamodel.print_topics()) doc_lda = ldamodel[doc_term_matrix] # In[32]: # Compute Perplexity print('\nPerplexity: ', ldamodel.log_perplexity(doc_term_matrix)) # a measure of how good the model is. lower the better. # In[33]: # Compute Coherence Score coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) # ### 4.5 Visualize the topics # # *** # In[35]: pyLDAvis.enable_notebook() vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary) vis # In[36]:
# Collect every (model, data, dictionary) record stored back-to-back in the
# pickle stream, then compute one c_v coherence score per record.
model_list = []
data_list = []
dict_list = []
with open(file_name, 'rb') as f:
    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on files produced by a trusted run.
    while True:
        try:
            iteration, model, time_arr, data, id2word, _ = pickle.load(f)
        except EOFError:
            # End of the stream is the only expected stop condition; any
            # other failure (corrupt record, wrong tuple shape) now surfaces
            # instead of silently truncating the results.
            break
        model_list.append(model)
        data_list.append(data)
        dict_list.append(id2word)

coherence_list = []
count = 0
for model, data, id2word in zip(model_list, data_list, dict_list):
    print(id2word)
    count += 1
    print('Iteration ' + str(count))
    coherencemodel = CoherenceModel(model=model, texts=data, dictionary=id2word, coherence='c_v')
    coherence_list.append(coherencemodel.get_coherence())

# time_arr is the value from the last record read above.
with open(output, 'wb') as f:
    pickle.dump((coherence_list, time_arr), f)
def pickle_loc(t):
    """Return the pickle path for the model with `t` topics on the current texts."""
    return "lda_ml_pickles/lda_mp_{}_topics_{}_songs.pickle".format(
        t, len(texts))


# Sweep the number of topics, recording c_v coherence for each model.
min_topics = 3
max_topics = 100
topics_to_coherence = {}
for topics in range(min_topics, max_topics + 1):
    lda_model_dist = gensim.models.LdaMulticore(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=topics,
                                                random_state=100,
                                                chunksize=100,
                                                passes=10,
                                                alpha='symmetric',
                                                per_word_topics=True)
    coherence_model_lda_dist = CoherenceModel(model=lda_model_dist,
                                              texts=texts,
                                              dictionary=id2word,
                                              coherence='c_v')
    coherence_lda_dist = coherence_model_lda_dist.get_coherence()
    topics_to_coherence[topics] = coherence_lda_dist

    # Context managers close each file right away; the original left two
    # handles per iteration to the garbage collector.
    with open(pickle_loc(topics), 'wb') as model_file:
        pickle.dump(lda_model_dist, model_file)
    # Rewritten every iteration so partial results survive an interruption.
    with open("lda_ml_pickles/topics_to_coherence.pickle", "wb") as scores_file:
        pickle.dump(topics_to_coherence, scores_file)
    print("Done with {} topics using {} song records!".format(
        topics, len(texts)))
print("\n\nwhew, all done! :)")
# Show df3_dominant_topic.head(10) # In[46]: lda_model.log_perplexity(doc_term_matrix) #Perplexoity, lower the better # In[47]: from gensim.models import CoherenceModel coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenised_corpus, dictionary=dictionary,coherence="c_v") coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) # In[48]: def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3): """ Compute c_v coherence for various number of topics Parameters: ---------- dictionary : Gensim dictionary corpus : Gensim corpus
def runlda(rawdata):
    """Run an LDA topic-modelling pipeline over `rawdata` and return a JSON summary.

    The text is split into sentences (each sentence is treated as one
    document), tokenised, stop-word filtered, bigrammed and lemmatised. A
    small sweep of LdaMallet models is scored with c_v coherence, the best
    one is used to assign a dominant topic to every sentence, and the
    per-topic summary table is serialised.

    Parameters
    ----------
    rawdata : str
        Raw text; split with ``tokenize.sent_tokenize``.

    Returns
    -------
    str
        ``DataFrame.to_json()`` of a table with the columns
        ``Dominant_Topic``, ``Topic_Keywords``, ``Num_Documents`` and
        ``Percent_Documents``.

    Notes
    -----
    Relies on names defined outside this function: ``tokenize``, ``gensim``,
    ``simple_preprocess``, ``stop_words``, ``spacy``, ``corpora``,
    ``CoherenceModel``, ``operator``, ``os`` and ``pd``.
    """
    # Convert to list
    data = []
    data.extend(tokenize.sent_tokenize(rawdata))
    print(data)

    def sent_to_words(sentences):
        for sentence in sentences:
            # deacc=True removes punctuations
            yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

    # Tokenize into words
    print('Tokenizing')
    data_words = list(sent_to_words(data))

    # Build the bigram and trigram models
    print('Creating bigrams and trigrams')
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    print('Building bigram and trigram models')
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Define functions for stopwords, bigrams, trigrams and lemmatization
    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
                for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    # NOTE(review): make_trigrams (and therefore trigram_mod) is never called
    # below; kept so the pipeline's behaviour is unchanged.
    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
        """https://spacy.io/api/annotation

        NOTE: `allowed_postags` is currently ignored -- every token's lemma
        is kept. The default is a tuple to avoid a shared mutable default.
        """
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc])
        return texts_out

    # Remove Stop Words
    print('Removing stopwords')
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    print('Forming bigrams')
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Lemmatize. The POS tags are passed through, but the POS filter inside
    # lemmatization() is disabled, so all tokens are retained.
    print('Lemmatizing')
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    print('Creating dictionary')
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    print('Creating corpus')
    texts = data_lemmatized

    # Term Document Frequency
    print('Creating term frequency list')
    corpus = [id2word.doc2bow(text) for text in texts]

    cwd = os.getcwd()
    mallet_path = cwd + '/mallet-2.0.8/bin/mallet'  # update this path
    # NOTE(review): this model is trained but never used afterwards.
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=id2word)

    def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        coherence_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            print('Calculating {}-topic model'.format(num_topics))
            # Use the `dictionary` argument rather than the enclosing id2word
            # so the helper honours its own parameters.
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values

    # Can take a long time to run.
    limit = 5
    start = 4
    step = 1
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=start, limit=limit, step=step)

    # Print the coherence scores
    x = range(start, limit, step)
    for m, cv in zip(x, coherence_values):
        print("Num Topics =", m, " has Coherence Value of", round(cv, 6))

    # Select the model and print the topics
    index, value = max(enumerate(coherence_values), key=operator.itemgetter(1))
    print(index)
    optimal_model = model_list[index]
    model_topics = optimal_model.show_topics(num_topics=1000, formatted=False)

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=10,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Compute Perplexity
    print('Perplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda)

    def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
        """Return one row per document: dominant topic, its share, keywords, text."""
        # Collect plain rows and build the frame once. DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        rows = []

        # Get main topic in each document
        for row in ldamodel[corpus]:
            ranked = sorted(row, key=lambda x: (x[1]), reverse=True)
            if not ranked:
                # No topic assigned to this document: contributes no row.
                continue
            # Dominant topic, Perc Contribution and Keywords for the document
            topic_num, prop_topic = ranked[0]
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])

        sent_topics_df = pd.DataFrame(
            rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

        # Add original text to the end of the output
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    print('Verify topics')
    df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    print('Format key words display')

    # Number of Documents for Each Topic
    topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
    print('Topic count')

    # Percentage of Documents for Each Topic
    topic_contribution = round(topic_counts / topic_counts.sum(), 4)
    print('Topic contribution')

    # Keep the single highest-contribution sentence for each topic
    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                                 grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],
                                                axis=0)

    # Reset Index
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

    # Format
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

    # Topic Number and Keywords
    print('Add items')
    topic_num_keywords = sent_topics_sorteddf_mallet[['Topic_Num', 'Keywords']]
    print('Topic number and keywords')

    # Concatenate Column wise
    df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
    print('Concatenate column')

    # Change Column names
    df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Percent_Documents']
    print('Change column names')

    return df_dominant_topics.to_json()
max_words=5, colormap='tab10', color_func=lambda *args, **kwargs: cols[i], prefer_horizontal=1.0) topics = ldamodel.show_topics(formatted=False) fig, axes = plt.subplots(1, 2, figsize=(10,10), sharex=True, sharey=True) for i, ax in enumerate(axes.flatten()): fig.add_subplot(ax) topic_words = dict(topics[i][1]) cloud.generate_from_frequencies(topic_words, max_font_size=300) plt.gca().imshow(cloud) plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16)) plt.gca().axis('off') plt.subplots_adjust(wspace=0, hspace=0) plt.axis('off') plt.margins(x=0, y=0) plt.tight_layout() plt.show() pyLDAvis.enable_notebook() vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary=ldamodel.id2word, mds='mmds') vis coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts_lem, dictionary=dictionary1, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda)
# In[18]:


def _words_per_topic(topic_model):
    """Return the word list of every topic reported by `topic_model.show_topics`."""
    return [
        [term for term, _weight in weighted_terms]
        for _topic_id, weighted_terms in topic_model.show_topics(formatted=False)
    ]


lsitopics = _words_per_topic(lsimodel)
hdptopics = _words_per_topic(hdpmodel)
ldatopics = _words_per_topic(ldamodel)


# In[19]:


def _coherence_of(topic_words):
    """Score `topic_words` against `texts` with a sliding window of 10."""
    scorer = CoherenceModel(topics=topic_words, texts=texts,
                            dictionary=dictionary, window_size=10)
    return scorer.get_coherence()


# LSI and HDP are capped at their first ten topics; LDA uses all of them.
lsi_coherence = _coherence_of(lsitopics[:10])
hdp_coherence = _coherence_of(hdptopics[:10])
lda_coherence = _coherence_of(ldatopics)


# In[20]:
# View
print(corpus[:1])

# Human readable format of corpus (term-frequency)
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

# step 1 -- initialize a model
tfidf = gensim.models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

# 300-topic LSI model fitted on the TF-IDF weighted corpus
lsi_model = gensim.models.LsiModel(
    tfidf_corpus,
    id2word=id2word,
    num_topics=300,
)

# c_v coherence of the LSI topics
coherence_model = CoherenceModel(
    model=lsi_model,
    texts=texts,
    dictionary=id2word,
    coherence='c_v',
)
coherence = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence)

# Per-document dominant-topic table
df_topic_sents_keywords = model_visualization.format_topics_sentences(
    lsi_model, corpus, texts)
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# to get doc topics
df_dominant_topic['Keywords'][doc_num]

# to get doc topics dominant quality
df_dominant_topic['Dominant_Topic'][doc_num]
def get_coherence(model, text, dict):
    """Return the c_v coherence score of `model`.

    Parameters
    ----------
    model
        Topic model passed to ``CoherenceModel`` as ``model``.
    text
        Tokenized texts passed to ``CoherenceModel`` as ``texts``.
    dict
        Dictionary passed to ``CoherenceModel`` as ``dictionary``. (The name
        shadows the builtin but is kept for caller compatibility.)

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()``.
    """
    # Use the caller-supplied dictionary; the parameter used to be ignored
    # in favour of a module-level `id2word`.
    coherence_model = CoherenceModel(model=model, texts=text, dictionary=dict, coherence='c_v')
    return coherence_model.get_coherence()
texts = clean_text_list corpus = [id2word.doc2bow(text) for text in texts] best_coh_score = 0 best_topics = 0 for i in range(8, 30): lda_model_loop = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=i, random_state=100, update_every=1, chunksize=100, passes=20, alpha='auto', per_word_topics=True) coherence_model_lda = CoherenceModel(model=lda_model_loop, texts=texts, dictionary=id2word, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() if coherence_lda > best_coh_score: best_coh_score = coherence_lda best_topics = i print('Topics:', i) print('Conherence score: ', coherence_lda) print(best_coh_score) print(best_topics) lda_model_best = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=best_topics, random_state=100, update_every=1, chunksize=100,
wandb.init(
    config=config,
    project="topical_language_generation_sweeps",
)

# ---- data preparation ----
cached_dir = "/home/rohola/cached_models"
tokenizer = TransformerGPT2Tokenizer(cached_dir)
dataset = TopicalDataset(config.dataset_dir, tokenizer)
docs = [document for document in dataset]

# Vocabulary, pruned by the sweep's no_below / no_above settings
dictionary = Dictionary(docs)
dictionary.filter_extremes(
    no_below=config.no_below,
    no_above=config.no_above,
)

# Bag-of-words corpus and its TF-IDF weighted view
corpus = [dictionary.doc2bow(document) for document in docs]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# LSI model with the sweep's number of topics
lsi_model = LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=config.num_topics,
)

# Alternative scorer kept for reference:
# cm = CoherenceModel(model=lsi_model, corpus=corpus, coherence='u_mass')
cm = CoherenceModel(
    model=lsi_model,
    texts=docs,
    dictionary=dictionary,
    coherence='c_w2v',
)

# Report the coherence score to the sweep
wandb.log({"coherence": cm.get_coherence()})
lda_model = LdaModel(doc_term_matrix, num_topics=5, id2word=dictionary, iterations=10, random_state=2) # extract topics for headlines topics = lda_model.print_topics(num_topics=5, num_words=10) # pprint topics print(topics) # Code ends here # -------------- # coherence score coherence_model_lda = CoherenceModel(model=lda_model, texts=clean_headlines, dictionary=dictionary, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() # Function to calculate coherence values def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3): """ Compute c_v coherence for various number of topics Parameters:
word_dict = {}; for i in range(num_topics): words = model.show_topic(i, topn = 20); word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words]; return pd.DataFrame(word_dict); get_topics(lda, num_topics) pyLDAvis.enable_notebook() pyLDAvis.gensim.prepare(lda, corpus, id2word) lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)] ldatopics = [[word for word, prob in topic] for topicid, topic in lda.show_topics(formatted=False)] lsi_coherence = CoherenceModel(model=lsimodel,topics=lsitopics,dictionary=id2word, texts=train_headlines,window_size=10).get_coherence() lda_coherence = CoherenceModel(model=lda,topics=ldatopics,dictionary=id2word,texts=train_headlines,window_size=10).get_coherence() #lda_coherence =CoherenceModel(model=lsimodel, corpus=corpus, coherence='u_mass').get_coherence() def evaluate_bar_graph(coherences, indices): """ Function to plot bar graph. coherences: list of coherence values indices: Indices to be used to mark bars. Length of this and coherences should be equal. """ assert len(coherences) == len(indices) n = len(coherences) print(coherences) x = np.arange(n)
def compute_coherence_score():
    """Print the c_v coherence of the module-level `lda_model`.

    Reads the module-level names `lda_model`, `data_lemmatized` and
    `id2word`; returns nothing.
    """
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)


def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Build LDA model. Use the `dictionary` argument (not the global
        # id2word) so the function honours its own parameters.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        # Compute the score once; it is expensive and was evaluated twice.
        coherence = coherencemodel.get_coherence()
        print(coherence)
        coherence_values.append(coherence)

    return model_list, coherence_values


# Sweep settings, defined once and shared by the run and the plot.
limit = 40
start = 2
step = 6

# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=start, limit=limit, step=step)

# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Trailing comma makes this a one-element tuple; a bare parenthesised string
# is iterated character by character and labels the line "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

print("LDA Gensim Printing")
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
def trainlda(self, topics_n = 10):
    """Train an LdaMulticore model on per-user token lists and score it.

    Each value of ``self.user_sentences`` (a list of token lists) is
    flattened into one document. The dictionary is rebuilt from those
    documents when ``self.ngram`` is 1 or 2, then pruned and compacted.
    The trained model is stored on ``self.lda``.

    Returns a tuple
    ``(topics_n, u_mass, c_v, c_uci, c_npmi)`` of the topic count followed
    by the four coherence scores.
    """
    self.num_topics = topics_n

    # One flattened document per user.
    alltexts = [
        [token for sentence in user_sents for token in sentence]
        for user_sents in self.user_sentences.values()
    ]

    # Both supported n-gram modes build the dictionary the same way.
    if self.ngram == 1 or self.ngram == 2:
        self.ngram_dictionary = Dictionary(alltexts)

    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    vocab = self.ngram_dictionary
    vocab.filter_extremes(no_below=10, no_above=0.8)
    vocab.compactify()

    ngram_bow_corpus = [
        self.ngram_dictionary.doc2bow(document) for document in alltexts
    ]

    self.lda = LdaMulticore(
        ngram_bow_corpus,
        num_topics=topics_n,
        id2word=self.ngram_dictionary,
        workers=3,
    )

    # Top words of every topic, used as input to the coherence measures.
    topics = [
        [entry[0] for entry in self.lda.show_topic(topic_idx)]
        for topic_idx in range(self.lda.num_topics)
    ]

    # u_mass works from the bag-of-words corpus; the other three measures
    # use the tokenised texts.
    cm_umass = CoherenceModel(topics=topics, corpus=ngram_bow_corpus,
                              dictionary=self.ngram_dictionary, coherence='u_mass')
    cm_cv = CoherenceModel(topics=topics, texts=alltexts,
                           dictionary=self.ngram_dictionary, coherence='c_v')
    cm_cuci = CoherenceModel(topics=topics, texts=alltexts,
                             dictionary=self.ngram_dictionary, coherence='c_uci')
    cm_cnpmi = CoherenceModel(topics=topics, texts=alltexts,
                              dictionary=self.ngram_dictionary, coherence='c_npmi')

    return (
        topics_n,
        cm_umass.get_coherence(),
        cm_cv.get_coherence(),
        cm_cuci.get_coherence(),
        cm_cnpmi.get_coherence(),
    )