def compute_values(self, kmin, kmax, kstep):
    # NOTE: kmin, kmax and kstep are currently unused; the topic count is fixed at 20 below.
    # vectorize docs (dense array, as vec_to_biterms expects per-document term counts)
    vec = CountVectorizer()
    X = vec.fit_transform(self.docs).toarray()
    # get vocabulary and biterms from docs
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)
    # create a BTM and pass the biterms to train it
    btm = oBTM(num_topics=20, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    topic_summuary(btm.phi_wz.T, X, vocab, 10)
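# Hypothetical sketch (not part of the original code): the kmin/kmax/kstep parameters above
# go unused, which suggests a sweep over topic counts was intended. One way such a sweep
# could look, reusing the same CountVectorizer/vec_to_biterms/oBTM/topic_summuary calls;
# the method name sweep_topic_counts is an illustrative assumption.
def sweep_topic_counts(self, kmin, kmax, kstep):
    vec = CountVectorizer()
    X = vec.fit_transform(self.docs).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)
    for k in range(kmin, kmax + 1, kstep):
        # Fit one BTM per candidate topic count and report its coherence summary.
        btm = oBTM(num_topics=k, V=vocab)
        btm.fit_transform(biterms, iterations=100)
        print("num_topics = %s" % k)
        topic_summuary(btm.phi_wz.T, X, vocab, 10)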
def biterm_topic_model_topic_extraction():
    """
    Function performs topic extraction on Tweets using the online Biterm Topic Model (BTM).

    :return: None.
    """
    # Use raw term counts because BTM, like LDA, is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info(f"\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(f"{tf}\n")
    log.info(f"\n.get_feature_names - Array mapping from feature integer indices to feature name")
    log.info(f"{tf_feature_names}\n")

    # Convert the corpus of documents (vectorized text) to a numpy array.
    tf_array = tf.toarray()

    # Convert the dictionary of words (vocabulary) to a numpy array.
    tf_feature_names = np.array(tf_vectorizer.get_feature_names())

    # Get the biterms.
    biterms = vec_to_biterms(tf_array)

    # Create the online BTM.
    btm = oBTM(num_topics=20, V=tf_feature_names)

    print("\n\n Train Online BTM ..")
    for i in range(0, len(biterms), 100):  # process chunks of 100 texts
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    time.sleep(3)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(tf_array, axis=1), tf_feature_names, np.sum(tf_array, axis=0))
    # pyLDAvis.save_html(vis, './vis/online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, tf_array, tf_feature_names, 10)

    print("\n\n Texts & Topics ..")
    for i in range(1, 10):
        print("{} (topic: {})".format(slo_feature_series[i], topics[i].argmax()))
def main(start, end, increment):
    path = Path('C:/Data/Python/JobLoss')
    data_words = []
    with open(path / 'Processed.json') as f:
        data = json.load(f)
    for tweet in data:
        data_words.append(' '.join(tweet[1]))
    vec = CountVectorizer()
    X = vec.fit_transform(data_words).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)
    for k in range(start, end, increment):
        print('Model %s' % k)
        btm = oBTM(num_topics=k, V=vocab)
        # chunksize and iterations are expected to be defined elsewhere (e.g. as module-level constants).
        for i in range(0, len(biterms), chunksize):
            print('%s / %s' % (i, len(biterms)))
            biterms_chunk = biterms[i:i + chunksize]
            btm.fit(biterms_chunk, iterations=iterations)
        topics = btm.transform(biterms)
        vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
        pyLDAvis.save_html(vis, str(path / ('Visualizations/BTMVisualization%s.html' % k)))
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary

if __name__ == "__main__":
    texts = open('./data/reuters.titles').read().splitlines()[:50]

    # vectorize texts
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=20, V=vocab)

    print("\n\n Train BTM ..")
    topics = btm.fit_transform(biterms, iterations=100)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    # pyLDAvis.save_html(vis, './vis/simple_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    print("\n\n Texts & Topics ..")
    for i in range(len(texts)):
        print("{} (topic: {})".format(texts[i], topics[i].argmax()))
def estimate_BTM(fpath, arr):
    #Read in the data: the line below will need to be reconfigured for your filepath
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM estimation process for %s" % company_name)

    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets = company_data[~filter1].copy()
    #company_tweets2 = company_data[~filter1].copy()

    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'")   #replace closing smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'")   #replace opening smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"")  #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"")  #replace closing smart quotes with regular quotes
    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))

    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    #Perform standardization on the textual contents of the company's tweets:
    #No longer keeping newline chars in text; double spaces are replaced with single spaces; hashtag symbols are now kept
    #def standardize_text(df, text_field):
    #    df[text_field] = df[text_field].str.replace(r".", "")        #remove periods so acronyms count as one word
    #    df[text_field] = df[text_field].str.replace(r"&", "and")     #replace ampersands with 'and'
    #    df[text_field] = df[text_field].str.replace(r"http\S+", "")  #remove links
    #    df[text_field] = df[text_field].str.replace(r"http", "")     #ensure all links have been removed
    #    df[text_field] = df[text_field].str.replace(r"@\S+", "")     #remove @username mentions
    #    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?#@\'\`\"\_]", " ")  #replace anything that isn't a letter, number, or one of the listed symbols with a space
    #    df[text_field] = df[text_field].str.replace(r"@", "at")      #replace any remaining '@' symbols with 'at'
    #    df[text_field] = df[text_field].str.lower()                  #convert all remaining text to lowercase
    #    df[text_field] = df[text_field].str.replace(r"\s+", " ")     #replace double spaces with a single space
    #    return df
    textual_tweets = standardize_text(company_tweets, "Content")
    #Examine tweets after standardization has been performed:
    #print(textual_tweets["Content"].head(5))

    #Perform lemmatization on the textual contents of the tweets:
    ##!Code for this function derived from the following link: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    #from textblob import TextBlob, Word
    #def lem_with_postag(df, text_field):
    #    tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    #    output = []
    #    for tweet in df[text_field]:
    #        sent = TextBlob(tweet)
    #        words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    #        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    #        lemTweet = " ".join(lemmatized_list)
    #        output.append(lemTweet)
    #    return output
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))

    #Remove tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    #Remove rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]

    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(r"'", "")  #replace apostrophes in the initial set of stopwords with nothing
    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)
    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")

    #from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stop_words)

    ##Filter out tweets w/ less than 3 words after stop word removal:
    #def clean_tokenize(df, text_field, stop_set):
    #    output = []
    #    for tweet in df[text_field]:
    #        clean_toks = []
    #        for tok in tweet:
    #            if tok not in stop_set:
    #                clean_toks.append(tok)
    #        output.append(clean_toks)
    #    return output
    #from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)

    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()

    #Extract the remaining textual contents of tweets:
    #clean_tokens = cleanGlish_tweets2["clean_tokens"]
    #Doesn't hurt to examine some of them:
    #print(cleanGlish_tweets2["clean_tokens"].head(5))
    #print("Break")
    #x = vectorizer.fit_transform(clean_tokens)
    #x = vectorizer.fit_transform(cleanGlish_tweets2["clean_tokens"])
    #x = vectorizer.fit_transform(str(clean_tokens))
    #clean_tokens = [clean_tokens]
    #x = vectorizer.fit_transform(clean_tokens)
    #x = vectorizer.fit_transform(str(clean_tokens))
    #x = vectorizer.fit_transform(cleanGlish_tweets2["clean_tokens"].str)
    cleanGlish_tweets2["clean_tokens"] = [" ".join(tok) for tok in cleanGlish_tweets2["clean_tokens"].values]
    #print(cleanGlish_tweets2["clean_tokens"].head(5))
    #print("Break")
    clean_tweets = cleanGlish_tweets2["clean_tokens"]
    x = vectorizer.fit_transform(clean_tweets)

    #import matplotlib.pyplot as plt
    #from sklearn.cluster import KMeans
    #from sklearn.metrics import silhouette_score
    sum_squared_dists = []
    km_silh = []
    #Considering I have yet to see a best k greater than 13, I'm reducing K's range from (2, 21) to (2, 16)
    ##However, since this is lightning BTM, might as well revert back to K = range(2, 21)
    ##Might do that for an overnight run or something, reverting back to (2, 16) for now
    ##Actually, might as well get this going
    K = range(2, 21)
    for k in K:
        km = KMeans(n_clusters=k, max_iter=200, n_init=10)
        km = km.fit(x)
        preds = km.predict(x)
        silh = silhouette_score(x, preds)
        sum_squared_dists.append(km.inertia_)
        km_silh.append(silh)

    #Elbow plot (figpath is expected to be defined elsewhere, e.g. as a module-level constant):
    plt.plot(K, sum_squared_dists, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('%s Elbow Method for Optimal k' % company_name)
    #plt.show()
    for i in range(len(K)):
        label = "{:.2f}".format(arr[i])
        plt.annotate(label, (K[i], sum_squared_dists[i]), textcoords="offset points", xytext=(3, 5), ha='center', fontsize=5)
    figpath2 = figpath + str(company_name) + 'elbow.png'
    plt.savefig(figpath2)

    #######################################################################
    #See if silhouette scores are better for determining optimal k
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler()
    #Actually, think this can all be done above as well
    plt.figure(figsize=(7, 4))
    plt.title("%s Silhouette Scores" % company_name)
    plt.scatter(x=[i for i in range(2, 21)], y=km_silh, s=150, edgecolor='k')
    plt.grid(True)
    plt.xlabel("Number of clusters", fontsize=6)
    plt.ylabel("Silhouette score", fontsize=6)
    plt.xticks([i for i in range(2, 21)], fontsize=8)
    plt.yticks(fontsize=8)
    #plt.show()
    for i in range(len(K)):
        label = "{:.2f}".format(arr[i])
        plt.annotate(label, (K[i], km_silh[i]), textcoords="offset points", xytext=(0, 9), ha='center', fontsize=6)
    figpath3 = figpath + str(company_name) + 'silhouetteScores.png'
    plt.savefig(figpath3)

    #plt.figure(figsize=(10, 10))
    #plt.title("%s Silhouette Scores" % company_name)
    #plt.xlabel('k')
    #plt.ylabel('Silhouette Score')
    #plt.scatter(x=range(np.max(K)), y=km_silh)
    #plt.scatter(x=[i for i in range(2, np.max(K))], y=km_silh)
    #figpath4 = figpath + str(company_name) + '_silhouetteScoresAgain.png'
    #plt.savefig(figpath4)

    print("\nSilhouette scores:")
    for val in km_silh:
        print(val)

    #Function to calculate percent change in silhouette scores
    #Code derived from: https://stackoverflow.com/questions/30926840/how-to-check-change-between-two-values-in-percent
    #def get_change(current, previous):
    #    if current == previous:
    #        return 0
    #    try:
    #        return ((current - previous) / previous) * 100.0
    #    except ZeroDivisionError:
    #        return -1000

    #Calculate percent changes:
    changes = [0]
    for i in range(len(km_silh) - 1):
        j = i + 1
        change = get_change(km_silh[j], km_silh[i])
        changes.append(change)

    #Examine percent changes:
    print("\nPercent changes:")
    for val in changes:
        print(val)

    #Determine which k values are suitable for testing:
    potential_k = []
    for i in range(len(changes)):
        if changes[i] < 1 and i != 0:  #if the silhouette score decreased, or only increased by less than 1% (and it's not the first obs, which always has a 0% change)
            k = i + 1  # + 1 instead of 2 because we want to grab the value before the decrease (or insignificant increase)
            potential_k.append(k)
    print("For %s, the k values to be tested are:" % company_name)
    print(potential_k)
    print("However, in this version, all potential k's 2-20 will be tested")

    ##############################################################################################################################################
    #BTM online training:
    #Bring in the vectorizer to be used for BTM and supply pre-defined stopwords
    #from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(stop_words=stop_words)
    #Vectorize the tweets:
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()

    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)

    #Create a BTM and pass the biterms to train it, per k value in potential_k:
    #from biterm.btm import oBTM
    #import time
    best_k = []
    best_coherence = []

    #Function to perform online BTM training
    #def speedyBTM(num_top, vocabulary, b_terms):
    #    btm = oBTM(num_topics=num_top, V=vocabulary)  #create the btm object
    #    start_time = time.time()
    #    for i in range(0, len(b_terms), 100):  #process chunks of 100 texts
    #        biterms_chunk = biterms[i:i + 100]
    #        btm.fit(biterms_chunk, iterations=50)
    #    topics = btm.transform(biterms)
    #    end_time = time.time()
    #    run_time = end_time - start_time
    #    print("For k = %s topics.." % num_top)
    #    print("BTM online took %s seconds to train" % run_time)
    #    #Examine topic coherence scores:
    #    print("\nTopic Coherence:")
    #    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    total_start = time.time()
    #Train a BTM model on all k's 2-20:
    for k in K:
        lightningBTM(k, vocab, biterms, X)
    total_end = time.time()
    total_time = total_end - total_start
    print("For %s, total BTM estimation run-time was %s" % (company_name, total_time))
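#Hypothetical sketch (not part of the original code): lightningBTM is called above but not
#defined in this snippet. A minimal version modeled on the commented-out speedyBTM helper,
#assuming the signature lightningBTM(num_top, vocabulary, b_terms, term_doc_matrix) and a
#single full-pass fit_transform instead of chunked online fitting:
def lightningBTM(num_top, vocabulary, b_terms, term_doc_matrix):
    btm = oBTM(num_topics=num_top, V=vocabulary)  #create the btm object
    start_time = time.time()
    topics = btm.fit_transform(b_terms, iterations=100)  #train on all biterms in one pass
    run_time = time.time() - start_time
    print("For k = %s topics.." % num_top)
    print("BTM took %s seconds to train" % run_time)
    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, term_doc_matrix, vocabulary, 10)
    return topics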
def estimate_BTM(fpath):
    #Read in the data: the line below will need to be reconfigured for your filepath
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM estimation process for %s" % company_name)

    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets2 = company_data[~filter1].copy()

    ##Mark tweets as OT or IRT:
    #Perform initial separation based on "^@" regex:
    initIRT = [bool(re.search("^@", i)) for i in company_tweets2["Content"]]
    initOT = [not elem for elem in initIRT]
    #print(initOT)
    #Create IRT and OT variables in the data:
    company_tweets2["IRT"] = initIRT
    company_tweets2["OT"] = initOT
    #Fill in NAs under the 'In Reply To' field with "OT":
    company_tweets2["In Reply To"] = company_tweets2["In Reply To"].replace(np.nan, "OT", regex=True)
    #print(company_tweets["In Reply To"].head(5))

    #Clean up the initial OT/IRT separation:
    def cleanSplit(tweets, text1, text2, text3, text4, text5):
        for i in range(len(tweets[text1])):
            if tweets.iloc[i, tweets.columns.get_loc(text2)] == True:  #if the tweet was marked IRT initially
                if tweets.iloc[i, tweets.columns.get_loc(text3)] == tweets.iloc[i, tweets.columns.get_loc(text4)]:  #but it's in reply to the company
                    j = i  #then index our current position so that we may examine the 'next' (technically previous) tweet
                    while tweets.iloc[j, tweets.columns.get_loc(text3)] == tweets.iloc[j, tweets.columns.get_loc(text4)]:  #while this continues to be true
                        j = j + 1  #keep following the chain
                    if tweets.iloc[j, tweets.columns.get_loc(text3)] == "OT":  #if an official tweet is at the end of the chain
                        tweets.iat[i, 19] = False  #then the original tweet is part of an official thread, not true IRT
                        tweets.iat[i, 20] = True
                        #print(tweets.iloc[i, tweets.columns.get_loc(text1)])
        return tweets

    company_tweets3 = cleanSplit(company_tweets2, "Content", "IRT", "In Reply To", "Author", "OT")

    #For this version, extract official tweets only:
    company_tweets = company_tweets3[company_tweets3["OT"] == True].copy()
    #print(company_tweets.shape)
    #print("Break")

    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'")   #replace closing smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'")   #replace opening smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"")  #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"")  #replace closing smart quotes with regular quotes
    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))

    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    #Standardize the textual contents of tweets
    textual_tweets = standardize_text(company_tweets, "Content")
    #Lemmatize the textual contents of tweets:
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))

    #Remove tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    #Remove rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]

    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(r"'", "")  #replace apostrophes in the initial set of stopwords with nothing
    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)
    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")

    #from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stop_words)

    #from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)

    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()

    #Extract the remaining textual contents of tweets:
    cleanGlish_tweets2["clean_tokens"] = [" ".join(tok) for tok in cleanGlish_tweets2["clean_tokens"].values]
    #print(cleanGlish_tweets2["clean_tokens"].head(5))
    #print("Break")
    clean_tweets = cleanGlish_tweets2["clean_tokens"]
    x = vectorizer.fit_transform(clean_tweets)

    #import matplotlib.pyplot as plt
    #from sklearn.cluster import KMeans
    #from sklearn.metrics import silhouette_score
    sum_squared_dists = []
    km_silh = []
    #Set the range of k values to be attempted:
    K = range(3, 21)
    for k in K:
        km = KMeans(n_clusters=k, max_iter=200, n_init=10)
        km = km.fit(x)
        preds = km.predict(x)
        silh = silhouette_score(x, preds)
        sum_squared_dists.append(km.inertia_)
        km_silh.append(silh)

    #Elbow plot (figpath is expected to be defined elsewhere, e.g. as a module-level constant):
    plt.plot(K, sum_squared_dists, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('%s Elbow Method for Optimal k' % company_name)
    #plt.show()
    figpath2 = figpath + str(company_name) + 'elbow.png'
    plt.savefig(figpath2)

    #######################################################################
    #See if silhouette scores are better for determining optimal k
    plt.figure(figsize=(7, 4))
    plt.title("%s Silhouette Scores" % company_name)
    plt.scatter(x=[i for i in range(3, 21)], y=km_silh, s=150, edgecolor='k')
    plt.grid(True)
    plt.xlabel("Number of clusters", fontsize=6)
    plt.ylabel("Silhouette score", fontsize=6)
    plt.xticks([i for i in range(3, 21)], fontsize=8)
    plt.yticks(fontsize=8)
    #plt.show()
    figpath3 = figpath + str(company_name) + 'silhouetteScores.png'
    plt.savefig(figpath3)

    print("\nSilhouette scores:")
    for val in km_silh:
        print(val)

    #Calculate percent changes:
    changes = [0]
    for i in range(len(km_silh) - 1):
        j = i + 1
        change = get_change(km_silh[j], km_silh[i])
        changes.append(change)

    #Examine percent changes:
    print("\nPercent changes:")
    for val in changes:
        print(val)

    #Determine which k values are suitable for testing:
    potential_k = []
    for i in range(len(changes)):
        if changes[i] < 1 and i != 0:  #if the silhouette score decreased, or only increased by less than 1% (and it's not the first obs, which always has a 0% change)
            k = i + 1  # + 1 instead of 2 because we want to grab the value before the decrease (or insignificant increase)
            potential_k.append(k)
    print("For %s, the k values to be tested are:" % company_name)
    print(potential_k)
    print("However, in this version, all potential k's 2-20 will be tested")

    ##############################################################################################################################################
    #BTM online training:
    #Bring in the vectorizer to be used for BTM and supply pre-defined stopwords
    #from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(stop_words=stop_words)
    #Vectorize the tweets:
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()

    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)

    #Create a BTM and pass the biterms to train it, per k value in potential_k:
    #from biterm.btm import oBTM
    #import time
    best_k = []
    best_coherence = []

    total_start = time.time()
    #Train a BTM model on each k in K (all k's 3-20):
    for k in K:
        lightningBTM(k, vocab, biterms, X)
    total_end = time.time()
    total_time = total_end - total_start
    print("For %s, total BTM estimation run-time was %s" % (company_name, total_time))
def _analyze_texts(self):
    vec = CountVectorizer(stop_words='english')
    self.X = vec.fit_transform(self.documents).toarray()
    self.vocab = np.array(vec.get_feature_names())
    self.biterms = vec_to_biterms(self.X)
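# Hypothetical companion method (not in the original): a minimal sketch of how the arrays
# prepared by _analyze_texts could feed an online BTM fit, assuming the same oBTM and
# topic_summuary API used in the other snippets. The names _fit_btm and self.num_topics
# are illustrative assumptions.
def _fit_btm(self, iterations=100):
    btm = oBTM(num_topics=self.num_topics, V=self.vocab)
    self.topics = btm.fit_transform(self.biterms, iterations=iterations)
    topic_summuary(btm.phi_wz.T, self.X, self.vocab, 10)
    return self.topics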
df_cl = pd.DataFrame({
    'message_en': df['message_en'],
    'cluster': km.predict(doc_term_matrix)
})

# %%
df_cl.query('cluster == 1')

# %%
# Biterm topic model
# get biterms
from biterm.utility import vec_to_biterms

vocab = np.array(count_vect.get_feature_names())
biterms = vec_to_biterms(doc_term_matrix[:1000, :])

# %%
from biterm.cbtm import oBTM

btm = oBTM(num_topics=3, V=vocab)
topics = btm.fit_transform(biterms, iterations=100)

# %%
topics.shape

# %%
# Find subjects of sentences
import spacy

nlp = spacy.load("en_core_web_sm")
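# %%
# Hypothetical follow-up cell (not in the original): attach each message's most likely BTM
# topic to the clustered frame, assuming df_cl's rows align with the first 1000 documents
# that were converted to biterms above.
df_cl_btm = df_cl.iloc[:1000].copy()
df_cl_btm['btm_topic'] = topics.argmax(axis=1)
df_cl_btm.head()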
def perform_BTM(fpath, num_top):
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)

    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets = company_data[~filter1].copy()

    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'")   #replace closing smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'")   #replace opening smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"")  #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"")  #replace closing smart quotes with regular quotes
    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))

    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    #Perform standardization on the textual contents of the company's tweets:
    #No longer keeping newline chars in text; double spaces are replaced with single spaces; hashtag symbols are now kept
    #def standardize_text(df, text_field):
    #    df[text_field] = df[text_field].str.replace(r".", "")        #remove periods so acronyms count as one word
    #    df[text_field] = df[text_field].str.replace(r"&", "and")     #replace ampersands with 'and'
    #    df[text_field] = df[text_field].str.replace(r"http\S+", "")  #remove links
    #    df[text_field] = df[text_field].str.replace(r"http", "")     #ensure all links have been removed
    #    df[text_field] = df[text_field].str.replace(r"@\S+", "")     #remove @username mentions
    #    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?#@\'\`\"\_]", " ")  #replace anything that isn't a letter, number, or one of the listed symbols with a space
    #    df[text_field] = df[text_field].str.replace(r"@", "at")      #replace any remaining '@' symbols with 'at'
    #    df[text_field] = df[text_field].str.lower()                  #convert all remaining text to lowercase
    #    df[text_field] = df[text_field].str.replace(r"\s+", " ")     #replace double spaces with a single space
    #    return df
    textual_tweets = standardize_text(company_tweets, "Content")
    #Examine tweets after standardization has been performed:
    #print(textual_tweets["Content"].head(5))

    #Perform lemmatization on the textual contents of the tweets:
    ##!Code for this function derived from the following link: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    #from textblob import TextBlob, Word
    #def lem_with_postag(df, text_field):
    #    tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    #    output = []
    #    for tweet in df[text_field]:
    #        sent = TextBlob(tweet)
    #        words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    #        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    #        lemTweet = " ".join(lemmatized_list)
    #        output.append(lemTweet)
    #    return output
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))

    #Remove tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    #Remove rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]

    #Creating tokens:
    #from nltk.tokenize import RegexpTokenizer
    #tokenizer = RegexpTokenizer(r'\w+')
    #cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)

    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(r"'", "")  #replace apostrophes in the initial set of stopwords with nothing
    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)
    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")

    ##!!This doesn't seem to be compatible with BTM!
    #filtered_toks = []
    #Filter out the stop words:
    #for w in cleanGlish_tweets["tokens"]:  #tweet tokens
    #    for j in w:  #word tokens within each tweet
    #        if j not in stop_words:
    #            filtered_toks.append(j)

    ##!Seems possible that I need to filter out tweets with less than 3 words remaining for the below to work:
    #from nltk.tokenize import RegexpTokenizer
    #tokenizer = RegexpTokenizer(r'\w+')
    #cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    #print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets" % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    #cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    #print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))
    #print("Breakpoint")

    ##Vectorize the cleaned tweets
    #from sklearn.feature_extraction.text import CountVectorizer
    #Filter out stopwords here:
    vec = CountVectorizer(stop_words=stop_words)
    ##Seems that a potential problem above is that I'm filtering out tweets w/ less than 3 words before stopword removal:
    ##Thus making it possible that tweets with fewer than 3 counting words are being fed to the model
    ##!!I think I can supply my own set of stopwords above, rather than use CountVectorizer's pre-defined set
    #print("Stop words:")
    #for word in vec.stop_words:
    #    print(word)
    #Save CountVectorizer's set of stop words:
    #stop_words = [word for word in vec.stop_words]
    #print("Stop words variable:")
    #for word in stop_words:
    #    print(word)

    ##Filter out tweets w/ less than 3 words after stop word removal:
    #def clean_tokenize(df, text_field, stop_set):
    #    output = []
    #    for tweet in df[text_field]:
    #        clean_toks = []
    #        for tok in tweet:
    #            if tok not in stop_set:
    #                clean_toks.append(tok)
    #        output.append(clean_toks)
    #    return output
    #from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)
    #print("Token differences:")
    #print(cleanGlish_tweets["tokens"].head(5))
    #print(cleanGlish_tweets["clean_tokens"].head(5))
    print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets" % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))
    #Determine if filtering out tweets with less than 3 words after stop word removal makes a difference
    #cleanGlish_tweets["num_words2"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets3 = cleanGlish_tweets[cleanGlish_tweets["num_words2"] >= 3].copy()
    #print("Originally, cleanGlish2 would have had %s tweets" % len(cleanGlish_tweets3["Content"]))
    #print("Breakpoint")

    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    #print("X looks like:")
    #print(X)

    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary
    vocab = np.array(vec.get_feature_names())
    #print("Vocab is:")
    #print(vocab)
    biterms = vec_to_biterms(X)
    #print("Biterms look like:")
    #print(biterms)
    #print("The non-zero parameter we're passing looks like:")
    #print(np.count_nonzero(X, axis=1))
    #print("The sum parameter we're passing in looks like:")
    #print(np.sum(X, axis=0))
    #print("Breakpoint")

    #Create a BTM and pass the biterms to train it:
    #from biterm.btm import oBTM
    #import time
    start_time = time.time()
    #random.seed(1)
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    end_time = time.time()
    run_time = end_time - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)
    #print("First parameter:")
    #print(btm.phi_wz.T)
    #print("Topics:")
    #print(topics)

    ##See if formatting the data in the following manner allows pyLDAvis.prepare to work:
    #Visualize the topics:
    #If HTML(vis) doesn't work, see the following link:
    #Link: https://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/
    #import pyLDAvis
    ##!This isn't working for some reason
    #vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    #pyLDAvis.display(vis)
    #pyLDAvis.show(vis)
    #from IPython.core.display import HTML
    #HTML(vis)

    #Assign each tweet its most likely topic:
    #cleanGlish_tweets2["topic"] = topics.argmax()
    cleanGlish_tweets2["topic"] = [str(topics[i].argmax()) for i in range(len(cleanGlish_tweets2["Content"]))]
    #print("\nTweets and Topics:")
    #for i in range(len(cleanGlish_tweets2["Content"])):
    #    print("{} (topic: {})".format(cleanGlish_tweets2.iloc[i, cleanGlish_tweets2.columns.get_loc("Content")], topics[i].argmax()))
    #    cleanGlish_tweets2.iat[i, 22] = topics[i].argmax()

    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    #Save the tweet topics (respath and resEnding are expected to be defined elsewhere, e.g. as module-level constants):
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)
def perform_BTM(fpath, num_top):
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)

    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets2 = company_data[~filter1].copy()

    ##Designate tweets as 'OT' or 'IRT' prior to removing more tweets or altering tweet contents
    #Perform initial separation based on "^@" regex:
    initIRT = [bool(re.search("^@", i)) for i in company_tweets2["Content"]]
    initOT = [not elem for elem in initIRT]
    #print(initOT)
    #Create IRT and OT variables in the data:
    company_tweets2["IRT"] = initIRT
    company_tweets2["OT"] = initOT
    #Fill in NAs under the 'In Reply To' field with "OT":
    company_tweets2["In Reply To"] = company_tweets2["In Reply To"].replace(np.nan, "OT", regex=True)
    #print(company_tweets["In Reply To"].head(5))

    #Call function to improve on the initial OT vs. IRT splits:
    company_tweets3 = cleanSplit(company_tweets2, "Content", "IRT", "In Reply To", "Author", "OT")

    #For this version, extract IRT tweets only:
    company_tweets = company_tweets3[company_tweets3["IRT"] == True].copy()
    #print(company_tweets.shape)
    #print("Break")

    #Create a column so the original tweet contents aren't totally lost after textual pre-processing
    company_tweets["Content2"] = company_tweets["Content"]

    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'")   #replace closing smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'")   #replace opening smart apostrophes with regular apostrophes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"")  #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"")  #replace closing smart quotes with regular quotes
    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))

    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    #Standardize the textual contents of tweets:
    textual_tweets = standardize_text(company_tweets, "Content")
    #Perform lemmatization on the textual contents of tweets:
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))

    #Remove tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    #Remove rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]

    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(r"'", "")  #replace apostrophes in the initial set of stopwords with nothing
    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)
    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")

    ##Vectorize the cleaned tweets
    #from sklearn.feature_extraction.text import CountVectorizer
    #Filter out stopwords here:
    vec = CountVectorizer(stop_words=stop_words)

    #Tokenize tweet contents:
    tokenizer = RegexpTokenizer(r'\w+')
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)
    #print("Token differences:")
    #print(cleanGlish_tweets["tokens"].head(5))
    #print(cleanGlish_tweets["clean_tokens"].head(5))
    print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets" % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))
    #Determine if filtering out tweets with less than 3 words after stop word removal makes a difference
    #cleanGlish_tweets["num_words2"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets3 = cleanGlish_tweets[cleanGlish_tweets["num_words2"] >= 3].copy()
    #print("Originally, cleanGlish2 would have had %s tweets" % len(cleanGlish_tweets3["Content"]))
    #print("Breakpoint")

    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    #print("X looks like:")
    #print(X)

    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary
    vocab = np.array(vec.get_feature_names())
    #print("Vocab is:")
    #print(vocab)
    biterms = vec_to_biterms(X)
    #print("Biterms look like:")
    #print(biterms)
    #print("The non-zero parameter we're passing looks like:")
    #print(np.count_nonzero(X, axis=1))
    #print("The sum parameter we're passing in looks like:")
    #print(np.sum(X, axis=0))
    #print("Breakpoint")

    #Create a BTM and pass the biterms to train it:
    #from biterm.btm import oBTM
    #import time
    start_time = time.time()
    #random.seed(1)
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    end_time = time.time()
    run_time = end_time - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)
    #print("First parameter:")
    #print(btm.phi_wz.T)
    #print("Topics:")
    #print(topics)

    ##See if formatting the data in the following manner allows pyLDAvis.prepare to work:
    #Visualize the topics:
    #If HTML(vis) doesn't work, see the following link:
    #Link: https://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/
    #import pyLDAvis
    ##!This isn't working for some reason
    #vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    #pyLDAvis.display(vis)
    #pyLDAvis.show(vis)
    #from IPython.core.display import HTML
    #HTML(vis)

    #Assign each tweet its most likely topic:
    #cleanGlish_tweets2["topic"] = topics.argmax()
    cleanGlish_tweets2["topic"] = [topics[i].argmax() for i in range(len(cleanGlish_tweets2["Content"]))]
    #print("\nTweets and Topics:")
    #for i in range(len(cleanGlish_tweets2["Content"])):
    #    print("{} (topic: {})".format(cleanGlish_tweets2.iloc[i, cleanGlish_tweets2.columns.get_loc("Content")], topics[i].argmax()))
    #    cleanGlish_tweets2.iat[i, 22] = topics[i].argmax()

    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    #Save the tweet topics (respath and resEnding are expected to be defined elsewhere, e.g. as module-level constants):
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)
def format_data(self):
    self.data = self.vectorizer.fit_transform(self.df['cleaned_text']).toarray()
    self.dictionary = np.array(self.vectorizer.get_feature_names())
    self.corpus = vec_to_biterms(self.data)
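# Hypothetical usage sketch (not in the original): once format_data has populated self.data,
# self.dictionary and self.corpus, a BTM could be fit on them with the same oBTM API used
# elsewhere in this file. The method name fit and the default num_topics=10 are illustrative
# assumptions.
def fit(self, num_topics=10, iterations=100):
    btm = oBTM(num_topics=num_topics, V=self.dictionary)
    self.topics = btm.fit_transform(self.corpus, iterations=iterations)
    return self.topics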