Example #1
    def compute_values(self, kmin, kmax, kstep):
        # vectorize the documents
        vec = CountVectorizer()
        X = vec.fit_transform(self.docs).toarray()  # dense array, as vec_to_biterms expects
        
        # get vocabulary and biterms from docs
        vocab = np.array(vec.get_feature_names())
        biterms = vec_to_biterms(X)

        # create a BTM and pass the biterms to train it
        btm = oBTM(num_topics=20, V=vocab)
        topics = btm.fit_transform(biterms, iterations=100)
        topic_summuary(btm.phi_wz.T, X, vocab, 10)
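Note that kmin, kmax and kstep go unused in the excerpt above; presumably the full class uses them to sweep a range of topic counts, much as main() does in Example #3 further down. A hypothetical sketch of such a sweep (not part of the original class; imports omitted as in the original) could look like:

    def compute_values_sweep(self, kmin, kmax, kstep):
        # Hypothetical sketch only: fit one BTM per topic count k in [kmin, kmax)
        # with step kstep, mirroring the per-k loop in Example #3.
        vec = CountVectorizer()
        X = vec.fit_transform(self.docs).toarray()
        vocab = np.array(vec.get_feature_names())
        biterms = vec_to_biterms(X)
        for k in range(kmin, kmax, kstep):
            btm = oBTM(num_topics=k, V=vocab)
            btm.fit_transform(biterms, iterations=100)
            topic_summuary(btm.phi_wz.T, X, vocab, 10)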

Example #2
def biterm_topic_model_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Biterm Topic Model (BTM).

    :return: None.
    """
    # BTM, like LDA, uses raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info(f"\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(f"{tf}\n")
    log.info(f"\n.get_feature_names - Array mapping from feature integer indices to feature name")
    log.info(f"{tf_feature_names}\n")

    # Convert corpus of documents (vectorized text) to numpy array.
    tf_array = tf.toarray()

    # Convert dictionary of words (vocabulary) to numpy array.
    tf_feature_names = np.array(tf_vectorizer.get_feature_names())

    # get biterms
    biterms = vec_to_biterms(tf_array)

    # create btm
    btm = oBTM(num_topics=20, V=tf_feature_names)

    print("\n\n Train Online BTM ..")
    for i in range(0, len(biterms), 100):  # process chunks of 100 texts
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    time.sleep(3)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(tf_array, axis=1), tf_feature_names, np.sum(tf_array, axis=0))
    # pyLDAvis.save_html(vis, './vis/online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, tf_array, tf_feature_names, 10)

    print("\n\n Texts & Topics ..")
    for i in range(1, 10):
        print("{} (topic: {})".format(slo_feature_series[i], topics[i].argmax()))
Example #3
def main(start, end, increment):
    path = Path('C:/Data/Python/JobLoss')
    data_words = []
    with open(path / 'Processed.json') as f:
        data = json.load(f)
        for tweet in data:
            data_words.append(' '.join(tweet[1]))
    vec = CountVectorizer()
    X = vec.fit_transform(data_words).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)
    for k in range(start, end, increment):
        print('Model %s' % k)
        btm = oBTM(num_topics=k, V=vocab)
        for i in range(0, len(biterms), chunksize):  # chunksize and iterations are defined outside this excerpt
            print('%s / %s' % (i, len(biterms)))
            biterms_chunk = biterms[i:i + chunksize]
            btm.fit(biterms_chunk, iterations=iterations)
        topics = btm.transform(biterms)
        vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X,
                                                                      axis=1),
                               vocab, np.sum(X, axis=0))
        pyLDAvis.save_html(
            vis, str(path / ('Visualizations/BTMVisualization%s.html' % k)))
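As a usage sketch for the function above (assuming the module-level chunksize and iterations noted in the loop; the values here are simply borrowed from the chunked training in Example #2), a call like the following would fit and visualize models with 5, 10, 15, 20 and 25 topics:

chunksize = 100    # assumed module-level settings, not defined in the excerpt above
iterations = 50
main(5, 30, 5)     # writes BTMVisualization5.html ... BTMVisualization25.html under C:/Data/Python/JobLoss/Visualizations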
Example #4
import numpy as np
import pyLDAvis  # only needed for the commented-out visualization below
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary

if __name__ == "__main__":

    texts = open('./data/reuters.titles').read().splitlines()[:50]

    # vectorize texts
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=20, V=vocab)

    print("\n\n Train BTM ..")
    topics = btm.fit_transform(biterms, iterations=100)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    # pyLDAvis.save_html(vis, './vis/simple_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    print("\n\n Texts & Topics ..")
Example #5
def estimate_BTM(fpath, arr):
    #Read in the data; the line below will need to be reconfigured for your filepath
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM estimation process for %s" % company_name)
    
    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets = company_data[~filter1].copy()
    #company_tweets2 = company_data[~filter1].copy()
    
    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'") #replace closing smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'") #replace opening smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"") #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"") #replace closing smart quotes with regular quotes
    
    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))
    
    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    
    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")
    
    #Perform standardization on the textual contents of the company's tweets:
    #No longer keep newline chars in text, replace double spaces with spaces, now keeping hashtag symbols themselves
    #def standardize_text(df, text_field):
    #    df[text_field] = df[text_field].str.replace(r".", "") #remove/replace periods w/ nothing. Should now count acronyms as one word
    #    df[text_field] = df[text_field].str.replace(r"&", "and") #replace ampersands with 'and'
    #    df[text_field] = df[text_field].str.replace(r"http\S+", "") #remove links and replace w/ nothing
    #    df[text_field] = df[text_field].str.replace(r"http", "") #ensure all links have been removed
    #    df[text_field] = df[text_field].str.replace(r"@\S+", "") #remove @username mentions and replace with nothing
    #    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?#@\'\`\"\_]", " ")#Remove/replace anything that's not capital/lowercase letter, number, parentheses, comma, or any of the following symbols with a space
    #    df[text_field] = df[text_field].str.replace(r"@", "at") #replace any remaining '@' symbols with 'at'
    #    df[text_field] = df[text_field].str.lower() #convert all remaining text to lowercase
    #    #remove double spaces and replace with single space
    #    df[text_field] = df[text_field].str.replace(r"\s+", " ")
    #    return df
    
    textual_tweets = standardize_text(company_tweets, "Content")
    
    #Examine tweets after standardization has been performed:
    #print(textual_tweets["Content"].head(5))
    
    #Perform lemmatization on the textual contents of the tweets:
    ##! Code for this function derived from the following link: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    #from textblob import TextBlob, Word
    
    #def lem_with_postag(df, text_field):
    #    tag_dict = {"J": 'a',
    #                "N": 'n',
    #                "V": 'v',
    #                "R": 'r'}
    #    output = []
    #    for tweet in df[text_field]:
    #        sent = TextBlob(tweet)
    #        words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    #        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    #        lemTweet = " ".join(lemmatized_list)
    #        output.append(lemTweet)
    #    return output
    
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))
    
    #Removing tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    
    #Removing rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]
    
    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    
        
    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(r"'", "") #replace apostrophes in initial set of stopwords with nothing
    
    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)
            
    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")
    
    #from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    
    ##Filter out tweets w/ less than 3 words after stop word removal:
    #def clean_tokenize(df, text_field, stop_set):
    #    output = []
    #    for tweet in df[text_field]:
    #        clean_toks = []
    #        for tok in tweet:
    #            if tok not in stop_set:
    #                clean_toks.append(tok)
    #        output.append(clean_toks)
    #    return output
    
    
    #from nltk.tokenize import RegexpTokenizer
        
    tokenizer = RegexpTokenizer(r'\w+')
        
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)
    
    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    
    #Extract the remaining textual contents of tweets:
    #clean_tokens = cleanGlish_tweets2["clean_tokens"]
    #Doesn't hurt to examine some of them:
    #print(cleanGlish_tweets2["clean_tokens"].head(5))
    #print("Break")
    
    #x = vectorizer.fit_transform(clean_tokens)
    #x = vectorizer.fit_transform(cleanGlish_tweets2["clean_tokens"])
    #x = vectorizer.fit_transform(str(clean_tokens))
    #clean_tokens = [clean_tokens]
    #x = vectorizer.fit_transform(clean_tokens)
    #x = vectorizer.fit_transform(str(clean_tokens))
    #x = vectorizer.fit_transform(cleanGlish_tweets2["clean_tokens"].str)
    cleanGlish_tweets2["clean_tokens"] = [" ".join(tok) for tok in cleanGlish_tweets2["clean_tokens"].values]
    #print(cleanGlish_tweets2["clean_tokens"].head(5))
    #print("Break")
    clean_tweets = cleanGlish_tweets2["clean_tokens"]
    x = vectorizer.fit_transform(clean_tweets)
    
    
    #import matplotlib.pyplot as plt
    #from sklearn.cluster import KMeans
    #from sklearn.metrics import silhouette_score
    
    sum_squared_dists = []
    km_silh = []
    #Range of k values to try; kept at the full 2-20 range since the lightning BTM runs are fast enough
    K = range(2, 21)
    
    for k in K:
        km = KMeans(n_clusters=k, max_iter=200, n_init=10)
        km = km.fit(x)
        preds = km.predict(x)
        silh = silhouette_score(x, preds)
        sum_squared_dists.append(km.inertia_)
        km_silh.append(silh)
        
    plt.plot(K, sum_squared_dists, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('%s Elbow Method for Optimal k' % company_name)
    #plt.show()
    for i in range(len(K)):
        label = "{:.2f}".format(arr[i])
        plt.annotate(label,
                     (K[i], sum_squared_dists[i]),
                     textcoords = "offset points",
                     xytext = (3, 5),
                     ha='center',
                     fontsize=5)    
    figpath2 = figpath + str(company_name) + 'elbow.png'
    plt.savefig(figpath2)
    
    #######################################################################
    #See if silhouette scores are better for determining optimal k
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler()
    #Actually, think this can all be done above as well
    
    plt.figure(figsize=(7,4))
    plt.title("%s Silhouette Scores" % company_name)
    plt.scatter(x=[i for i in range(2,21)],y=km_silh,s=150,edgecolor='k')
    plt.grid(True)
    plt.xlabel("Number of clusters",fontsize=6)
    plt.ylabel("Silhouette score",fontsize=6)
    plt.xticks([i for i in range(2,21)],fontsize=8)
    plt.yticks(fontsize=8)
    #plt.show()
    for i in range(len(K)):
        label = "{:.2f}".format(arr[i])
        plt.annotate(label,
                     (K[i], km_silh[i]),
                     textcoords = "offset points",
                     xytext = (0, 9),
                     ha='center',
                     fontsize=6)    
    figpath3 = figpath + str(company_name) + 'silhouetteScores.png'
    plt.savefig(figpath3)
    
    #plt.figure(figsize=(10,10))
    #plt.title("%s Silhouette Scores" % company_name)
    #plt.xlabel('k')
    #plt.ylabel('Silhouette Score')
    #plt.scatter(x=rangemax(K)), y = km_silh)
    #plt.scatter(x=[i for i in range(2, np.max(K))], y = km_silh)
    #figpath4 = figpath + str(company_name) + '_silhouetteScoresAgain.png'
    #plt.savefig(figpath4)
                
    
    
    print("\nSilhouette scores:")
    for val in km_silh:
        print(val)
    
    #Function to calculate percent change in silhouette scores
    #Code derived from: https://stackoverflow.com/questions/30926840/how-to-check-change-between-two-values-in-percent
    #def get_change(current, previous):
    #    if current == previous:
    #        return 0
    #    try:
    #        return ((current - previous) / previous) * 100.0
    #    except ZeroDivisionError:
    #        return -1000
    
    #Calculate percent changes:
    changes = [0]
    for i in range(len(km_silh) - 1):
        j = i + 1
        change = get_change(km_silh[j], km_silh[i])
        changes.append(change)
    
    #Examine percent changes:
    print("\nPercent changes:")
    for val in changes:
        print(val)
        
    #Determine which k values are suitable for testing:
    potential_k = []
    
    for i in range(len(changes)):
        if changes[i] < 1 and i != 0: #if the silhouette score decreased, or only increased by less than 1% (and it's not the first obs, which always has 0% increase)
            k = i + 1 # + 1 instead of 2 because we want to grab the value before the decrease (or insignificant increase)
            potential_k.append(k)
    
    print("For %s, the k values to be tested are:" % company_name)
    print(potential_k)
    print("However, in this version, all potential k's 2-20 will be tested")
    
    
    ##############################################################################################################################################
    #BTM online training:
    
    #Bring in the vectorizer to be used for BTM and supply pre-defined stopwords
    #from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(stop_words=stop_words)
    
    #Vectorize the tweets:
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    
    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary
    
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)
    
    #Create a BTM and pass the biterms to train it, per k value in potential_k:
    #from biterm.btm import oBTM
    #import time
    best_k = []
    best_coherence = []
    
    #Function to perform online BTM training
    #def speedyBTM(num_top, vocabulary, b_terms):
    #    btm = oBTM(num_topics=num_top, V=vocabulary) #create the btm object
    #    start_time = time.time()
    #    for i in range(0, len(b_terms), 100): #process chunks of 200 texts
    #        biterms_chunk = biterms[i:i + 100]
    #        btm.fit(biterms_chunk, iterations=50)
    #    topics = btm.transform(biterms)
    #    end_time = time.time()
    #    run_time = end_time - start_time
    #    print("For k = %s topics.." % num_top)
    #    print("BTM online took %s seconds to train" % run_time)
    #    #Examine topic coherence scores:
    #    print("\nTopic Coherence:")
    #    topic_summuary(btm.phi_wz.T, X, vocab, 10)
    
    
    total_start = time.time()
    #Train a BTM model for every k in K (2-20), using lightningBTM (a helper defined elsewhere in the module,
    #similar to the commented-out speedyBTM above):
    for k in K:
        lightningBTM(k, vocab, biterms, X)
        
    total_end = time.time()
    total_time = total_end - total_start
    print("For %s, total BTM estimation run-time was %s" % (company_name, total_time))
Example #6
def estimate_BTM(fpath):
    #Read in the data; the line below will need to be reconfigured for your filepath
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[
        0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM estimation process for %s" % company_name)

    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets2 = company_data[~filter1].copy()

    ##Mark tweets as OT or IRT:
    #Perform initial separation based on "^@" regex:
    initIRT = [bool(re.search("^@", i)) for i in company_tweets2["Content"]]
    initOT = [not elem for elem in initIRT]
    #print(initOT)

    #Create IRT and OT variables in the data:
    company_tweets2["IRT"] = initIRT
    company_tweets2["OT"] = initOT

    #Fill in NAs under the 'In Reply To' field with "OT":
    company_tweets2["In Reply To"] = company_tweets2["In Reply To"].replace(
        np.nan, "OT", regex=True)

    #print(company_tweets["In Reply To"].head(5))

    #Clean up initial OT/IRT separation:
    def cleanSplit(tweets, text1, text2, text3, text4, text5):
        for i in range(len(tweets[text1])):
            #if the tweet was marked IRT initially
            if tweets.iloc[i, tweets.columns.get_loc(text2)] == True:
                #but it's in reply to the company
                if tweets.iloc[i, tweets.columns.get_loc(text3)] == tweets.iloc[i, tweets.columns.get_loc(text4)]:
                    #then index our current position so that we may examine the 'next' (technically previous) tweet
                    j = i
                    #while this continues to be true, keep following the chain
                    while tweets.iloc[j, tweets.columns.get_loc(text3)] == tweets.iloc[j, tweets.columns.get_loc(text4)]:
                        j = j + 1
                    #if an official tweet is at the end of the chain,
                    #then the original tweet is part of an official thread, not true IRT
                    if tweets.iloc[j, tweets.columns.get_loc(text3)] == "OT":
                        tweets.iat[i, 19] = False
                        tweets.iat[i, 20] = True
                        #print(tweets.iloc[i, tweets.columns.get_loc(text1)])
        return tweets

    company_tweets3 = cleanSplit(company_tweets2, "Content", "IRT",
                                 "In Reply To", "Author", "OT")

    #For this version, extract official tweets only:
    company_tweets = company_tweets3[company_tweets3["OT"] == True].copy()
    #print(company_tweets.shape)
    #print("Break")

    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"’", "'")  #replace closing smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"‘", "'")  #replace opening smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"“", "\"")  #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"”", "\"")  #replace closing smart quotes with regular quotes

    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))

    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"'s", "")

    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    #Standardize the textual contents of tweets
    textual_tweets = standardize_text(company_tweets, "Content")

    #Lemmatize the textual contents of tweets:
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))

    #Removing tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]

    #Removing rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]

    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))

    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(
        r"'",
        "")  #replace apostrophes in initial set of stopwords with nothing

    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)

    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")

    #from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stop_words)

    #from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'\w+')

    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(
        tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets,
                                                       "tokens", stop_words)

    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [
        len(token) for token in cleanGlish_tweets["clean_tokens"]
    ]
    cleanGlish_tweets2 = cleanGlish_tweets[
        cleanGlish_tweets["num_words"] >= 3].copy()

    #Extract the remaining textual contents of tweets:
    cleanGlish_tweets2["clean_tokens"] = [
        " ".join(tok) for tok in cleanGlish_tweets2["clean_tokens"].values
    ]
    #print(cleanGlish_tweets2["clean_tokens"].head(5))
    #print("Break")
    clean_tweets = cleanGlish_tweets2["clean_tokens"]
    x = vectorizer.fit_transform(clean_tweets)

    #import matplotlib.pyplot as plt
    #from sklearn.cluster import KMeans
    #from sklearn.metrics import silhouette_score

    sum_squared_dists = []
    km_silh = []

    #Set range of k values desired to be attempted:
    K = range(3, 21)

    for k in K:
        km = KMeans(n_clusters=k, max_iter=200, n_init=10)
        km = km.fit(x)
        preds = km.predict(x)
        silh = silhouette_score(x, preds)
        sum_squared_dists.append(km.inertia_)
        km_silh.append(silh)

    plt.plot(K, sum_squared_dists, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('%s Elbow Method for Optimal k' % company_name)
    #plt.show()
    figpath2 = figpath + str(company_name) + 'elbow.png'
    plt.savefig(figpath2)

    #######################################################################
    #See if silhouette scores are better for determining optimal k

    plt.figure(figsize=(7, 4))
    plt.title("%s Silhouette Scores" % company_name)
    plt.scatter(x=[i for i in range(3, 21)], y=km_silh, s=150, edgecolor='k')
    plt.grid(True)
    plt.xlabel("Number of clusters", fontsize=6)
    plt.ylabel("Silhouette score", fontsize=6)
    plt.xticks([i for i in range(3, 21)], fontsize=8)
    plt.yticks(fontsize=8)
    #plt.show()
    figpath3 = figpath + str(company_name) + 'silhouetteScores.png'
    plt.savefig(figpath3)

    print("\nSilhouette scores:")
    for val in km_silh:
        print(val)

    #Calculate percent changes:
    changes = [0]
    for i in range(len(km_silh) - 1):
        j = i + 1
        change = get_change(km_silh[j], km_silh[i])
        changes.append(change)

    #Examine percent changes:
    print("\nPercent changes:")
    for val in changes:
        print(val)

    #Determine which k values are suitable for testing:
    potential_k = []

    for i in range(len(changes)):
        #if the silhouette score decreased, or only increased by less than 1% (and it's not the first obs, which always has a 0% change)
        if changes[i] < 1 and i != 0:
            k = i + 1  # + 1 instead of 2 because we want to grab the value before the decrease (or insignificant increase)
            potential_k.append(k)

    print("For %s, the k values to be tested are:" % company_name)
    print(potential_k)
    print("However, in this version, all potential k's 2-20 will be tested")

    ##############################################################################################################################################
    #BTM online training:

    #Bring in the vectorizer to be used for BTM and supply pre-defined stopwords
    #from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(stop_words=stop_words)

    #Vectorize the tweets:
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()

    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary

    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)

    #Create a BTM and pass the biterms to train it, per k value in potential_k:
    #from biterm.btm import oBTM
    #import time
    best_k = []
    best_coherence = []

    total_start = time.time()
    #Train a BTM model for every k in K (3-20), using lightningBTM (a helper defined elsewhere in the module):
    for k in K:
        lightningBTM(k, vocab, biterms, X)

    total_end = time.time()
    total_time = total_end - total_start
    print("For %s, total BTM estimation run-time was %s" %
          (company_name, total_time))
Example #7
    def _analyze_texts(self):
        vec = CountVectorizer(stop_words='english')
        self.X = vec.fit_transform(self.documents).toarray()
        self.vocab = np.array(vec.get_feature_names())
        self.biterms = vec_to_biterms(self.X)
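The fragment above only prepares the model inputs; presumably the surrounding class then trains a BTM on them along the lines of the other examples. A hypothetical companion method (the name _train_btm and its defaults are invented here for illustration) might look like:

    def _train_btm(self, num_topics=20, iterations=100):
        # Hypothetical companion to _analyze_texts(): train an online BTM on the
        # prepared biterms and report topic coherence, as in the examples above.
        self.btm = oBTM(num_topics=num_topics, V=self.vocab)
        self.topics = self.btm.fit_transform(self.biterms, iterations=iterations)
        topic_summuary(self.btm.phi_wz.T, self.X, self.vocab, 10)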
Example #8
df_cl = pd.DataFrame({
    'message_en': df['message_en'],
    'cluster': km.predict(doc_term_matrix)
})
# %%
df_cl.query('cluster ==1')

# %%

# Biterm topic model

# get biterms
from biterm.utility import vec_to_biterms

vocab = np.array(count_vect.get_feature_names())
biterms = vec_to_biterms(doc_term_matrix[:1000, :])
# %%

from biterm.cbtm import oBTM

btm = oBTM(num_topics=3, V=vocab)
topics = btm.fit_transform(biterms, iterations=100)
# %%
topics.shape
# %%

# Find subjects of sentences

import spacy

nlp = spacy.load("en_core_web_sm")

Example #9
def perform_BTM(fpath, num_top):
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[
        0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)

    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets = company_data[~filter1].copy()

    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"’", "'")  #replace closing smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"‘", "'")  #replace opening smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"“", "\"")  #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"”", "\"")  #replace closing smart quotes with regular quotes

    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))

    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"'s", "")

    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    #Perform standardization on the textual contents of the company's tweets:
    #No longer keep newline chars in text, replace double spaces with spaces, now keeping hashtag symbols themselves
    #def standardize_text(df, text_field):
    #    df[text_field] = df[text_field].str.replace(r".", "") #remove/replace periods w/ nothing. Should now count acronyms as one word
    #    df[text_field] = df[text_field].str.replace(r"&", "and") #replace ampersands with 'and'
    #    df[text_field] = df[text_field].str.replace(r"http\S+", "") #remove links and replace w/ nothing
    #    df[text_field] = df[text_field].str.replace(r"http", "") #ensure all links have been removed
    #    df[text_field] = df[text_field].str.replace(r"@\S+", "") #remove @username mentions and replace with nothing
    #    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?#@\'\`\"\_]", " ")#Remove/replace anything that's not capital/lowercase letter, number, parentheses, comma, or any of the following symbols with a space
    #    df[text_field] = df[text_field].str.replace(r"@", "at") #replace any remaining '@' symbols with 'at'
    #    df[text_field] = df[text_field].str.lower() #convert all remaining text to lowercase
    #    #remove double spaces and replace with single space
    #    df[text_field] = df[text_field].str.replace(r"\s+", " ")
    #    return df

    textual_tweets = standardize_text(company_tweets, "Content")

    #Examine tweets after standardization has been performed:
    #print(textual_tweets["Content"].head(5))

    #Perform lemmatization on the textual contents of the tweets:
    ##! Code for this function derived from the following link: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    #from textblob import TextBlob, Word

    #def lem_with_postag(df, text_field):
    #    tag_dict = {"J": 'a',
    #                "N": 'n',
    #                "V": 'v',
    #                "R": 'r'}
    #    output = []
    #    for tweet in df[text_field]:
    #        sent = TextBlob(tweet)
    #        words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    #        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    #        lemTweet = " ".join(lemmatized_list)
    #        output.append(lemTweet)
    #    return output

    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))

    #Removing tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]

    #Removing rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]

    #Creating tokens:
    #from nltk.tokenize import RegexpTokenizer

    #tokenizer = RegexpTokenizer(r'\w+')

    #cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)

    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))

    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(
        r"'",
        "")  #replace apostrophes in initial set of stopwords with nothing

    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)

    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")

    ##!!This doesn't seem to be compatible with BTM!
    #filtered_toks = []

    #Filter out the stop words:
    #for w in cleanGlish_tweets["tokens"]: #tweet tokens
    #    for j in w: #word tokens within each tweet
    #        if j not in stop_words:
    #            filtered_toks.append(j)

    ##!Seems possible that I need to filter out tweets with less than 3 words remaining for below to work:
    #from nltk.tokenize import RegexpTokenizer

    #tokenizer = RegexpTokenizer(r'\w+')

    #cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    #print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets" % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    #cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    #print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))
    #print("Breakpoint")

    ##Vectorize the cleaned tweets
    #from sklearn.feature_extraction.text import CountVectorizer

    #Filter out stopwords here:
    vec = CountVectorizer(stop_words=stop_words)
    ##Seems that a potential problem above is that I'm filtering out tweets w/ less than 3 words before stopword removal:
    ##Thus, making it possible that tweets with less than 3 counting words are being fed to the model
    ##!!I think I can supply my own set of stopwords above, rather than use CountVectorizer's pre-defined set
    #print("Stop words:")
    #for word in vec.stop_words:
    #    print(word)

    #Save CountVectorizer's set of stop words:
    #stop_words = [word for word in vec.stop_words]
    #print("Stop words variable:")
    #for word in stop_words:
    #    print(word)

    ##Filter out tweets w/ less than 3 words after stop word removal:
    #def clean_tokenize(df, text_field, stop_set):
    #    output = []
    #    for tweet in df[text_field]:
    #        clean_toks = []
    #        for tok in tweet:
    #            if tok not in stop_set:
    #                clean_toks.append(tok)
    #        output.append(clean_toks)
    #    return output

    #from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'\w+')

    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(
        tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets,
                                                       "tokens", stop_words)

    #print("Token differences:")
    #print(cleanGlish_tweets["tokens"].head(5))
    #print(cleanGlish_tweets["clean_tokens"].head(5))

    print(
        "Before filtering out tweets with fewer than 3 words, cleanGlish has %s tweets"
        % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [
        len(token) for token in cleanGlish_tweets["clean_tokens"]
    ]
    cleanGlish_tweets2 = cleanGlish_tweets[
        cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" %
          len(cleanGlish_tweets2["Content"]))
    #Determine if filtering out tweets with less than 3 words after stop word removal makes a difference
    #cleanGlish_tweets["num_words2"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets3 = cleanGlish_tweets[cleanGlish_tweets["num_words2"] >=3].copy()
    #print("Originally, cleanGlish2 would have had %s tweets" % len(cleanGlish_tweets3["Content"]))

    #print("Breakpoint")

    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    #print("X looks like:")
    #print(X)

    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary

    vocab = np.array(vec.get_feature_names())
    #print("Vocab is:")
    #print(vocab)
    biterms = vec_to_biterms(X)
    #print("Biterms look like:")
    #print(biterms)
    #print("The non-zero parameter we're passing looks like:")
    #print(np.count_nonzero(X, axis=1))
    #print("The sum parameter we're passing in looks like:")
    #print(np.sum(X, axis=0))
    #print("Breakpoint")

    #Create a BTM and pass the biterms to train it:
    #from biterm.btm import oBTM
    #import time
    start_time = time.time()

    #random.seed(1)
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    end_time = time.time()
    run_time = end_time - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)

    #print("First parameter:")
    #print(btm.phi_wz.T)
    #print("Topics:")
    #print(topics)

    ##See if formatting data in the following manner allows pyLDAvis.prepare to work:

    #Visualize the topics:
    #If HTML(vis) doesn't work, look at following link
    #Link: https://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/
    #import pyLDAvis
    ##!This isn't working for some reason
    #vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    #pyLDAvis.display(vis)
    #pyLDAvis.show(vis)
    #from IPython.core.display import HTML
    #HTML(vis)
    #cleanGlish_tweets2["topic"] = topics.argmax()
    cleanGlish_tweets2["topic"] = [
        str(topics[i].argmax())
        for i in range(len(cleanGlish_tweets2["Content"]))
    ]

    #print("\nTweets and Topics:")
    #for i in range(len(cleanGlish_tweets2["Content"])):
    #print("{} (topic: {})".format(cleanGlish_tweets2.iloc[i, cleanGlish_tweets2.columns.get_loc("Content")], topics[i].argmax()))
    #    cleanGlish_tweets2.iat[i, 22] = topics[i].argmax()

    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    #Save the tweet topics:
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)

Example #10
def perform_BTM(fpath, num_top):
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)
    
    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets2 = company_data[~filter1].copy()
    
    ##Designate tweets as 'OT' or 'IRT' prior to removing more tweets or altering tweet contents
    #Perform initial separation based on "^@" regex:
    initIRT = [bool(re.search("^@", i)) for i in company_tweets2["Content"]]
    initOT = [not elem for elem in initIRT]
    #print(initOT)
    
    #Create IRT and OT variables in the data:
    company_tweets2["IRT"] = initIRT
    company_tweets2["OT"] = initOT
    
    
    #Fill in NAs under the 'In Reply To' field with "OT":
    company_tweets2["In Reply To"] = company_tweets2["In Reply To"].replace(np.nan, "OT", regex=True)
    #print(company_tweets["In Reply To"].head(5))
    
    #Call function to improve on initial OT vs. IRT splits:
    company_tweets3 = cleanSplit(company_tweets2, "Content", "IRT", "In Reply To", "Author", "OT")
    
    #For this version, extract IRT tweets only:
    company_tweets = company_tweets3[company_tweets3["IRT"] == True].copy()
    #print(company_tweets.shape)
    #print("Break")    
    
    #Create column such that original tweet contents aren't totally lost after textual pre-processing
    company_tweets["Content2"] = company_tweets["Content"]
    
    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'") #replace closing smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'") #replace opening smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"") #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"") #replace closing smart quotes with regular quotes
    
    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))
    
    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    
    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")
    
    #Standardize the textual contents of tweets:
    textual_tweets = standardize_text(company_tweets, "Content")
    
    
    #Perform lemmatization on the textual contents of tweets:
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))
    
    #Removing tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    
    #Removing rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]
    
    
    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    
        
    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(r"'", "") #replace apostrophes in initial set of stopwords with nothing
    
    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)
            
    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")
    
    
                
    
    ##Vectorize the cleaned tweets
    #from sklearn.feature_extraction.text import CountVectorizer
    
    #Filter out stopwords here:
    vec = CountVectorizer(stop_words=stop_words)
    
    #Tokenize tweet contents:    
    tokenizer = RegexpTokenizer(r'\w+')
        
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)
    
    #print("Token differences:")
    #print(cleanGlish_tweets["tokens"].head(5))
    #print(cleanGlish_tweets["clean_tokens"].head(5))
    
    print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets" % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))
    #Determine if filtering out tweets with less than 3 words after stop word removal makes a difference
    #cleanGlish_tweets["num_words2"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets3 = cleanGlish_tweets[cleanGlish_tweets["num_words2"] >=3].copy()
    #print("Originally, cleanGlish2 would have had %s tweets" % len(cleanGlish_tweets3["Content"]))
    
    #print("Breakpoint")
    
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    #print("X looks like:")
    #print(X)
    
    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary
    
    vocab = np.array(vec.get_feature_names())
    #print("Vocab is:")
    #print(vocab)
    biterms = vec_to_biterms(X)
    #print("Biterms look like:")
    #print(biterms)
    #print("The non-zero parameter we're passing looks like:")
    #print(np.count_nonzero(X, axis=1))
    #print("The sum parameter we're passing in looks like:")
    #print(np.sum(X, axis=0))
    #print("Breakpoint")
    
    
    #Create a BTM and pass the biterms to train it:
    #from biterm.btm import oBTM
    #import time
    start_time = time.time()
    
    #random.seed(1)
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    end_time = time.time()
    run_time = end_time - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)
    
    #print("First parameter:")
    #print(btm.phi_wz.T)
    #print("Topics:")
    #print(topics)
    
    ##See if formatting data in the following manner allows pyLDAvis.prepare to work:
    
    
    #Visualize the topics:
    #If HTML(vis) doesn't work, look at following link
    #Link: https://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/
    #import pyLDAvis
    ##!This isn't working for some reason
    #vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    #pyLDAvis.display(vis)
    #pyLDAvis.show(vis)
    #from IPython.core.display import HTML
    #HTML(vis)
    #cleanGlish_tweets2["topic"] = topics.argmax()
    cleanGlish_tweets2["topic"] = [topics[i].argmax() for i in range(len(cleanGlish_tweets2["Content"]))]
    
    
    #print("\nTweets and Topics:")
    #for i in range(len(cleanGlish_tweets2["Content"])):
        #print("{} (topic: {})".format(cleanGlish_tweets2.iloc[i, cleanGlish_tweets2.columns.get_loc("Content")], topics[i].argmax()))
    #    cleanGlish_tweets2.iat[i, 22] = topics[i].argmax()
    
    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)
    
    #Save the tweet topics:
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)

Example #11
    def format_data(self):
        self.data = self.vectorizer.fit_transform(self.df['cleaned_text']).toarray()
        self.dictionary = np.array(self.vectorizer.get_feature_names())
        self.corpus = vec_to_biterms(self.data)