Example #1
def test_bBTM():

    bbtm = oBTM(K, V, alpha, beta)
    t0_batch = time()
    bbtm.fit(B_, iterations=iterations)
    t1_batch = time() - t0_batch
    print()
    print("Batch: {:0.2f} done in {:0.2f}s\n\n".format(max(bbtm.theta_z),
                                                       t1_batch))

    assert max(bbtm.theta_z) > threshold
    return bbtm
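
The test above relies on module-level fixtures (K, V, alpha, beta, B_, B_chunk_size, iterations, threshold) and a "from time import time" import that are not shown here. A minimal sketch of such a setup, with placeholder values, could look like this:

# Hypothetical fixtures for the oBTM tests; every value below is a placeholder.
from time import time

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms

texts = ["cats purr softly", "dogs bark loudly", "markets fell sharply"]
vec = CountVectorizer()
X = vec.fit_transform(texts).toarray()  # document-term counts

V = np.array(vec.get_feature_names())   # vocabulary
B_ = vec_to_biterms(X)                  # biterms per document
K, alpha, beta = 2, 1.0, 0.01           # topic count and Dirichlet priors
iterations = 50                         # Gibbs sampling iterations
threshold = 0.0                         # minimal acceptable max(theta_z)
B_chunk_size = 1                        # chunk size used by the online test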
Example #2
def lightningBTM(num_top, vocabulary, b_terms, x1):
    btm = oBTM(num_topics=num_top, V=vocabulary) #create the btm object
    start_time = time.time()
    for i in range(0, len(b_terms), 100): #process in chunks of 100 texts
        biterms_chunk = b_terms[i:i + 100]
        btm.fit(biterms_chunk, iterations=10) #only 10 iterations in this version, instead of 50
    topics = btm.transform(b_terms)
    end_time = time.time()
    run_time = end_time - start_time
    print("For k = %s topics.." % num_top)
    print("BTM online took %s seconds to train" % run_time)
    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, x1, vocabulary, 10)
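
A call to lightningBTM might look like the following, assuming the document-term matrix, vocabulary and biterms are built the same way as in the other examples (the sample documents and topic count are placeholders):

# Hypothetical usage of lightningBTM; the imports mirror what the helper needs.
import time

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary

docs = ["shares rally on strong earnings", "new phone model launched today",
        "coach praises team after the win"]
vec = CountVectorizer(stop_words='english')
X = vec.fit_transform(docs).toarray()
vocab = np.array(vec.get_feature_names())
biterms = vec_to_biterms(X)

lightningBTM(num_top=5, vocabulary=vocab, b_terms=biterms, x1=X)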
Example #3
def test_oBTM():

    obtm = oBTM(K, V, alpha, beta, l=1.)
    t0_online = time()
    for i in range(0, len(B_), B_chunk_size):
        B_d_ = B_[i:i + B_chunk_size]
        obtm.fit(B_d_, iterations=iterations)
    t1_online = time() - t0_online
    print()
    print("Online: {:0.2f} done in {:0.2f}s\n\n".format(
        max(obtm.theta_z), t1_online))

    assert max(obtm.theta_z) > threshold
    return obtm
Example #4
def main(start, end, increment):
    path = Path('C:/Data/Python/JobLoss')
    data_words = []
    with open(path / 'Processed.json') as f:
        data = json.load(f)
        for tweet in data:
            data_words.append(' '.join(tweet[1]))
    vec = CountVectorizer()
    X = vec.fit_transform(data_words).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)
    for k in range(start, end, increment):
        print('Model %s' % k)
        btm = oBTM(num_topics=k, V=vocab)
        for i in range(0, len(biterms), chunksize):
            print('%s / %s' % (i, len(biterms)))
            biterms_chunk = biterms[i:i + chunksize]
            btm.fit(biterms_chunk, iterations=iterations)
        topics = btm.transform(biterms)
        vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X,
                                                                      axis=1),
                               vocab, np.sum(X, axis=0))
        pyLDAvis.save_html(
            vis, str(path / ('Visualizations/BTMVisualization%s.html' % k)))
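
main() additionally depends on imports and on module-level chunksize and iterations settings that are not shown; a plausible sketch, where the two values are placeholders rather than the ones used originally:

# Imports implied by the calls in main(), plus placeholder settings.
import json
from pathlib import Path

import numpy as np
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms

chunksize = 100     # biterm chunk size for online fitting (placeholder)
iterations = 50     # Gibbs iterations per chunk (placeholder)

# Example sweep over k = 5, 10, 15, 20:
# main(5, 25, 5)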
Example #5
if __name__ == "__main__":

    texts = open('./data/reuters.titles').read().splitlines()[:50]

    # vectorize texts
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=20, V=vocab)

    print("\n\n Train BTM ..")
    topics = btm.fit_transform(biterms, iterations=100)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    # pyLDAvis.save_html(vis, './vis/simple_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    print("\n\n Texts & Topics ..")
    for i in range(len(texts)):
        print("{} (topic: {})".format(texts[i], topics[i].argmax()))
Example #6
texts = list(
    pd.read_csv('../dataset/clickandnonclick_32000.csv',
                encoding='ISO-8859-1')['text'])[1200:1700]
# vectorize texts
vec = CountVectorizer(stop_words='english')
X = vec.fit_transform(texts).toarray()

# get vocabulary
vocab = np.array(vec.get_feature_names())

# get biterms
biterms = vec_to_biterms(X)

# create btm
btm = oBTM(num_topics=9, V=vocab)

print("\n\n Train BTM ..")
topics = btm.fit_transform(biterms, iterations=100)

print("\n\n Visualize Topics ..")
vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1),
                       vocab, np.sum(X, axis=0))
pyLDAvis.save_html(vis, '../assets/BTM.html')

print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, X, vocab, 20)

print("\n\n Texts & Topics ..")
for i in range(len(texts)):
    print("{} (topic: {})".format(texts[i], topics[i].argmax()))
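
The script above relies on the following imports, which are implied by the calls but not shown in the snippet:

import numpy as np
import pandas as pd
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary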
Example #7
#Get the vocabulary and the biterms from the tweets:
from biterm.utility import vec_to_biterms, topic_summuary

vocab = np.array(vec.get_feature_names())
#print("Vocab is:")
#print(vocab)
biterms = vec_to_biterms(X)

#Create a BTM and pass the biterms to train it:
from biterm.btm import oBTM
import time

start_time = time.time()

btm = oBTM(num_topics=13, V=vocab)
##Online BTM training, link = https://pypi.org/project/biterm/
print("\nTrain Online BTM")
for i in range(0, len(biterms), 100):  #process in chunks of 100 texts
    biterms_chunk = biterms[i:i + 100]
    btm.fit(biterms_chunk, iterations=50)
topics = btm.transform(biterms)

end_time = time.time()
run_time = end_time - start_time
print("BTM online took %s seconds to train" % run_time)

print("\nTweets and Topics:")
for i in range(len(cleanGlish_tweets2["Content"])):
    print("{} (topic: {})".format(
        cleanGlish_tweets2.iloc[i,
Example #8
def __init__(self, config, preprocessor):
    self.model_path = config.labels_generator.paths.save_model_path
    self.dictionary = preprocessor.dictionary
    self.model = oBTM(num_topics=config.labels_generator.model.num_topics,
                      V=self.dictionary)
    self.iterations = config.labels_generator.model.iterations
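
The class this constructor belongs to is not shown in full. A hedged sketch of how it might be instantiated, using a namespace-style config object (the class name, config layout and preprocessor here are assumptions, not part of the original snippet):

# Hypothetical instantiation; all names and values below are placeholders.
from types import SimpleNamespace

config = SimpleNamespace(
    labels_generator=SimpleNamespace(
        paths=SimpleNamespace(save_model_path="models/btm_labels.pkl"),
        model=SimpleNamespace(num_topics=10, iterations=50),
    )
)
preprocessor = SimpleNamespace(dictionary=vocab)    # vocab: np.array of terms
# generator = LabelsGenerator(config, preprocessor) # class name is a guess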
Example #9
def perform_BTM(fpath, num_top):
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[
        0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)

    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets = company_data[~filter1].copy()

    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"’", "'")  #replace closing smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"‘", "'")  #replace opening smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"“", "\"")  #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"”", "\"")  #replace closing smart quotes with regular quotes

    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))

    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(
        r"'s", "")

    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    #Perform standardization on the textual contents of the company's tweets:
    #No longer keep newline chars in text, replace double spaces with spaces, now keeping hashtag symbols themselves
    #def standardize_text(df, text_field):
    #    df[text_field] = df[text_field].str.replace(r".", "") #remove/replace periods w/ nothing. Should now count acronyms as one word
    #    df[text_field] = df[text_field].str.replace(r"&", "and") #replace ampersands with 'and'
    #    df[text_field] = df[text_field].str.replace(r"http\S+", "") #remove links and replace w/ nothing
    #    df[text_field] = df[text_field].str.replace(r"http", "") #ensure all links have been removed
    #    df[text_field] = df[text_field].str.replace(r"@\S+", "") #remove @username mentions and replace with nothing
    #    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?#@\'\`\"\_]", " ")#Remove/replace anything that's not capital/lowercase letter, number, parentheses, comma, or any of the following symbols with a space
    #    df[text_field] = df[text_field].str.replace(r"@", "at") #replace any remaining '@' symbols with 'at'
    #    df[text_field] = df[text_field].str.lower() #convert all remaining text to lowercase
    #    #remove double spaces and replace with single space
    #    df[text_field] = df[text_field].str.replace(r"\s+", " ")
    #    return df

    textual_tweets = standardize_text(company_tweets, "Content")

    #Examine tweets after standardization has been performed:
    #print(textual_tweets["Content"].head(5))

    #Perform lemmatization on the textual contents of the tweets:
    ##! Code for this function derived from the following link: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    #from textblob import TextBlob, Word

    #def lem_with_postag(df, text_field):
    #    tag_dict = {"J": 'a',
    #                "N": 'n',
    #                "V": 'v',
    #                "R": 'r'}
    #    output = []
    #    for tweet in df[text_field]:
    #        sent = TextBlob(tweet)
    #        words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    #        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    #        lemTweet = " ".join(lemmatized_list)
    #        output.append(lemTweet)
    #    return output

    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))

    #Removing tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]

    #Removing rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]

    #Creating tokens:
    #from nltk.tokenize import RegexpTokenizer

    #tokenizer = RegexpTokenizer(r'\w+')

    #cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)

    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))

    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(
        r"'",
        "")  #replace apostrophes in initial set of stopwords with nothing

    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)

    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")

    ##!!This doesn't seem to be compatible with BTM!
    #filtered_toks = []

    #Filter out the stop words:
    #for w in cleanGlish_tweets["tokens"]: #tweet tokens
    #    for j in w: #word tokens within each tweet
    #        if j not in stop_words:
    #            filtered_toks.append(j)

    ##!Seems possible that I need to filter out tweets with less than 3 words remaining for below to work:
    #from nltk.tokenize import RegexpTokenizer

    #tokenizer = RegexpTokenizer(r'\w+')

    #cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    #print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets" % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    #cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    #print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))
    #print("Breakpoint")

    ##Vectorize the cleaned tweets
    #from sklearn.feature_extraction.text import CountVectorizer

    #Filter out stopwords here:
    vec = CountVectorizer(stop_words=stop_words)
    ##Seems that a potential problem above is that I'm filtering out tweets w/ less than 3 words before stopword removal:
    ##Thus, making it possible that tweets with less than 3 counting words are being fed to the model
    ##!!I think I can supply my own set of stopwords above, rather than use CountVectorizer's pre-defined set
    #print("Stop words:")
    #for word in vec.stop_words:
    #    print(word)

    #Save CountVectorizer's set of stop words:
    #stop_words = [word for word in vec.stop_words]
    #print("Stop words variable:")
    #for word in stop_words:
    #    print(word)

    ##Filter out tweets w/ less than 3 words after stop word removal:
    #def clean_tokenize(df, text_field, stop_set):
    #    output = []
    #    for tweet in df[text_field]:
    #        clean_toks = []
    #        for tok in tweet:
    #            if tok not in stop_set:
    #                clean_toks.append(tok)
    #        output.append(clean_toks)
    #    return output

    #from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'\w+')

    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(
        tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets,
                                                       "tokens", stop_words)

    #print("Token differences:")
    #print(cleanGlish_tweets["tokens"].head(5))
    #print(cleanGlish_tweets["clean_tokens"].head(5))

    print(
        "Before filtering out tweets with 3 words or less, cleanGlish has %s tweets"
        % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [
        len(token) for token in cleanGlish_tweets["clean_tokens"]
    ]
    cleanGlish_tweets2 = cleanGlish_tweets[
        cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" %
          len(cleanGlish_tweets2["Content"]))
    #Determine if filtering out tweets with less than 3 words after stop word removal makes a difference
    #cleanGlish_tweets["num_words2"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets3 = cleanGlish_tweets[cleanGlish_tweets["num_words2"] >=3].copy()
    #print("Originally, cleanGlish2 would have had %s tweets" % len(cleanGlish_tweets3["Content"]))

    #print("Breakpoint")

    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    #print("X looks like:")
    #print(X)

    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary

    vocab = np.array(vec.get_feature_names())
    #print("Vocab is:")
    #print(vocab)
    biterms = vec_to_biterms(X)
    #print("Biterms look like:")
    #print(biterms)
    #print("The non-zero parameter we're passing looks like:")
    #print(np.count_nonzero(X, axis=1))
    #print("The sum parameter we're passing in looks like:")
    #print(np.sum(X, axis=0))
    #print("Breakpoint")

    #Create a BTM and pass the biterms to train it:
    #from biterm.btm import oBTM
    #import time
    start_time = time.time()

    #random.seed(1)
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    end_time = time.time()
    run_time = end_time - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)

    #print("First parameter:")
    #print(btm.phi_wz.T)
    #print("Topics:")
    #print(topics)

    ##See if formatting data in the following manner allows pyLDAvis.prepare to work:

    #Visualize the topics:
    #If HTML(vis) doesn't work, look at following link
    #Link: https://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/
    #import pyLDAvis
    ##!This isn't working for some reason
    #vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    #pyLDAvis.display(vis)
    #pyLDAvis.show(vis)
    #from IPython.core.display import HTML
    #HTML(vis)
    #cleanGlish_tweets2["topic"] = topics.argmax()
    cleanGlish_tweets2["topic"] = [
        str(topics[i].argmax())
        for i in range(len(cleanGlish_tweets2["Content"]))
    ]

    #print("\nTweets and Topics:")
    #for i in range(len(cleanGlish_tweets2["Content"])):
    #print("{} (topic: {})".format(cleanGlish_tweets2.iloc[i, cleanGlish_tweets2.columns.get_loc("Content")], topics[i].argmax()))
    #    cleanGlish_tweets2.iat[i, 22] = topics[i].argmax()

    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    #Save the tweet topics:
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)
Example #10
def perform_BTM(fpath, num_top):
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)
    
    #Remove retweets from the company account, as they aren't technically company account tweets
    patternDel = "^RT @"
    filter1 = company_data["Content"].str.contains(patternDel)
    company_tweets2 = company_data[~filter1].copy()
    
    ##Designate tweets as 'OT' or 'IRT' prior to removing more tweets or altering tweet contents
    #Perform initial separation based on "^@" regex:
    initIRT = [bool(re.search("^@", i)) for i in company_tweets2["Content"]]
    initOT = [not elem for elem in initIRT]
    #print(initOT)
    
    #Create IRT and OT variables in the data:
    company_tweets2["IRT"] = initIRT
    company_tweets2["OT"] = initOT
    
    
    #Fill in NAs under the 'In Reply To' field with "OT":
    company_tweets2["In Reply To"] = company_tweets2["In Reply To"].replace(np.nan, "OT", regex=True)
    #print(company_tweets["In Reply To"].head(5))
    
    #Call function to improve on initial OT vs. IRT splits:
    company_tweets3 = cleanSplit(company_tweets2, "Content", "IRT", "In Reply To", "Author", "OT")
    
    #For this version, extract IRT tweets only:
    company_tweets = company_tweets3[company_tweets3["IRT"] == True].copy()
    #print(company_tweets.shape)
    #print("Break")    
    
    #Create column such that original tweet contents aren't totally lost after textual pre-processing
    company_tweets["Content2"] = company_tweets["Content"]
    
    #Remove/replace 'smart' apostrophes and quotation marks with standard keyboard equivalents:
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'") #replace closing smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'") #replace opening smart apostrophes with regular apostrophe
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"") #replace opening smart quotes with regular quotes
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"") #replace closing smart quotes with regular quotes
    
    #Examine tweets after removing/replacing 'smart' apostrophes and quotes:
    #print(company_tweets["Content"].head(5))
    
    #Remove apostrophes followed by 's' and replace with nothing (Disney's becomes Disney):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    
    #Remove remaining apostrophes and replace with nothing (convert I'm to Im and such):
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")
    
    #Standardize the textual contents of tweets:
    textual_tweets = standardize_text(company_tweets, "Content")
    
    
    #Perform lemmatization on the textual contents of tweets:
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")
    #print(textual_tweets["Content"].head(5))
    
    #Removing tweets that weren't originally in English
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    
    #Removing rows with no text left inside them
    filter1 = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[filter1]
    
    
    #Remove stop words from the data:
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    
        
    ##Expand on the initial set of stopwords:
    stop_words2 = pd.DataFrame(stop_words)
    stop_words2["Words"] = stop_words
    add_stopwords = stop_words2["Words"].str.replace(r"'", "") #replace apostrophes in initial set of stopwords with nothing
    
    #Add the newly created stopwords to the original set:
    for word in add_stopwords:
        if word not in stop_words:
            stop_words.add(word)
            
    #These words need to be added manually to the set of stopwords:
    stop_words.add("wed")
    stop_words.add("us")
    #Lemmatization, for some reason, converts "us" to "u". Therefore, "u" should be added as a stopword as well (for lemmatized versions)
    stop_words.add("u")
    
    
                
    
    ##Vectorize the cleaned tweets
    #from sklearn.feature_extraction.text import CountVectorizer
    
    #Filter out stopwords here:
    vec = CountVectorizer(stop_words=stop_words)
    
    #Tokenize tweet contents:    
    tokenizer = RegexpTokenizer(r'\w+')
        
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)
    
    #print("Token differences:")
    #print(cleanGlish_tweets["tokens"].head(5))
    #print(cleanGlish_tweets["clean_tokens"].head(5))
    
    print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets" % len(cleanGlish_tweets["Content"]))
    #Filter out tweets with less than 3 words:
    cleanGlish_tweets["num_words"] = [len(token) for token in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))
    #Determine if filtering out tweets with less than 3 words after stop word removal makes a difference
    #cleanGlish_tweets["num_words2"] = [len(token) for token in cleanGlish_tweets["tokens"]]
    #cleanGlish_tweets3 = cleanGlish_tweets[cleanGlish_tweets["num_words2"] >=3].copy()
    #print("Originally, cleanGlish2 would have had %s tweets" % len(cleanGlish_tweets3["Content"]))
    
    #print("Breakpoint")
    
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    #print("X looks like:")
    #print(X)
    
    #Get the vocabulary and the biterms from the tweets:
    #from biterm.utility import vec_to_biterms, topic_summuary
    
    vocab = np.array(vec.get_feature_names())
    #print("Vocab is:")
    #print(vocab)
    biterms = vec_to_biterms(X)
    #print("Biterms look like:")
    #print(biterms)
    #print("The non-zero parameter we're passing looks like:")
    #print(np.count_nonzero(X, axis=1))
    #print("The sum parameter we're passing in looks like:")
    #print(np.sum(X, axis=0))
    #print("Breakpoint")
    
    
    #Create a BTM and pass the biterms to train it:
    #from biterm.btm import oBTM
    #import time
    start_time = time.time()
    
    #random.seed(1)
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    end_time = time.time()
    run_time = end_time - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)
    
    #print("First parameter:")
    #print(btm.phi_wz.T)
    #print("Topics:")
    #print(topics)
    
    ##See if formatting data in the following manner allows pyLDAvis.prepare to work:
    
    
    #Visualize the topics:
    #If HTML(vis) doesn't work, look at following link
    #Link: https://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/
    #import pyLDAvis
    ##!This isn't working for some reason
    #vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    #pyLDAvis.display(vis)
    #pyLDAvis.show(vis)
    #from IPython.core.display import HTML
    #HTML(vis)
    #cleanGlish_tweets2["topic"] = topics.argmax()
    cleanGlish_tweets2["topic"] = [topics[i].argmax() for i in range(len(cleanGlish_tweets2["Content"]))]
    
    
    #print("\nTweets and Topics:")
    #for i in range(len(cleanGlish_tweets2["Content"])):
        #print("{} (topic: {})".format(cleanGlish_tweets2.iloc[i, cleanGlish_tweets2.columns.get_loc("Content")], topics[i].argmax()))
    #    cleanGlish_tweets2.iat[i, 22] = topics[i].argmax()
    
    #Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)
    
    #Save the tweet topics:
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)
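
Both perform_BTM variants assume helpers and settings defined elsewhere in the script (standardize_text, lem_with_postag, clean_tokenize, cleanSplit, respath, resEnding, plus the pandas, numpy, nltk, sklearn, biterm and time imports). A hedged sketch of the output-path settings and a call, with placeholder values:

# Placeholder output settings and call; the real paths are not shown above.
respath = "C:/Data/Results/BTM_"
resEnding = "_topics.xlsx"

# perform_BTM("C:/Data/Tweets/SomeCompany.xlsx", num_top=13)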