Example #1
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore


def train_classifier(papers: list, num_topics: int) -> tuple:
    """
    Trains an LDA model on the selected documents.
    Training is done by cleaning the documents, indexing the words,
    and training the model with a given number of topics.
    Args:
        papers: list of papers, each item containing the corpus of a document
        num_topics: number of topics to be trained

    Returns: the selected trained LDA model and the dictionary used to build it

    """
    # clean() is a project-specific preprocessing helper defined elsewhere
    papers_clean = [clean(paper) for paper in papers]
    dictionary = corpora.Dictionary(papers_clean)
    doc_term_matrix = [dictionary.doc2bow(paper) for paper in papers_clean]
    models = []
    print("Start generating models")
    # only a single candidate model (13 topics) is evaluated here; num_topics is not used by this loop
    for x in range(13, 14):
        ldamodel = LdaMulticore(doc_term_matrix, num_topics=x, id2word=dictionary, passes=50)
        topic_words = [w[0] for t in range(ldamodel.num_topics) for w in ldamodel.show_topic(t)]
        unique_words = set(topic_words)
        models.append(ldamodel)
        print(x, len(unique_words), len(unique_words)/float(len(topic_words)))
    x = 1
    while True:
        try:
            x = int(input("Enter the model you want to train labels for:\n"))
        except ValueError:
            print("not an integer")
            continue
        if x > len(models) or x < 1:
            print("Model does not exist")
        else:
            break
    return models[x-1], dictionary
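
A minimal usage sketch for the function above (the document strings are purely illustrative; it assumes the gensim imports and the clean() helper the function relies on are available):

papers = [
    "Topic models discover latent themes in document collections.",
    "LDA represents each document as a mixture of topics.",
]
# prints the candidate models, then prompts for the model to keep
model, dictionary = train_classifier(papers, num_topics=13)
print(model.show_topic(0))
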
Example #2
    def generate_lda_topics(self):
        from gensim.corpora import Dictionary, MmCorpus
        from gensim.models.ldamulticore import LdaMulticore
        from gensim.models.word2vec import LineSentence
        import pyLDAvis
        import pyLDAvis.gensim
        import warnings
        import _pickle as pickle

        trigram_sentences = LineSentence(self.trigram_sentences_filepath)
        trigram_dictionary = Dictionary(trigram_sentences)
        # trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
        trigram_dictionary.compactify()
        trigram_dictionary.save(self.trigram_dictionary_filepath)

        def trigram_bow_generator(filepath):
            for sentence in LineSentence(filepath):
                yield trigram_dictionary.doc2bow(sentence)

        MmCorpus.serialize(
            self.trigram_bow_filepath,
            trigram_bow_generator(self.trigram_sentences_filepath))
        trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=3,
                               id2word=trigram_dictionary,
                               workers=3)
            lda.save(self.lda_model_filepath)
        lda = LdaMulticore.load(self.lda_model_filepath)
        lda.show_topic(0)
        lda.show_topic(1)
        lda.show_topic(2)
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                                  trigram_dictionary)
        pyLDAvis.save_html(LDAvis_prepared, self.LDAvis_html_filepath)
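
The method above assumes it is defined on a class that stores the relevant file paths. A minimal sketch of such a class, with attribute names mirroring those used above (everything else is an assumption):

class TopicPipeline:
    def __init__(self, work_dir):
        # file paths that generate_lda_topics reads and writes
        self.trigram_sentences_filepath = work_dir + '/trigram_sentences.txt'
        self.trigram_dictionary_filepath = work_dir + '/trigram_dict.dict'
        self.trigram_bow_filepath = work_dir + '/trigram_bow.mm'
        self.lda_model_filepath = work_dir + '/lda_model'
        self.LDAvis_html_filepath = work_dir + '/ldavis.html'

    # generate_lda_topics from Example #2 would be defined here
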
Example #3
    def perform(self, option="load"):
        """
        Perform LDA analysis to generate topics
        and topic distribution for each app
        """
        logging.info("Start Lda analysis")

        ldamodel = LdaMulticore(self.corpus, num_topics=self.ntopic, id2word=self.dictionary, passes=self.iteration)
        logging.info("LDA multicore modeling done")

        ldamodel.save(self.lda_out_file_name)

        self.topics = {}
        for i in range(self.ntopic):
            self.topics["topic{}".format(i)] = ldamodel.show_topic(i, topn=self.nword)
            logging.info("Topic{}".format(i))
            # show_topic returns (word, probability) pairs; log the words, not the probabilities
            words = [w[0] for w in self.topics["topic{}".format(i)]]
            logging.info(words)
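
A rough sketch of the attributes this method expects on its object, purely for illustration (the class name and constructor are assumptions; the attribute names mirror those referenced above):

from gensim.corpora import Dictionary

class AppTopicAnalyzer:
    def __init__(self, texts, ntopic=10, iteration=20, nword=10, lda_out_file_name='lda_model'):
        # texts: list of tokenized app descriptions
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(t) for t in texts]
        self.ntopic = ntopic
        self.iteration = iteration
        self.nword = nword
        self.lda_out_file_name = lda_out_file_name

    # perform() from Example #3 (which also needs logging and LdaMulticore imported)
    # would be defined here
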
Example #4
# imports assumed by this example; nlp is a loaded spaCy model, and make_df, filtration,
# dataframe_to_dict, bag_of_words_generator and pickle_object are project helpers
from collections import Counter
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore


def LDA_Machine(lst_dict, handle_lst):
    assert isinstance(lst_dict, list), "Please enter a list of dictionaries"
    assert isinstance(handle_lst, list), "Please enter a list of handles"

    file_path_corpus = "/home/igabr/new-project-4/mm_corpus/"

    cnt_1 = -1
    cnt_2 = -1

    for handle in handle_lst:
        cnt_1 += 1

        clean_tweet_list = []

        handle_tweets = lst_dict[cnt_1][handle]['content']

        if handle_tweets == []:
            continue
        else:
            for raw_tweet in handle_tweets:

                clean_tweet = ""

                tokenized_tweet = nlp(raw_tweet)

                for token in tokenized_tweet:
                    if token.is_space:
                        continue
                    elif token.is_punct:
                        continue
                    elif token.is_stop:
                        continue
                    elif token.is_digit:
                        continue
                    elif len(token) <= 2:
                        continue
                    else:
                        clean_tweet += str(token.lemma_) + " "

                clean_tweet_list.append(clean_tweet)
            clean_tweet_list = list(map(str.strip, clean_tweet_list))
            clean_tweet_list = [x for x in clean_tweet_list if x != ""]
            lst_dict[cnt_1][handle]['tokenized_tweets'] = clean_tweet_list
            print("{} tokenized_tweets inserted!".format(handle))
            print()

    master_df = make_df(lst_dict)

    to_remove = list(master_df[master_df['tokenized_tweets'].isnull()].index)

    index_to_remove = []
    for i in to_remove:
        index_to_remove.append(handle_lst.index(i))

    new_handle_list = [
        v for i, v in enumerate(handle_lst)
        if i not in frozenset(index_to_remove)
    ]

    master_df.dropna(subset=['tokenized_tweets'], inplace=True)

    master_df = filtration(master_df, "tokenized_tweets")

    clean_lst_dict = dataframe_to_dict(master_df)
    print()
    print("Cleaning of master dataframe complete!")

    for handle in new_handle_list:
        cnt_2 += 1

        try:
            list_of_tweets = clean_lst_dict[cnt_2][handle]['tokenized_tweets']
        except KeyError:
            continue

        gensim_format_tweets = []

        for tweet in list_of_tweets:
            list_form = tweet.split()
            gensim_format_tweets.append(list_form)

        gensim_dictionary = Dictionary(gensim_format_tweets)
        gensim_dictionary.filter_extremes(no_below=10, no_above=0.4)
        gensim_dictionary.compactify()  # remove gaps after words that were removed

        MmCorpus.serialize(
            file_path_corpus + "{}.mm".format(handle),
            bag_of_words_generator(gensim_format_tweets, gensim_dictionary))

        corpus = MmCorpus(
            file_path_corpus +
            "{}.mm".format(handle))  #loading the corpus from disk

        if corpus.num_terms == 0:
            continue
        else:
            lda = LdaMulticore(corpus,
                               num_topics=10,
                               id2word=gensim_dictionary,
                               passes=100,
                               workers=100)
            lda.save(file_path_corpus + "lda_model_{}".format(handle))
            print("LDA model for {} saved!".format(handle))

            word_list = []

            for i in range(10):
                for term, frequency in lda.show_topic(i, topn=100):
                    if frequency != 0:
                        word_list.append(term)

            LDA_Counter = Counter(word_list)

            clean_lst_dict[cnt_2][handle]['LDA'] = LDA_Counter
            print("Inserted LDA Counter into {} dictionary".format(handle))

    pickle_object(clean_lst_dict, "2nd_degree_connections_LDA_complete")
    print("Script Complete")
Example #5
# imports assumed by this example (project_name is defined elsewhere in the project)
import re
import codecs
import urllib.request as ur

import numpy as np
import spacy
from gensim.corpora import Dictionary
from gensim.models import Phrases, CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics import pairwise_distances


class recommendationsys_LDA:
    def __init__(self, ngram):
        # load the spacy english model
        self.nlp = spacy.load('en')
        
        self.extrawords = ["'s", "st", "th", "’s", "-PRON-", "’", "htt", "ht", "km", "pm", "am"]
        
        # parse the latest emoji code
        html = str(ur.urlopen('http://www.unicode.org/Public/emoji/5.0/emoji-data.txt').read())
        codes=list(map(lambda x: '-'.join(['\\U'+a.zfill(8) for a in x.split('..')]).encode().decode('unicode-escape'),re.findall(r'(?<=\\n)[\w.]+',html)))
        self.emojiPattern = re.compile('['+','.join(codes)+']',flags=re.UNICODE)
        
        PROJECT_DIRECTORY = 'output/project/' + project_name

        self.f_titles = PROJECT_DIRECTORY + '/titlesLF_target.txt'

        self.f_authors = PROJECT_DIRECTORY + '/authors_target.txt'
        
        self.authorcontent_clean = {}
        
        self.ngram_bow_corpus = []
        
        self.ldavec = {}
        
        self.ngram_dictionary = None
        
        self.ngram = ngram
        self.num_topics = None

    def clean_text(self, text):

        # lowercase and replace '\n' with ' . '
        text = text.lower()
        #text = text.replace('RT',' ')
        text = text.replace('\n',' . ')

        # strip @ mentions, hashtag signs, urls, stray punctuation, 'rt' and ellipses
        myre = re.compile(u'('
                         '@\S*\s?|#|'   # remove @ mention names and hashtag sign
                         'http[s]?[:…]+\S*\s|' # remove url
                         '[-~\^\$\*\+\{\}\[\]\\\|\(\)/“"]|'
                         'rt[:]? |'
                         '…'
                         ')+', re.UNICODE)

        text = myre.sub(' ', text)
        # remove emojis (the pattern is built for UCS-2 code points in __init__)
        text = self.emojiPattern.sub(' ', text)

        text = text.replace('&amp;','and')

        #text = ' '.join(text)

        return text


#---------------------------
# make the recommendations
#---------------------------
    def recomendation(self, username, topicn=0, userlist=None):

        similarities = self.ldacosinesimilarity(username, topicn)
        result = []
        # if no user list is given, rank the whole dataset
        if not userlist:
            for key, value in sorted(similarities.items(), key=lambda x: x[1]):
                result.append((key, value))
        else:
            for i in userlist:
                result.append((i, similarities[i]))

            # sort the result by similarity
            result = sorted(result, key=lambda x: x[1])

        return result

#---------------------------
# load and clean the data
#---------------------------
    def loadandclean(self, n=-1):

        #authorcontent = {}

        # ------
        with codecs.open(self.f_titles, encoding='utf_8') as f_t:
            with codecs.open(self.f_authors, encoding='utf_8') as f_a:
                for l_a, l_t in zip(f_a, f_t):
                    # remove the '\n' at the end
                    key = l_a[:-1].lower()
            
                    l_t = self.clean_text(l_t)
                    if key in self.authorcontent_clean:
                        
                        self.authorcontent_clean[key].append(l_t)
                        #self.authorcontent_clean[key] = self.clean_text(value)
                    else:
                        
                        self.authorcontent_clean[key] = [l_t]
                        #self.authorcontent_clean[key] = self.clean_text(value)
                    
                    if n != -1 and len(self.authorcontent_clean) == n:
                        break
        # ---------------                
        

        for key, value in self.authorcontent_clean.items():
           self.authorcontent_clean[key] = self.lemmatized_sentence_corpus(self.authorcontent_clean[key])


    
#------------------------------------------------------
# build the trigram content based on the clean content
#------------------------------------------------------
    
    def punct_space(self, token):
        """
        helper function to eliminate tokens
        that are pure punctuation or whitespace
        """
        #return token.pos_ == 'NOUN' or token.is_punct or token.is_space or token.lemma_ in spacy.lang.en.STOP_WORDS or token.lemma_ in self.extrawords or len(str(token)) < 2
        return token.is_punct or token.is_space or token.lemma_ in spacy.lang.en.STOP_WORDS or token.lemma_ in self.extrawords or len(str(token)) < 2

    def lemmatized_sentence_corpus(self, contents):
        """
        generator function to use spaCy to parse reviews,
        lemmatize the text, and yield sentences
        """
        sentents = []
    
        for content in self.nlp.pipe(contents,batch_size=500, n_threads=8):
        
            for sent in content.sents:
                #sentents.append(u' '.join([token.lemma_ for token in sent
                #                 if not punct_space(token)]))
                #sentents.append([token.lemma_ for token in sent
                #                 if not punct_space(token)])
                tokens = []
                for token in sent:
                    if self.punct_space(token):
                        continue
                
                    #if token.lemma_ == '-PRON-':
                    #    token.lemma_ = token.lower_
                    tokens.append(token.lemma_)
                
                sentents.append(tokens)
                    
        return sentents

    """
    prepare the parameters for lda
    """
    def ldainit(self):
        
#        self.num_topics = num_topics
#        ngram = self.ngram
#        # if ngram_bow_corpus is empty, build it first
#        if not self.ngram_bow_corpus: 
        
        self.user_sentences = self.authorcontent_clean
        self.user_bigramsentences = {}
        self.all_sentences = []
        self.all_bigram_sentences = []
        
        sentences = list(self.authorcontent_clean.values())
        self.all_sentences = [item for sublist in sentences for item in sublist]
        
        # build the bigram model
        if self.ngram == 2:
            self.bigram_model = Phrases(self.all_sentences)
            for user,content in self.user_sentences.items():
                bigram_s = []
                for s in content:
                    bigram_s.append(self.bigram_model[s])
                self.user_bigramsentences[user] = bigram_s
                self.all_bigram_sentences += self.user_bigramsentences[user]
                
            
            
    def trainlda(self, topics_n = 10):
        self.num_topics = topics_n
        
        alltexts = []
        for name,sentences in self.user_sentences.items():
            sentences = [item for sublist in sentences for item in sublist]
            alltexts.append(sentences)
        
        
#        if self.ngram_dictionary == None:
#            if self.ngram == 1:
#                self.ngram_dictionary = Dictionary(self.all_sentences)
#            elif self.ngram == 2:
#                self.ngram_dictionary = Dictionary(self.all_bigram_sentences)
#                
        if self.ngram_dictionary is None:
            if self.ngram == 1:
                self.ngram_dictionary = Dictionary(alltexts)
            elif self.ngram == 2:
                self.ngram_dictionary = Dictionary(alltexts)
                
            # filter tokens that are very rare or too common from
            # the dictionary (filter_extremes) and reassign integer ids (compactify)
            self.ngram_dictionary.filter_extremes(no_below=10, no_above=0.8)
            self.ngram_dictionary.compactify()


#        if self.ngram == 1:
#            sentences = self.all_sentences
#        elif self.ngram == 2:
#            sentences = self.all_bigram_sentences
            
#        ngram_bow_corpus = []
#        for sentence in sentences:
#            ngram_bow_corpus.append(self.ngram_dictionary.doc2bow(sentence))
#
#
#        self.lda = LdaMulticore(ngram_bow_corpus,
#                           num_topics = topics_n,
#                           id2word=self.ngram_dictionary,
#                           workers=3)
        

            
        ngram_bow_corpus = []
        for sentence in alltexts:
            ngram_bow_corpus.append(self.ngram_dictionary.doc2bow(sentence))


        self.lda = LdaMulticore(ngram_bow_corpus,
                           num_topics = topics_n,
                           id2word=self.ngram_dictionary,
                           workers=3)    
        
        
        # calculate the coherence scores for the trained model
        topics=[]

        for i in range(self.lda.num_topics):
            terms = []
            for n in self.lda.show_topic(i):
                terms.append(n[0])
            topics.append(terms)
        
        cm_umass = CoherenceModel(topics=topics, corpus=ngram_bow_corpus, dictionary=self.ngram_dictionary, coherence='u_mass')
        cm_cv = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_v')
        cm_cuci = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_uci')
        cm_cnpmi = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_npmi')

        return topics_n, cm_umass.get_coherence(), cm_cv.get_coherence(),cm_cuci.get_coherence(),cm_cnpmi.get_coherence()

        
    def explore_topic(self, topic_number, topn=25):
        """
        accept a user-supplied topic number and
        print out a formatted list of the top terms
        """
        
        print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

        for term, frequency in self.lda.show_topic(topic_number, topn):
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))



    def runlda(self, username):
        
        if self.ngram == 1:
            user_sentences = self.user_sentences[username]
        elif self.ngram == 2:
            user_sentences = self.user_bigramsentences[username]
        
        # flat the list of list into single list
        user_sentences = [item for sublist in user_sentences for item in sublist]
        user_bow = self.ngram_dictionary.doc2bow(user_sentences)

        user_lda = self.lda[user_bow]

        #user_lda = sorted(user_lda, key=lambda x:-x[1])
        
        return user_lda

    """
    compute the lda topic vec for every one 
    """
    def runldavec(self):
        if not self.ldavec:
            for key, value in self.user_sentences.items():
                vec = np.zeros(self.num_topics)
                result = self.runlda(key)
                for i in result:
                    vec[i[0]] = i[1]
                self.ldavec[key] = vec
                
            
    """
    """
    def runtopntopic(self, n):
        self.topntopics = []
        
        for key, value in self.ldavec.items():
            idx = value.argsort()
                
            self.topntopics += list(idx[-n:])
        
        self.topntopics = list(set(self.topntopics))
    
    """
    compute the lda cosine similarity between a given user and the rest users
    """
    def ldacosinesimilarity(self, username, topn=0):
        if username not in self.authorcontent_clean:
            print('The user cannot be found')
            return
        if topn < 0:
            print('topn should be >= 0')
            return
        
        topn = int(topn)
        
        cosinesimilaritydic = {}
        
        if not self.ldavec:
            self.runldavec()
        
        if topn == 0:
            usertopicvec = self.ldavec[username]
        else:
            self.runtopntopic(topn)
            usertopicvec = self.ldavec[username][self.topntopics]
            
        for key, value in self.ldavec.items():
            if key != username:
                if topn == 0:
                    pairtopicvec = value
                else:
                    pairtopicvec = value[self.topntopics]
                cosinesimilarity = pairwise_distances(np.array(usertopicvec).reshape(1,-1),np.array(pairtopicvec).reshape(1,-1), metric='cosine')[0][0]
                cosinesimilaritydic[key] = cosinesimilarity
                
        return cosinesimilaritydic
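
A rough end-to-end usage sketch for this class (it assumes the project files referenced in __init__ exist; the author name is illustrative):

recsys = recommendationsys_LDA(ngram=2)
recsys.loadandclean(n=100)           # load and lemmatize up to 100 authors
recsys.ldainit()                     # build the bigram sentences
print(recsys.trainlda(topics_n=10))  # topic count plus four coherence scores
recsys.explore_topic(0)
print(recsys.recomendation('some_author'))
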
Example #6
# train and save the model; trigram_bow_corpus is an assumed corpus name, mirroring Example #2
lda = LdaMulticore(trigram_bow_corpus,
                   num_topics=6,
                   id2word=trigram_dictionary,
                   workers=3)

lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

def explore_topic(topic_number, topn=6):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
    
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print (f"{term:20} : {frequency:.3f}")

explore_topic(topic_number=3)

print("done")





# =========from earlier =========
# dictionary = corpora.Dictionary(text_data)
# corpus = [dictionary.doc2bow(text) for text in text_data]
# import pickle
# pickle.dump(corpus, open('corpus.pkl', 'wb'))
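
For context, a minimal sketch of the earlier preprocessing this snippet presumes (the tokenized text_data and the trigram_* names are assumptions based on the comments above and on Example #2):

from gensim.corpora import Dictionary

text_data = [['topic', 'models', 'find', 'themes'],
             ['each', 'document', 'mixes', 'topics']]

trigram_dictionary = Dictionary(text_data)
trigram_bow_corpus = [trigram_dictionary.doc2bow(text) for text in text_data]
lda_model_filepath = 'lda_model'
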
Example #7
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics import pairwise_distances
# UserProfile is a project-specific class defined elsewhere


class LdaRecsys:
    def __init__(self):
        self.user_corp = {}
        self.corp_dict = {}
        self.corp_bow = []
        self.user_profile = UserProfile()
        self.user_profile.getUserList()
        self.user_list = self.user_profile.user_list
        self.cosine_distance_dict = {}
        self.lda_vect_dict = {}

    def loadCorpDict(self, user_list=[]):
        return

    def saveCorpDict(self):
        return

    def buildCorpDict(self, user_list=[], no_below_doc=10, no_above_doc=0.5):

        if not user_list:
            if not self.user_list:
                print('user list is empty! will run getUserList')
                self.user_profile.getUserList()
                self.user_list = self.user_profile.user_list
                print('got the user list')

            user_list = self.user_list

        if not user_list:
            print('user list is still empty!')
            return []

        self.user_corp = self.user_profile.getCorpProfile(user_list)

        self.corp_dict = Dictionary(
            [corp for corp in list(self.user_corp.values())])

        # Keep tokens which are contained in at least no_below documents
        # Keep tokens which are contained in no more than no_above documents
        # (fraction of total corpus size, not an absolute number)
        self.corp_dict.filter_extremes(no_below=no_below_doc,
                                       no_above=no_above_doc)
        self.corp_dict.compactify()

    def buildCorpBow(self):
        if not self.user_corp or not self.corp_dict:
            self.buildCorpDict()

        self.corp_bow = {}
        for user, corp in self.user_corp.items():
            self.corp_bow[user] = self.corp_dict.doc2bow(corp)

    def saveCorpBow(self):
        return

    def loadCorpBow(self):
        return

    def trainLDA(self, topics_num, iter_num=50):
        # reset the cosine_distance_dict in every training
        self.cosine_distance_dict = {}

        self.topics_num = topics_num
        corp_bow = [bow for bow in list(self.corp_bow.values())]
        self.lda = LdaMulticore(corp_bow,
                                num_topics=topics_num,
                                id2word=self.corp_dict,
                                iterations=iter_num,
                                workers=4)

    def runLDA(self, user_name):
        if user_name in self.corp_bow:
            user_bow = self.corp_bow[user_name]
        else:
            print('no such user! Please check the screen name')
            return

        user_lda = self.lda[user_bow]

        return user_lda

    def buildLdaVect(self):

        for user, bow in self.corp_bow.items():
            vect = np.zeros(self.topics_num)
            user_lda = self.lda[bow]

            for i in user_lda:
                vect[i[0]] = i[1]

            self.lda_vect_dict[user] = vect

    def ldaCosineDistance(self, user_name):

        if not self.lda_vect_dict:
            self.buildLdaVect()

        if user_name not in self.lda_vect_dict:
            print('no such user')
            return

        cosine_distance_dict = {}
        user_vect = self.lda_vect_dict[user_name]

        for user, lda_vect in self.lda_vect_dict.items():
            cosine_distance = pairwise_distances(
                np.array(user_vect).reshape(1, -1),
                np.array(lda_vect).reshape(1, -1),
                metric='cosine')[0][0]
            cosine_distance_dict[user] = cosine_distance

        self.cosine_distance_dict[user_name] = cosine_distance_dict

    def makeRecommendation(self, user_name, topn_recommendation=10):

        if user_name not in self.cosine_distance_dict:
            self.ldaCosineDistance(user_name)

        user_recommendations = self.cosine_distance_dict[user_name]

        n = 0
        for recommendation, cosine_distance in sorted(
                user_recommendations.items(), key=lambda x: x[1]):
            print((recommendation, cosine_distance))
            n += 1
            if n == topn_recommendation:
                break

        return user_recommendations

    def showTopic(self, topic_number, topn_word=5):
        """
        topic_number:
            which topic to show
        topn_word:
            show top n words in this topic
        """

        print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

        for term, frequency in self.lda.show_topic(topic_number, topn_word):
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

    def showUserTopic(self, user_name, topn_word=10):
        if user_name not in self.corp_bow:
            print('no such user! please check the screen name')
            return

        user_bow = self.corp_bow[user_name]

        user_lda = self.lda[user_bow]

        user_lda = sorted(user_lda, key=lambda x: -x[1])

        for topic_number, freq in user_lda:

            print('topic number {}  {}'.format(topic_number, freq))
            print('|____')
            self.showTopic(topic_number, topn_word)
            print('\n')
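
A hypothetical usage sketch, assuming UserProfile exposes getUserList() and getCorpProfile() as referenced above (the screen name is illustrative):

recsys = LdaRecsys()
recsys.buildCorpDict()            # shared dictionary over every user's corpus
recsys.buildCorpBow()             # one bag-of-words vector per user
recsys.trainLDA(topics_num=10)    # train the multicore LDA model
recsys.showTopic(0)
recsys.makeRecommendation('some_screen_name', topn_recommendation=5)
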
Example #8
# generate LDA model
from gensim.models.ldamulticore import LdaMulticore
from wordcloud import WordCloud
import matplotlib.pyplot as plt

my_num_topics = 30
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
ldamodel = LdaMulticore(corpus, num_topics=my_num_topics, id2word=dictionary, workers=3, alpha=1e-5, eta=5e-1)

print(ldamodel.print_topics(num_topics=my_num_topics, num_words=5))
print(corpus[0])
print(corpus[1])
print(corpus[2])
print(ldamodel[corpus[0]])
print(ldamodel[corpus[1]])
print(ldamodel[corpus[2]])

# print(ldamodel.print_topics(20))

model_basename = '/home/osboxes/w/wlda/trymodel'
ldamodel.save(model_basename)

for t in range(ldamodel.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(dict(ldamodel.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
    # plt.pause(0.0001)
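
The corpus and dictionary used in Example #8 are assumed to come from an earlier preprocessing step; a minimal sketch of what that might look like (tokenized_docs is illustrative):

from gensim import corpora

tokenized_docs = [['solar', 'energy', 'storage'],
                  ['wind', 'turbine', 'maintenance']]
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]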





Example #9
# create a stream of sentences for the corpus dictionary
# (data is a pandas DataFrame prepared earlier; imports assumed by this example:)
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore

input_data = [sent.split(' ') for sent in data['text_clean_bigram'].tolist()]
# topic model with LDA
id2word = Dictionary(input_data)
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in input_data]
# number of topics
num_topics = 10
# Build LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         workers=2)

# topics don't work well as tweets all relate to disasters
lda_model.show_topic(topicid=0, topn=25)
lda_model.show_topic(topicid=1, topn=25)
lda_model.show_topic(topicid=2, topn=25)
lda_model.show_topic(topicid=3, topn=25)

# predict for a single string
string_input = data['text_clean_bigram'][1]
lda_topic_prob(string_input, input_data, lda_model)

# predict for all strings
# takes ~20 minutes to run
data['lda_topic_prob'] = data['text_clean_bigram'].apply(
    lambda x: lda_topic_prob(x, input_data, lda_model))
# transform the lda topic probabilities into a dataframe representation
# takes a while to run
lda_topic_df = topic_df(data=data,