Example #1
top_words_per_topic = []
for t in range(lda_model.num_topics):
    top_words_per_topic.extend([(t, ) + x
                                for x in lda_model.show_topic(t, topn=5)])
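# Possible follow-up (pandas assumed available): tabulate the
# (topic_id, word, probability) triples collected above for easier inspection.
import pandas as pd
top_words_df = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
print(top_words_df.head())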

######
import pyLDAvis.gensim
# print(pyLDAvis.gensim.prepare(lda_model, train_corpus, train_dictionary))
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, train_corpus,
                                          train_dictionary)
# pyLDAvis.show(LDAvis_prepared)

# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=train_doc_list,
                                     dictionary=train_dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=train_doc_list,
                                     dictionary=train_dictionary,
                                     coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nUMass Coherence Score: ', coherence_lda)
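
# The two blocks above differ only in the `coherence` argument; an equivalent,
# more compact form loops over both measure names:
for measure in ('c_v', 'u_mass'):
    cm = CoherenceModel(model=lda_model, texts=train_doc_list,
                        dictionary=train_dictionary, coherence=measure)
    print(measure, 'coherence:', cm.get_coherence())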


Example #2
import pickle
from multiprocessing import freeze_support

import gensim
import pandas as pd
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel

with open('./pickle/dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)

processed_docs = pd.read_pickle("./pickle/processed_docs.pkl")

lda_model_mallet = gensim.models.wrappers.LdaMallet.load(
    './pickle/lda_model_mallet')

# Compute Perplexity
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
if __name__ == '__main__':
    freeze_support()
    coherence_model_lda_mallet = CoherenceModel(model=lda_model_mallet,
                                                texts=processed_docs,
                                                dictionary=dictionary,
                                                coherence='c_v')
    coherence_lda_mallet = coherence_model_lda_mallet.get_coherence()
    print('\nCoherence Score: ', coherence_lda_mallet)
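    # Optional sketch: convert the Mallet wrapper to a regular gensim LdaModel
    # (e.g. for pyLDAvis) with the helper bundled in the same wrapper module.
    lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
        lda_model_mallet)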
Example #3
def main():
    # adjust the path below to wherever you have the transcripts2018 folder
    document_list, file_name = load_data_from_dir(
        "ted-transcripts/transcripts/")
    print(len(document_list))
    # I've added extra stopwords here in addition to NLTK's stopword list - you could look at adding others.
    doc_clean = preprocess_data(document_list, {'laughter', 'applause'})

    dictionary, doc_term_matrix = prepare_corpus(doc_clean)

    number_of_topics = 0  # placeholder; the search below sets this to the best topic count
    # words=20 #adjust this to alter the number of words output for the topic below

    # runs LDA using Mallet from gensim using the number_of_topics specified above - this might take a couple of minutes
    # you can create additional variables eg ldamallet to store models with different numbers of topics
    # ldamallet = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
    # gensimmodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
    # coherencemodel = CoherenceModel(model=gensimmodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    # coherence_lda = coherencemodel.get_coherence()
    # print('\nCoherence Score: ', coherence_lda)

    min_k = 5
    max_k = 15
    intervals = 5
    coherences = {}
    coherence_lda = {}
    max_coherence = 0
    for i in range(min_k, max_k, intervals):
        ldamallet = LdaMallet(mallet_path,
                              corpus=doc_term_matrix,
                              num_topics=i,
                              id2word=dictionary)
        gensimmodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
            ldamallet)
        coherences[i] = CoherenceModel(model=gensimmodel,
                                       texts=doc_clean,
                                       dictionary=dictionary,
                                       coherence='c_v')
        coherence_lda[i] = coherences[i].get_coherence()
        #identify best coherence score and save the model
        if coherence_lda[i] > max_coherence:
            max_coherence = coherence_lda[i]
            ldamalletbest = ldamallet
            gensimmodelbest = gensimmodel
            coherencebest = coherences[i]
            number_of_topics = i

    for k in coherence_lda:
        print('\nCoherence Score for topic count ', k, ':', coherence_lda[k])

    print('best coherence:', max_coherence)

    ldamalletbest.show_topics(num_topics=number_of_topics, num_words=20)
    ldamalletbest.print_topics()
    # convert the coherence scores to a pandas dataframe
    df = pd.DataFrame.from_dict(coherence_lda,
                                orient='index',
                                columns=['Coherence'])
    df['Topics'] = df.index

    # plot the result
    df.plot(kind='line', x='Topics', y='Coherence')
    plt.show()

    text_name = '2018-03-03-kriti_sharma_how_to_keep_human_biases_out_of_ai.txt'  # name of file needs to be checked
    #text_name = '2012-09-14-timothy_bartik_the_economic_case_for_preschool.txt'  # name of file needs to be checked

    doc_id = file_name.index(text_name)  # index of document to explore
    print(file_name[doc_id])

    document_topics = gensimmodelbest.get_document_topics(
        doc_term_matrix[doc_id])  # substitute other models here
    document_topics = sorted(document_topics, key=lambda x: x[1],
                             reverse=True)  # sorts document topics

    model_doc_topics = gensimmodelbest.get_document_topics(
        doc_term_matrix)  # substitute other models here
    lda_index = similarities.MatrixSimilarity(model_doc_topics.corpus)

    # query for our doc_id from above
    similarity_index = lda_index[doc_term_matrix[doc_id]]
    # Sort the similarity index
    similarity_index = sorted(enumerate(similarity_index),
                              key=lambda item: -item[1])

    for i in range(1, 6):
        document_id, similarity_score = similarity_index[i]
        print('Document Index: ', document_id)
        print('Document Name: ', file_name[document_id])
        print('Similarity Score', similarity_score)
        print(re.sub(r'\s+', ' ', document_list[document_id][:500]),
              '...')  # preview first 500 characters
        print()
Example #4
def job_comp_sims():
    texts = read_comp_doc()
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts.values())

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts.values()]

    # generate LDA model
    ldamodel = models.ldamodel.LdaModel(corpus,
                                        num_topics=40,
                                        id2word=dictionary,
                                        passes=50)
    #print(ldamodel.print_topics(num_topics=5))
    cm = CoherenceModel(model=ldamodel,
                        texts=texts.values(),
                        dictionary=dictionary,
                        coherence='c_v')
    print(cm.get_coherence())

    res_path = os.getcwd() + '/../results/lda/jobcomp/'
    if not os.path.exists(res_path):
        os.makedirs(res_path)
    res_file_path = res_path + "jobcomp.csv"
    res_file = Path(res_file_path)
    if res_file.is_file():
        os.remove(res_file_path)
    job_count = 0
    ### compute similarities for a few jobs from old jobs
    for filename in glob.glob(JOB_PATH + '*.json'):
        if job_count < 5:
            job_text = tokenize_clean(filename, 'json')
            job_bow = dictionary.doc2bow(job_text)
            job_lda = ldamodel[job_bow]
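            # Note: this similarity index depends only on `ldamodel` and `corpus`,
            # so it could be built once before both loops instead of once per job.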
            index = similarities.MatrixSimilarity(ldamodel[corpus])
            sims = index[job_lda]
            comps = list(texts.keys())  # list() so the keys can be indexed by position below
            comp_count = 0
            with open(res_file_path, mode="a") as text_file:
                for doc, sim in enumerate(sims):
                    text_file.write(
                        str(job_count) + "," + str(comp_count) + "," +
                        comps[doc] + "," + str(sim) + "\n")
                    comp_count += 1
            job_count += 1
    ### compute similarities for new jobs
    for filename in glob.glob(NEWJOB_PATH + '*.json'):
        job_text = tokenize_clean(filename, 'json')
        job_bow = dictionary.doc2bow(job_text)
        job_lda = ldamodel[job_bow]
        index = similarities.MatrixSimilarity(ldamodel[corpus])
        sims = index[job_lda]
        comps = list(texts.keys())  # list() so the keys can be indexed by position below
        comp_count = 0
        with open(res_file_path, mode="a") as text_file:
            for doc, sim in enumerate(sims):
                text_file.write(
                    str(job_count) + "," + str(comp_count) + "," + comps[doc] +
                    "," + str(sim) + "\n")
                comp_count += 1
        job_count += 1
Example #5
def find_best_model_cv(n_topic_range,
                       texts,
                       id2word,
                       corpus,
                       threshold=None,
                       random_state=42,
                       plot=True,
                       verbose=False):
    """
    Searches for the best model in a given range by C_v coherence value

    Parameters:
        - `n_topic_range`
            a range of values for the `num_topics` parameter of a gensim LDA model to try
        - `texts`
            a list of documents broken into words
        - `id2word`
            a dictionary containing word encodings
        - `corpus`
            the result of mapping each word in `texts` to its value in `id2word`
        - `threshold`
            a float specifying a coherence value that, if reached, causes the function to return early
        - `random_state`
            a random state for use in a gensim LDA model
        - `plot`
            a boolean specifying whether or not to plot coherence values against each `num_topics` value
        - `verbose`
            a boolean specifying whether or not to print updates
    
    Returns: a tuple containing the best model, the list of all models attempted, and a list of all coherence values obtained, respectively.
    """
    models = []
    coherence_vals = []

    for n_topics in n_topic_range:

        # Print percentage progress
        if verbose:
            diff = max(n_topic_range) - n_topic_range.start
            print(
                str(round(100 * (n_topics - n_topic_range.start) / diff, 1)) +
                "% done")

        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=n_topics,
                             random_state=random_state,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)
        co_model = CoherenceModel(lda_model,
                                  texts=texts,
                                  dictionary=id2word,
                                  coherence="c_v")
        coherence = co_model.get_coherence()

        models.append(lda_model)
        coherence_vals.append(coherence)

        if threshold is not None and coherence > threshold:
            if verbose:
                print('Returning early with a coherence value of ' +
                      str(coherence))

            if plot:
                actual_range = range(n_topic_range.start,
                                     n_topics + n_topic_range.step,
                                     n_topic_range.step)
                plt.plot(actual_range, coherence_vals, 'b')
                plt.show()

            return lda_model, models, coherence_vals

    if plot:
        # The portion of the range that was actually iterated through
        plt.plot(n_topic_range, coherence_vals, 'b')
        plt.show()

    return models[np.argmax(coherence_vals)], models, coherence_vals
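
# A minimal usage sketch (hypothetical inputs; gensim's bundled test corpus
# keeps the call self-contained):
from gensim.test.utils import common_corpus, common_dictionary, common_texts

best_model, all_models, all_coherences = find_best_model_cv(
    n_topic_range=range(2, 6),
    texts=common_texts,
    id2word=common_dictionary,
    corpus=common_corpus,
    plot=False)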
Example #6
#%% Topic Modeling
from sklearn.decomposition import TruncatedSVD
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import sent_tokenize, word_tokenize
# SVD represent documents and terms in vectors
svd_model = TruncatedSVD(n_components=20,
                         algorithm='randomized',
                         n_iter=100,
                         random_state=122)

svd_model.fit(X)

len(svd_model.components_)

# CoherenceModel cannot score an sklearn TruncatedSVD model directly: it
# expects a gensim topic model, or explicit topics given as lists of top
# words. A sketch of the latter, assuming `vectorizer` (hypothetical name)
# is the fitted vectorizer that produced X:
from gensim import corpora

terms = vectorizer.get_feature_names_out()
topics = [[terms[i] for i in comp.argsort()[:-11:-1]]  # top 10 terms per component
          for comp in svd_model.components_]

texts = [word_tokenize(doc) for doc in data['processed_data'].values]
dictionary = corpora.Dictionary(texts)

coherencemodel = CoherenceModel(topics=topics, texts=texts,
                                dictionary=dictionary, coherence='c_v')
print(coherencemodel.get_coherence())

#%%
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

m1 = LdaModel(common_corpus, num_topics=3, id2word=common_dictionary)
m2 = LdaModel(common_corpus, num_topics=5, id2word=common_dictionary)

cm = CoherenceModel.for_models([m1, m2],
                               common_dictionary,
                               corpus=common_corpus,
                               coherence='u_mass')
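
# Possible follow-up: compare_models scores every model in one pass and
# returns, per model, the per-topic coherences and their mean.
print(cm.compare_models([m1, m2]))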
Example #7
    # read topics for evaluating
    Weeks = "W10 W10W11 W11W12 W12W13 W13W14 W14W15 W15W16 W16W17 W17W18 W18W19 W19W20 W20W21 W21W22 W22W23 W23W24 W24W25 W25W26".split(
        " ")
    topics = []
    for week in Weeks:
        summary_file = f"{model_path}{week}{summary_suffix}"
        df = pd.read_csv(summary_file)
        list_relevant_words = df.relevant_words.to_list()
        for each_relevant_words in list_relevant_words:
            topics.append(each_relevant_words.split(" "))

    # test block
    # coherences = []
    # for week in Weeks:
    #     coherences += [week[-2:]]*num_topics

    r_corpus = MmCorpus(output_fname)
    with Timer():
        cm = CoherenceModel(topics=topics, corpus=r_corpus,
                            dictionary=dct, coherence='u_mass')
        coherences = cm.get_coherence_per_topic()

    # write results
    with open("./coherence-evals.txt", "w") as wf:
        for i, week in enumerate(Weeks):
            wf.write(f"{week}\n")
            cs = coherences[i * num_topics:(i + 1) * num_topics]
            for c in cs:
                wf.write(f"{c}\n")
Example #8
    # this is where you will start processing the data and apply the LDA technique

    documents = data_df['hsp_account_id'].tolist()
    dictionary = gensim.corpora.Dictionary(documents)
    dictionary.filter_extremes(no_below=190, no_above=0.7)
    dictionary.save("saveCorporaDicV4_20_250")
    corpus = [dictionary.doc2bow(text) for text in documents]

    # this is where you will apply the actual model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20,
                                               id2word=dictionary, passes=250)

    ldamodel.save('OpioidLdamodelV4_20_250.lda')
    lda = gensim.models.ldamodel.LdaModel.load('SUD_20_250_70.lda')

    cm = CoherenceModel(model=lda, texts=documents, corpus=corpus,
                        coherence='c_v')
    coherence = cm.get_coherence()

    ldaP = lda.print_topics(num_topics=20, num_words=25)

    print(coherence)

Example #9
######################### Prepared ####################
#convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

#to have logs
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


################# LDA #####################
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, update_every=1, chunksize=100, passes=1)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary,
                                           minimum_probability=0.01)
ldamodel.print_topics(num_topics=2, num_words=20)


cm = CoherenceModel(model=ldamodel, corpus=corpus, dictionary=dictionary,
                    coherence='u_mass')  # note that a dictionary has to be provided
print(cm.get_coherence())

#print("...........................get_document_topics.....................")

#print(ldamodel.get_document_topics(corpus[100], minimum_probability=None))

# ################  NMF #####################

# model = NMF(n_components=2, init='random', random_state=0)
# model.fit(dictionary)
# NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200,
#   n_components=2, nls_max_iter=2000, random_state=0, shuffle=False,
#   solver='cd', sparseness=None, tol=0.0001, verbose=0) 

Example #10
for tokens in tokenized_docs:  # `tokenized_docs` is a hypothetical name for the iterable of token lists
    corpus.append(dictionary.doc2bow(tokens))
print("Corpus and dictionary created!")

for num_topic in num_topics:
    print("Number of topics: ", num_topic)
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=num_topic,
                        id2word=dictionary,
                        iterations=20)
    topics = ldamodel.print_topics(num_words=10)

    num_topic_to_topics[num_topic] = topics
    num_topic_to_models[num_topic] = ldamodel

    coherence_model_lda = CoherenceModel(model=ldamodel,
                                         corpus=corpus,
                                         dictionary=dictionary,
                                         coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score (U_Mass): ', coherence_lda)

with open("num_topic_to_models.dict", "wb") as f:
    pickle.dump(num_topic_to_models, f)
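# Possible follow-up: reload the saved models later from the same pickle file.
with open("num_topic_to_models.dict", "rb") as f:
    num_topic_to_models = pickle.load(f)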
"""
Number of topics:  20
Coherence Score (U_Mass):  -0.42236587843071566

Number of topics:  50
Coherence Score (U_Mass):  -0.5597070191943092

Number of topics:  100
Coherence Score (U_Mass):  -0.6293096290358676
"""
Example #11
	def coherence(self, corpus):
		coherence_model = CoherenceModel(model=self.lda, texts=corpus.tokens, 
			dictionary=corpus.dictionary, coherence='c_uci')
		return coherence_model.get_coherence()
Example #12
print()

print("[ %s ]Showing Topics:" % (time.asctime(time.localtime(time.time()))))
for topic in good_lda.show_topics():
    print(topic)
print()

for topic in bad_lda.show_topics():
    print(topic)
print()

#pyLDAvis.enable_notebook()
good_vis = pyLDAvis.gensim.prepare(good_lda, corpus, dictionary)
pyLDAvis.save_html(good_vis, './html/good_lda_' + str(args.topics) + '.html')
good_cm = CoherenceModel(model=good_lda,
                         corpus=corpus,
                         dictionary=dictionary,
                         coherence='u_mass')
print("good_lda u_mass : ", good_cm.get_coherence())
good_cm = CoherenceModel(model=good_lda,
                         texts=texts,
                         dictionary=dictionary,
                         coherence='c_v')
print("good_lda c_v : ", good_cm.get_coherence())

bad_vis = pyLDAvis.gensim.prepare(bad_lda, corpus, dictionary)
pyLDAvis.save_html(bad_vis, './html/bad_lda_' + str(args.topics) + '.html')
bad_cm = CoherenceModel(model=bad_lda,
                        corpus=corpus,
                        dictionary=dictionary,
                        coherence='u_mass')
print("bad_lda u_mass : ", bad_cm.get_coherence())
Example #13
    def train_model(self, topic_docs, num_topics, model_name, blnSaveinDB=False,
                    blnSaveTrainedModelFiles=False, txtFileName=None, model_type='both',
                    lda_num_of_iterations=150, delete_stop_words=True,
                    lemmatize_words=True, delete_numbers=True):
        
        #starttime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #print("Executing train_model... Started at: " + starttime )        

        doc_clean = [self.clean_docs(doc, delete_numbers, delete_stop_words, lemmatize_words).split() for doc in topic_docs]

        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        self.dictionary = corpora.Dictionary(doc_clean)

        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        self.doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in doc_clean]

        # Creating the object for LDA model using gensim library
        Lda = gensim.models.ldamodel.LdaModel

        file_data = []

        if model_type in ('lda', 'both'):
            # Build the LDA model
            self.lda_model = Lda(corpus=self.doc_term_matrix, num_topics=num_topics,
                                 id2word=self.dictionary, iterations=lda_num_of_iterations)
            # get LDA coherence
            self.lda_coh_u_mass = CoherenceModel(model=self.lda_model, corpus=self.doc_term_matrix,
                                                 dictionary=self.dictionary, coherence='u_mass')
            self.lda_coh_c_v = CoherenceModel(model=self.lda_model, texts=doc_clean,
                                              dictionary=self.dictionary, coherence='c_v')

            # create one json record per topic with the lda results
            # (coherence is computed once and reused for every topic)
            coh_u_mass = str(self.lda_coh_u_mass.get_coherence())
            coh_c_v = str(self.lda_coh_c_v.get_coherence())
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for idx in range(num_topics):
                file_data.append({
                    "model_name": model_name,
                    "model_type": "lda",
                    "timestamp": timestamp,
                    "no_tweets": str(len(topic_docs)),
                    "coh_u_mass": coh_u_mass,
                    "coh_c_v": coh_c_v,
                    "topic_no": str(idx + 1),
                    "topic": str(self.lda_model.print_topic(idx, num_topics)).replace('"', "-"),
                })

        if model_type in ('lsi', 'both'):
            # Build the LSI model
            self.lsi_model = gensim.models.LsiModel(corpus=self.doc_term_matrix,
                                                    num_topics=num_topics, id2word=self.dictionary)
            # get LSI coherence
            self.lsi_coh_u_mass = CoherenceModel(model=self.lsi_model, corpus=self.doc_term_matrix,
                                                 dictionary=self.dictionary, coherence='u_mass')
            self.lsi_coh_c_v = CoherenceModel(model=self.lsi_model, texts=doc_clean,
                                              dictionary=self.dictionary, coherence='c_v')

            # create one json record per topic with the lsi results
            coh_u_mass = str(self.lsi_coh_u_mass.get_coherence())
            coh_c_v = str(self.lsi_coh_c_v.get_coherence())
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for idx in range(num_topics):
                file_data.append({
                    "model_name": model_name,
                    "model_type": "lsi",
                    "timestamp": timestamp,
                    "no_tweets": str(len(topic_docs)),
                    "coh_u_mass": coh_u_mass,
                    "coh_c_v": coh_c_v,
                    "topic_no": str(idx + 1),
                    "topic": str(self.lsi_model.print_topic(idx, num_topics)).replace('"', "-"),
                })


        # Save to the mongoDB collection if requested
        if blnSaveinDB:
            if self.db is not None:
                self.c_topics.insert_many(file_data)
            else:
                print("Can't save topics in db. No mongoDB connection was set up.")

        # Save results in a text file
        if txtFileName is not None:
            with open(txtFileName, 'w', encoding="utf-8") as outfile:
                json.dump(file_data, outfile)

        # Save models into files
        if blnSaveTrainedModelFiles:
            # create the path if it does not exist
            if not os.path.exists(self.folder_path + "/trained_models/"):
                os.makedirs(self.folder_path + "/trained_models/")

            self.lda_model.save(self.folder_path + "/trained_models/" + model_name + "_lda_model.model")
            self.dictionary.save(self.folder_path + "/trained_models/" + model_name + "_dictionary.dict")
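
        # A hypothetical call (instance and inputs assumed, not from the source):
        # analyzer.train_model(topic_docs=tweets, num_topics=10, model_name="demo",
        #                      model_type='lda', txtFileName="topics.json")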