Example 1
def run_lda(corpus,
            dictionary,
            texts,
            num_topics=10,
            passes=20,
            iterations=100):
    eval_frame = pd.DataFrame(columns=[
        'Num_Topics', 'Log_Perplexity_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(u_mass)_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(c_uci)_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(c_v)_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(c_npmi)_P_{0}_I_{1}'.format(passes, iterations)
    ])
    logging.debug('******* RUNNING LDA *************')
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             passes=passes,
                             iterations=iterations,
                             chunksize=2500)
    coh_model_umass = CoherenceModel(model=lda_model,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='u_mass')
    coh_model_uci = CoherenceModel(model=lda_model,
                                   texts=texts,
                                   coherence='c_uci')
    coh_model_ucv = CoherenceModel(model=lda_model,
                                   texts=texts,
                                   coherence='c_v')
    coh_model_npmi = CoherenceModel(model=lda_model,
                                    texts=texts,
                                    coherence='c_npmi')
    eval_frame.loc[len(eval_frame)] = [
        num_topics,
        lda_model.log_perplexity(corpus),
        coh_model_umass.get_coherence(),
        coh_model_uci.get_coherence(),
        coh_model_ucv.get_coherence(),
        coh_model_npmi.get_coherence()
    ]
    model = namedtuple('model', ['lda_model', 'eval_frame'])
    return model(lda_model, eval_frame)
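
# Usage sketch for run_lda above (illustrative, not from the original source).
# It assumes the snippet's own imports (pandas, gensim's LdaMulticore and
# CoherenceModel, namedtuple, logging) are available; `docs` is a toy list of
# tokenized documents used only to show the call shape.
from gensim.corpora import Dictionary

docs = [["topic", "model", "evaluation", "perplexity"],
        ["coherence", "score", "topic", "model"]]
dictionary = Dictionary(docs)                        # token <-> id mapping
corpus = [dictionary.doc2bow(doc) for doc in docs]   # bag-of-words corpus

result = run_lda(corpus, dictionary, docs, num_topics=2, passes=5, iterations=50)
print(result.eval_frame)
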
Example 2
def learn_lda(corpus=None, dictionary=None, num_topics=NUM_TOPICS,
              passes=PASSES, iterations=ITERATION):
    print("\nLDA Training...\n")

    ldamodel = LdaMulticore(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            passes=passes,
                            workers=WORKERS,
                            iterations=iterations)
    print("\nLDA Training Done!\n")
    print("\nCoherence | Perplexity computing...\n")

    cm = CoherenceModel(model=ldamodel, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    perplexity = ldamodel.log_perplexity(corpus)

    return ldamodel, coherence, perplexity
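
# The constants referenced by learn_lda (NUM_TOPICS, PASSES, ITERATION, WORKERS)
# are assumed to be module-level settings; the values below are illustrative only.
NUM_TOPICS = 20     # number of topics to fit
PASSES = 10         # full passes over the corpus
ITERATION = 400     # per-document inference iterations
WORKERS = 3         # worker processes used by LdaMulticore
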
Example 3
def make_ldamodels(pre_processed, max=6):

    perplex_coherence = []
    dictionary = corpora.Dictionary(pre_processed)
    corpus = [dictionary.doc2bow(text) for text in pre_processed]

    for num in range(5, max + 1):
        model = LdaMulticore(corpus,
                             num_topics=num,
                             id2word=dictionary,
                             passes=30,
                             random_state=1)
        coherence_model = CoherenceModel(model=model,
                                         texts=pre_processed,
                                         dictionary=dictionary,
                                         coherence='c_v')
        perplex_coherence.append((num, model.log_perplexity(corpus),
                                  coherence_model.get_coherence()))

    for val in perplex_coherence:
        print(val)

    return perplex_coherence
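
# Possible follow-up (illustrative, not from the original source): with the
# (num_topics, perplexity, coherence) tuples returned, pick the topic count with
# the highest c_v coherence. Assumes `pre_processed` holds the tokenized documents.
scores = make_ldamodels(pre_processed, max=10)
best_num, _, best_coherence = max(scores, key=lambda t: t[2])
print("best num_topics:", best_num, "c_v coherence:", best_coherence)
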
Example 4
def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters, workers, chunksize, logfilename, save=True):

	"""
	Args: 
	num_topics_list = list of number of topics, a model will be fitted for each
	save: indicates whether model should be saved
	Returns: topics_dict = a dictionary of topics lists, where the key is the number of topics
	"""
	topics_dict = {}
	logfile = open(logfilename, 'w')
	for num_topics in num_topics_list:
		
		print('training', num_topics)
		np.random.seed(NUM)

		start_time = time.time()
		model = LdaMulticore(corpus=train_corpus, id2word=id2word,
							 num_topics=num_topics, iterations=iters,
							 eval_every=None, workers=workers,
							 chunksize=chunksize)
		end_time = time.time()

		if save:
			fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
			model.save(fname)

		per_word_bound = model.log_perplexity(test_corpus)
		perplexity = np.exp2(-1.0 * per_word_bound)

		logfile.write('\n' + 'num_topics: ' + str(num_topics) + '\n')
		logfile.write('perplexity: ' + str(perplexity) + '\n')
		logfile.write('train_time: ' + str(end_time - start_time) + '\n' + 'Topics: \n')

		topics = model.show_topics(num_topics=num_topics, num_words=20)
		topics_dict[str(num_topics)] = topics
		for topic in topics:
			# show_topics returns (topic_id, topic_string) tuples; log them as text
			logfile.write('\n\t' + str(topic) + '\n')

	logfile.close()		
	return topics_dict
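
# Note on the conversion used above (gensim convention): log_perplexity returns a
# per-word likelihood bound, and perplexity = 2 ** (-bound). Toy illustration:
import numpy as np

example_bound = -8.0                  # an illustrative per-word bound
print(np.exp2(-1.0 * example_bound))  # 256.0, the corresponding perplexity
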
Example 5
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=topic_number,
    passes=10,
    chunksize=5000,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))
fname = folder_name + 'LDA' + str(topic_number) + 'topics'
model.save(fname)

#Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#perplexity
perplexity = model.log_perplexity(matutils.Sparse2Corpus(
    X, documents_columns=False),
                                  total_docs=None)

# batch LDA
model_eval = []
for k in range(2, 21):
    topic_number = k
    start_time = time.time()
    model = LdaMulticore(
        matutils.Sparse2Corpus(X, documents_columns=False),
        num_topics=topic_number,
        passes=10,
        chunksize=5000,
        id2word=dict([(i, s) for i, s in enumerate(vocab)]),
        workers=7,
    )
Example 6
#                                            num_topics=35, 
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            per_word_topics=True)


pprint(model.print_topics())
doc_lda = model[corpus]
doc_lda[4]
model.get_document_topics(corpus)[1]

# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_multicore, corpus, dictionary)
vis

mallet_path = '/home/ubuntu/Signal/mallet-2.0.8/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=35, id2word=id2word)

coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
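
# Possible follow-up (not from the original source): put the two c_v scores
# computed above side by side to compare the multicore and Mallet models.
print('LdaMulticore c_v coherence:', coherence_lda)
print('LdaMallet    c_v coherence:', coherence_ldamallet)
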
Example 7
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    other_texts.append(stemmed_tokens)

other_corpus = [dictionary.doc2bow(text) for text in other_texts]

# unseen_doc = other_corpus[2]

# vector = ldamodel[unseen_doc]

# print(vector)

# generate LDA model-------------------------------------------------------------------------

my_loop_num_topics = [2, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]

for i in my_loop_num_topics:
    my_num_topics = i
    print(my_num_topics)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
    myldamodel = LdaMulticore(corpus,
                              num_topics=my_num_topics,
                              id2word=dictionary,
                              workers=3,
                              alpha=1e-5,
                              eta=5e-1)
    print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5))
    print(myldamodel.log_perplexity(corpus))
    print(myldamodel.log_perplexity(other_corpus))
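
# Possible follow-up (illustrative): inspect the topic mixture of one held-out
# document with the last model trained in the loop above.
unseen_bow = other_corpus[0]
print(myldamodel.get_document_topics(unseen_bow, minimum_probability=0.05))
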
Example 8
    results = lda.print_topics()
    print("-------------")
    print("TOPICS (RAW RESULTS)...")
    print(results)

    parsed_topics = parse_topics(lda)
    print("-------------")
    print("TOPICS (PARSED RESULTS)...")
    pprint(parsed_topics)

    # h/t: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling
    topics = lda[bags_of_words]
    print(topics[0]) #> [(4, 0.3149784), (7, 0.47801575), (13, 0.20485382)]

    # a measure of how good the model is. lower the better.
    print("Perplexity:", lda.log_perplexity(bags_of_words)) #> -7.74115184561741

    cm = CoherenceModel(model=lda, texts=token_stream(NOVELS_DIRPATH), dictionary=dictionary, coherence="c_v")
    print("Coherence Score:", cm.get_coherence()) #> 0.3695864834032673

    #vis = pyLDAvis.gensim.prepare(lda, bags_of_words, dictionary)
    #vis

    exit()

    #
    # SPACY NAMED ENTITY APPROACH
    #

    nlp = spacy.load("en_core_web_md")
Example 9
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=topic_number,
    passes=10,
    chunksize=5000,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))
fname = folder_name+'LDA'+str(topic_number)+'topics'
model.save(fname)

#Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#perplexity
perplexity = model.log_perplexity(matutils.Sparse2Corpus(X,documents_columns=False), total_docs=None)



# batch LDA
model_eval = []
for k in range(2, 21):
    topic_number = k
    start_time = time.time()
    model = LdaMulticore(
        matutils.Sparse2Corpus(X, documents_columns=False),
        num_topics=topic_number,
        passes=10,
        chunksize=5000,
        id2word=dict([(i, s) for i, s in enumerate(vocab)]),
        workers=7,
    )
Example 10
def start(num_topics, kind):
    data = loader.load_data(kind)
    df = pd.DataFrame(data)
    cleaner.clean(df)

    nlps = {
        'it': spacy.load('it_core_news_lg'),
        'en': spacy.load('en_core_web_lg'),
        'fr': spacy.load('fr'),
        'de': spacy.load('de')
    }

    tokenizers = {
        'it': Tokenizer(nlps['it'].vocab),
        'en': Tokenizer(nlps['en'].vocab),
        'fr': Tokenizer(nlps['fr'].vocab),
        'de': Tokenizer(nlps['de'].vocab)
    }

    # Customize stop words by adding to the default list
    stop_words = []
    stop_words += nlps['it'].Defaults.stop_words
    stop_words += nlps['en'].Defaults.stop_words
    stop_words += nlps['fr'].Defaults.stop_words
    stop_words += nlps['de'].Defaults.stop_words
    stop_words += s.ALL_STOPWORDS
    stop_words = set(stop_words)

    # ALL_STOP_WORDS = spacy + gensim + wordcloud
    ALL_STOP_WORDS = stop_words.union(SW).union(stopwords)

    cleaner.remove_stopwords(df, tokenizers, ALL_STOP_WORDS)
    cleaner.lemmas(df, nlps)

    tok.tokenize_text(df)

    # Create an id2word dictionary
    id2word = Dictionary(df['lemma_tokens'])
    print(len(id2word))

    # Filtering Extremes
    id2word.filter_extremes(no_below=2, no_above=.99)
    print(len(id2word))

    # Creating a corpus object
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Instantiating a Base LDA model
    base_model = LdaMulticore(corpus=corpus,
                              num_topics=num_topics,
                              id2word=id2word,
                              workers=12,
                              passes=5)

    # Filtering for words
    words = [re.findall(r'"([^"]*)"', t[1]) for t in base_model.print_topics()]

    # Create Topics
    topics = [' '.join(t[0:10]) for t in words]

    # Getting the topics
    for idx, t in enumerate(topics):
        print(f"------ Topic {idx} ------")
        print(t, end="\n\n")

    # Compute Perplexity
    # a measure of how good the model is. lower the better
    base_perplexity = base_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=base_model,
                                     texts=df['lemma_tokens'],
                                     dictionary=id2word,
                                     coherence='c_v')
    coherence_lda_model_base = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)

    lda_display = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
    d = pyLDAvis.display(lda_display)

    today = date.today()
    directory_path = f"/home/marco/Scrivania/tirocinio-unicredit/lda-html/{kind}/{today}/"
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    with open(os.path.join(directory_path, f"{num_topics}.html"), 'w') as f:
        f.write(d.data)

    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])

    # Define Search Param
    search_params = {
        'n_components': [10, 15, 20, 25, 30],
        'learning_decay': [.5, .7, .9]
    }

    # Init the Model
    lda = LatentDirichletAllocation()

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(data_vectorized)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
Example 11
my_loop_num_topics = list(range(1, 51))  # set number of topics to loop
my_loop_num_topics.append(100)
print(my_loop_num_topics)

training_fit = []
test_fit = []
for i in my_loop_num_topics:
    my_num_topics = i
    print(my_num_topics)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
    myldamodel = LdaMulticore(training_set,
                              num_topics=my_num_topics,
                              id2word=dictionary,
                              workers=3,
                              alpha=1e-5,
                              eta=5e-1)
    print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5))
    train_perplexity = myldamodel.log_perplexity(training_set)
    test_perplexity = myldamodel.log_perplexity(test_set)
    print(train_perplexity)
    print(test_perplexity)
    training_fit.append(train_perplexity)
    test_fit.append(test_perplexity)

with open('training_fit.csv', 'w') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(training_fit)

with open('test_fit.csv', 'w') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(test_fit)
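
# Possible follow-up (illustrative; assumes matplotlib is available): plot the
# training and held-out bounds to see where the held-out curve stops improving.
import matplotlib.pyplot as plt

plt.plot(my_loop_num_topics, training_fit, marker='o', label='training')
plt.plot(my_loop_num_topics, test_fit, marker='o', label='test')
plt.xlabel('Number of topics')
plt.ylabel('Per-word log-perplexity bound')
plt.legend()
plt.show()
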
Example 12
        logging.basicConfig(
            level=logging.INFO)

        # Build LDA model with this number of topics
        lda_model = LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=topics,
            random_state=100,
            chunksize=200,
            passes=1000,
            #                                            iterations=5000,
            #                                            minimum_probability=0,
            per_word_topics=True)

        #Compute Perplexity
        perplexity[topics] = lda_model.log_perplexity(
            corpus)  # a measure of how good the model is. lower the better.

        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=data_lemmatized,
                                             dictionary=id2word,
                                             coherence='c_v')
        coherence[topics] = coherence_model_lda.get_coherence()

        #save results
        lda_model.save(
            f"trained_models/trained_lda_model_search_broad_{topics}")
        with open("data/perplexity.pkl", 'wb') as f:
            pkl.dump(perplexity, f)
        with open("data/coherence.pkl", 'wb') as f:
            pkl.dump(coherence, f)
Example 13
                beta=beta,
                iter=num_iterations)
            print(run_id)

            output_file = output_file_template.format(run_id=run_id)

            # Train and save
            print('Training...')
            model = LdaMulticore(corpus,
                                 alpha=alpha,
                                 eta=beta,
                                 passes=50,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 iterations=num_iterations)
            # model.save(output_file)
            print('Done training')

            # Print top 10 words in topics, if desired
            if print_topics:
                topics = model.show_topics(num_topics=4, formatted=False)
                for topic in topics:
                    for tup in topic[1]:
                        print(tup[0] + ": " + str(tup[1]))
                    print('\n')

            # Evaluate perplexity
            ll = model.log_perplexity(test_corpus)
            print "LL:   " + str(ll)
            print "Perp: " + str(np.exp2(-ll))