Esempio n. 1
0
def ldamodel(doc_clean,n_topics,n_words,description,tfidfmodel=False,unseen_docs=None):
    """Train an LDA model on ``doc_clean``, print its diagnostics and save the artefacts.

    Parameters:
    - `doc_clean`: iterable of documents; each one is passed through ``min_char``
      and then split on whitespace.
    - `n_topics`: number of topics to train, print and hand to ``print_result``.
    - `n_words`: number of terms printed per topic.
    - `description`: prefix of the three files written at the end
      (``<description>.pkl``, ``<description>dictionary.gensim`` and
      ``<description>_ldamodel.gensim``); also forwarded to ``print_result``.
    - `tfidfmodel`: when true, the bag-of-words corpus is re-weighted with a
      ``TfidfModel`` (``smartirs='ntc'``) before training.
    - `unseen_docs`: optional list of already tokenised documents; for each one
      the highest-scoring topic is printed, followed by the log perplexity of
      the whole unseen corpus.

    Returns: None. Results are printed and written to disk.
    """
    doc_clean = [min_char(doc).split() for doc in doc_clean]

    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    # The return value is not used here; the call is kept for whatever output
    # compute_coherence_values produces on its own.
    compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=doc_clean, start=2, limit=40, step=6)
    if tfidfmodel:
        tfidf = TfidfModel(corpus, id2word=dictionary, smartirs='ntc')
        corpus = tfidf[corpus]

    # Train with the requested number of topics. A fixed topic count would make
    # the loop below either stop short of, or run past, the topics that exist.
    lda = LdaModel(corpus, num_topics=n_topics, id2word=dictionary, random_state=1, passes=50, per_word_topics=True)
    print("#Tópicos LDA")
    for topic_id in range(n_topics):
        terms = lda.show_topic(topic_id, n_words)
        print("Topic #" + str(topic_id) + ": ", ", ".join([word + '*' + str(weight) for word, weight in terms]))
    print('Bound: ', lda.bound(corpus))
    # Compute Perplexity
    print('Perplexity: ', lda.log_perplexity(corpus))
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    if unseen_docs:
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for raw_doc, unseen_bow in zip(unseen_docs, corpus_new):
            topic = None
            score = 0
            # The model was built with per_word_topics=True, so indexing it
            # yields a tuple whose first element is the (topic id, score) list.
            inference_doc = lda[unseen_bow]
            print(raw_doc)
            for index, tmp_score in inference_doc[0]:
                if tmp_score > score:
                    score = tmp_score
                    topic = lda.print_topic(index, 5)
            print ("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", lda.log_perplexity(corpus_new))

    print_result(lda, doc_clean, corpus, n_topics, description)
    # Context manager so the pickle file is flushed and closed deterministically.
    with open(description+'.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save(description+'dictionary.gensim')
    lda.save(description+'_ldamodel.gensim')
Esempio n. 2
0
def lda_model_(documents, custom_stopwords_list, num_topics):
    """Fit an LDA model on ``documents`` and report its coherence and perplexity.

    Each document is cleaned with ``preprocess_(document, custom_stopwords_list)``
    and split on whitespace before the dictionary and bag-of-words corpus are
    built.

    :param documents: iterable of raw documents.
    :param custom_stopwords_list: forwarded unchanged to ``preprocess_``.
    :param num_topics: number of topics for the LDA model.
    :return: tuple ``(model, coherence, perplexity)`` where ``coherence`` is the
        ``c_v`` coherence and ``perplexity`` is the value returned by
        ``model.log_perplexity`` on the training corpus.
    """
    log.info('In LDA for Topic Modelling function.')
    cleaned_documents = [
        preprocess_(document, custom_stopwords_list).split()
        for document in documents
    ]
    document_dictionary = corpora.Dictionary(cleaned_documents)
    td_matrix = [
        document_dictionary.doc2bow(document) for document in cleaned_documents
    ]
    model = LdaModel(corpus=td_matrix,
                     num_topics=num_topics,
                     id2word=document_dictionary,
                     passes=100)
    coherence_model = CoherenceModel(model=model,
                                     texts=cleaned_documents,
                                     dictionary=document_dictionary,
                                     coherence='c_v')
    # Evaluate each metric exactly once: both are expensive, and re-evaluating
    # them could make the logged figures differ from the returned ones.
    coherence = coherence_model.get_coherence()
    perplexity = model.log_perplexity(td_matrix)
    log.debug('Coherence Score: {}\nPerplexity Score: {}'.format(
        coherence, perplexity))
    return model, coherence, perplexity
Esempio n. 3
0
def find_optimal_number_topics(corpus, id2word, min, max, step, texts):
    """Train one LDA model per topic count and collect coherence and log perplexity.

    Topic counts are taken from ``np.arange(min, max, step)``. Both series are
    drawn on the current matplotlib axes and returned as a pair of dicts keyed
    by topic count: ``(coherence_by_count, log_perplexity_by_count)``.
    """
    coherence_by_count = {}
    perplexity_by_count = {}
    for topic_count in np.arange(min, max, step):
        print(topic_count)
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=topic_count,
                         random_state=100,
                         update_every=5,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)
        # Value of log_perplexity on the training corpus
        log_perplexity = model.log_perplexity(corpus)
        # c_v coherence of the same model
        coherence = CoherenceModel(model=model,
                                   texts=texts,
                                   dictionary=id2word,
                                   coherence='c_v').get_coherence()
        coherence_by_count[topic_count] = coherence
        perplexity_by_count[topic_count] = log_perplexity
    for series in (coherence_by_count, perplexity_by_count):
        plt.plot(list(series), list(series.values()))
    return coherence_by_count, perplexity_by_count
Esempio n. 4
0
def compete_number_of_words(detoken_data, token_data, min_num, max_num, step, random_state=None):

  '''
  Sweep the CountVectorizer ``min_df`` setting and score an LDA model for each value.

  Parameters :
  -------------
  detoken_data : list of detokenized documents, fed to CountVectorizer
  token_data : tokenized documents, used to compute the coherence value
  min_num : first ``min_df`` value to try
  max_num : upper end of the ``min_df`` range (exclusive, as in ``np.arange``)
  step : increment between consecutive ``min_df`` values
  random_state : set for reproducibility, default = None

  Output :
  -------------
  coherence_value : DataFrame with one row per ``min_df`` value and the columns
                    'min_df', 'Perplexity Value' and 'Coherence Value'

  '''

  coherence_value = pd.DataFrame(columns=['min_df', 'Perplexity Value','Coherence Value'])

  for i, m in enumerate(np.arange(min_num, max_num, step)):
    print("{} 번째, min_df = {}".format(i+1, m))

    vectorizer = CountVectorizer(min_df=m) # build the vectorizer
    cv = vectorizer.fit_transform(detoken_data) # fit and transform

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
      feature_names = vectorizer.get_feature_names_out().tolist()
    else:
      feature_names = vectorizer.get_feature_names()

    dictionary = corpora.Dictionary([feature_names])

    # Sparse2Corpus is given the transposed matrix (terms x documents).
    corpus = Sparse2Corpus(cv.T)

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, random_state=random_state)

    coherence_lda = CoherenceModel(model=lda_model, texts=token_data, dictionary=dictionary, coherence='c_v')

    coherence_value.loc[i] = [m, lda_model.log_perplexity(corpus),coherence_lda.get_coherence()]

  return coherence_value
def perform_experiment(train_n_dw_matrix, test_n_dw_matrix, T, num_2_token):
    """Compare a gensim LDA model with the project's own optimizer for several seeds.

    For each seed a gensim ``LdaModel`` with ``T`` topics is trained on the
    training matrix, and a (phi, theta) pair is fitted with
    ``default.Optimizer``. Both results are passed to the same
    ``experiments.default_callback`` instance and the collected metrics are
    printed under the headings 'artm' and 'gensim'.

    :param train_n_dw_matrix: training document-word matrix; rows must expose
        ``indices`` and ``data`` (presumably a scipy CSR matrix, to be confirmed
        against the caller).
    :param test_n_dw_matrix: held-out matrix forwarded to the callback.
    :param T: number of topics.
    :param num_2_token: id-to-token mapping handed to gensim as ``id2word``.
    :return: None; everything is printed.
    """
    # Each document must be a real list: in Python 3 ``zip`` returns a one-shot
    # iterator, and this corpus is iterated many times (training, theta
    # extraction and perplexity, repeated for every seed).
    train_corpus = [list(zip(row.indices, row.data)) for row in train_n_dw_matrix]

    D, W = train_n_dw_matrix.shape

    for seed in [42, 7, 777, 12]:
        model = LdaModel(train_corpus,
                         alpha='auto',
                         id2word=num_2_token,
                         num_topics=T,
                         iterations=500,
                         random_state=seed)
        gensim_phi = exp_common.get_phi(model)
        gensim_theta = exp_common.get_theta(train_corpus, model)
        print('gensim perplexity')
        print(np.exp(-model.log_perplexity(train_corpus)))

        random_gen = np.random.RandomState(seed)
        # phi starts from seeded uniform noise, theta from an all-ones matrix.
        phi = common.get_prob_matrix_by_counters(
            random_gen.uniform(size=(T, W)).astype(np.float64))
        theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T)).astype(np.float64))
        phi, theta = default.Optimizer([regularizers.Additive(0.1, 0.)] * 100,
                                       verbose=False).run(
                                           train_n_dw_matrix, phi, theta)

        callback = experiments.default_callback(
            train_n_dw_matrix=train_n_dw_matrix,
            test_n_dw_matrix=test_n_dw_matrix,
            top_pmi_sizes=[5, 10, 20, 30],
            top_avg_jaccard_sizes=[10, 50, 100, 200],
            measure_time=True)
        callback.start_launch()
        # Slot 0 holds the optimizer's result, slot 1 the gensim result.
        callback(0, phi, theta)
        callback(1, gensim_phi, gensim_theta)

        print('artm')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[0]))

        print('gensim')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[1]))
Esempio n. 6
0
def build_model(raw_file, ret_file):
    """
    Train an LDA model on the tweets in ``raw_file`` and write one JSON line per
    tweet, extended with its topic scores, to ``ret_file``.

    :param raw_file: source passed to ``load_all_tweets``; the result is used as
        a mapping from tweet id to a dict with a ``'cleaned'`` text entry.
    :param ret_file: output path. It must contain ``tweets_lda_<k>``, where
        ``<k>`` is the number of topics (one or more digits).
    :return: None
    :raises ValueError: if ``ret_file`` does not contain ``tweets_lda_<k>``.
    """
    import re

    all_tweets = load_all_tweets(raw_file)
    # Read every digit after the marker: taking only the first character would
    # turn 'tweets_lda_10' into k=1.
    match = re.search(r'tweets_lda_(\d+)', ret_file)
    if match is None:
        raise ValueError(
            "ret_file must contain 'tweets_lda_<k>', got {!r}".format(ret_file))
    k = int(match.group(1))
    print('k={}'.format(k))
    idx2twetid = []
    common_texts = []
    for key, tweet in all_tweets.items():
        idx2twetid.append(key)
        common_texts.append([
            token for token in tweet['cleaned'].split(' ')
            if token not in punc_words
        ])

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    print('begin to train')
    lda_model = LdaModel(common_corpus,
                         id2word=common_dictionary,
                         num_topics=k,
                         random_state=13)
    pprint(lda_model.print_topics(num_words=20))
    print('\nPerplexity: ', lda_model.log_perplexity(common_corpus))

    score_key = 'lda' + str(k)
    with open(ret_file, 'w', encoding='utf-8') as fout:
        for tweetid, bow in zip(idx2twetid, common_corpus):
            # Plain str keys and float values keep the scores JSON-serialisable.
            lda_score = {
                str(topic_id): float(score) for topic_id, score in lda_model[bow]
            }
            all_tweets[tweetid][score_key] = lda_score
            fout.write(json.dumps(all_tweets[tweetid]))
            fout.write('\n')
Esempio n. 7
0
class CustomLda(object):
    """Wrapper bundling a gensim LDA model with its corpus, dictionary,
    training settings and most recent coherence score."""

    def __init__(self, data=None, dictionary=None):
        """Build the dictionary and bag-of-words corpus from ``data``.

        ``data`` should always be provided; it is only optional so that an
        instance can be restored with ``unpickle``. When ``dictionary`` is not
        given it is created from ``data``.
        """
        self.data = data
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
        else:
            # Without data nothing is kept, including a dictionary passed in.
            self.dictionary = None
            self.corpus = None
        # Training settings, filled in by train().
        self.distributed = None
        # (sic) misspelled attribute name kept so existing pickles still load.
        self.chuncksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.workers = None
        # Trained model and cached coherence result.
        self.model = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self,
              num_topics,
              iterations=1500,
              random_state=1,
              distributed=False,
              chunksize=2000,
              passes=1,
              update_every=1,
              alpha='symmetric',
              eta=None,
              decay=0.5,
              offset=1.0,
              eval_every=10,
              gamma_threshold=0.001,
              minimum_probability=0.01,
              ns_conf=None,
              minimum_phi_value=0.01,
              per_word_topics=False,
              workers=1):
        """Train the LDA model and store it in ``self.model``.

        With ``workers > 1`` an ``LdaMulticore`` model using that many workers
        is trained; otherwise a single-process ``LdaModel``. ``distributed``,
        ``update_every`` and ``ns_conf`` are only passed to ``LdaModel``.
        All settings are also stored on the instance.
        """

        self.distributed = distributed
        self.chuncksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers

        # A coherence score cached for a previous model is no longer valid.
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

        if self.workers > 1:
            self.model = LdaMulticore(
                # Honour the requested worker count instead of a fixed number.
                workers=self.workers,
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.random_state,
                chunksize=self.chuncksize,
                passes=self.passes,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.dictionary,
                                  iterations=self.iterations,
                                  num_topics=self.num_topics,
                                  random_state=self.random_state,
                                  distributed=self.distributed,
                                  chunksize=self.chuncksize,
                                  passes=self.passes,
                                  update_every=self.update_every,
                                  alpha=self.alpha,
                                  eta=self.eta,
                                  decay=self.decay,
                                  offset=self.offset,
                                  eval_every=self.eval_every,
                                  gamma_threshold=self.gamma_threshold,
                                  minimum_probability=self.minimum_probability,
                                  ns_conf=self.ns_conf,
                                  minimum_phi_value=self.minimum_phi_value,
                                  per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """Build a CoherenceModel on top of the trained model; type could be 'u_mass' or 'c_v'."""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        """Compute the coherence of the given type and cache it on the instance."""
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence = self.coherence_model.get_coherence()
        # Remember which type is cached so get_coherence() can reuse the value.
        self.coherence_type = coherence_type

    def get_coherence(self, coherence_type='u_mass'):
        """Return the coherence score, recomputing it only when the type differs from the cached one."""
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        """Return ``self.model.get_topic_terms(num, topn=topn)``."""
        return self.model.get_topic_terms(num, topn=topn)

    def get_preplexity(self):
        """Return ``self.model.log_perplexity`` evaluated on the training corpus.

        The misspelled name is kept for existing callers; ``get_perplexity`` is
        the correctly spelled equivalent.
        """
        return self.model.log_perplexity(self.corpus)

    def get_perplexity(self):
        """Correctly spelled alias of ``get_preplexity``."""
        return self.get_preplexity()

    def get_topics(self, num):
        """Return ``self.model.show_topics(num)``."""
        return self.model.show_topics(num)

    def _make_visualization(self):
        """prepare visualisation for display/saving"""
        return pyLDAvis.gensim.prepare(self.model,
                                       self.corpus,
                                       self.dictionary,
                                       sort_topics=False)

    def display(self):
        """display LDAvis in notebook"""
        visualisation = self._make_visualization()
        return pyLDAvis.display(visualisation)

    def save_ldavis(self, filename='topic.html'):
        """save LDAvis to .html"""
        ldavis = self._make_visualization()
        pyLDAvis.save_html(ldavis, filename)

    def save_lda(self, filename):
        """save lda model only"""
        self.model.save(filename)

    def pickle(self, filename):
        """save class instance to file"""
        # The context manager closes the file even if pickling raises.
        with open(filename, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def unpickle(filename):
        """read class instance from file"""
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """Predict topics for a list of documents (strings).

        Each document is split on whitespace. The result holds, per document, a
        list of (topic id, probability) pairs sorted by descending probability.
        """
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
Esempio n. 8
0
# In[18]:

# Per-topic coherence scores taken from coherence_model_1test
# (the coherence model for the test set, going by its name).
coherence_model_1_per_topic = coherence_model_1test.get_coherence_per_topic()

# uncomment to print coherence_model_1_per_topic
# print(coherence_model_1_per_topic)

# #### Model #1 - Evaluate - Perplexity
# log_perplexity() calculates the per-word likelihood bound, using the corpus
# it is given as the evaluation corpus, and returns that bound -- not the
# perplexity itself. The derived statistics, including
# perplexity = 2^(-bound), are written to the log at INFO level.

# In[19]:

# per-word likelihood bound of model_1 on the train set (1000 pats dataset)
perplexity_model_1train = model_1.log_perplexity(corpus_1000train)
print(perplexity_model_1train)

# In[20]:

# per-word likelihood bound of model_1 on the test set (1000 pats dataset)
perplexity_model_1test = model_1.log_perplexity(corpus_1000test)
print(perplexity_model_1test)

# ### Model #1 - Predict

# #### Model #1 - Predict - Pickle model

# In[21]:

# update path with location to save pickled model
Esempio n. 9
0
import sys
import numpy as np

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.models import word2vec

# Usage: <script> DATA_FILE ALPHA
#   DATA_FILE  text file read with word2vec.LineSentence
#   ALPHA      alpha value passed to every LDA model
data_file = sys.argv[1]
alpha = float(sys.argv[2])

# Sentences with fewer than two tokens are dropped.
sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]

dic = Dictionary(sentences)

corpus = [dic.doc2bow(s) for s in sentences]

# Train one model per topic count from 1 to 30 and report both metrics.
for i in range(1, 31):
  # Use the alpha given on the command line; it used to be parsed and then
  # ignored in favour of a fixed value.
  lda = LdaModel(corpus = corpus, id2word = dic, num_topics = i, alpha = alpha, random_state = 1)

  cm = CoherenceModel(model = lda, corpus = corpus, coherence = 'u_mass')
  coherence = cm.get_coherence()

  # log_perplexity returns the per-word bound; perplexity is 2^(-bound).
  perwordbound = lda.log_perplexity(corpus)
  perplexity = np.exp2(-perwordbound)

  print(f"num_topics = {i}, coherence = {coherence}, perplexity = {perplexity}")
Esempio n. 10
0
def LDA_pd(data=data_path,
           list_keys=keywords,
           num_topics=num_topics,
           iterations=iterations,
           alpha=alpha,
           eta=eta,
           embeddings=embeddings,
           top=topn,
           output_path=output,
           use_keywords=use_keywords):
    """Run (partial-data) LDA and write a report, a CSV of topics and optional scores.

    Every default is read from the module-level setting of the same purpose at
    definition time. The switches ``bigrams`` and ``coherence`` and the paths
    ``model_path`` and ``output_csv`` are not parameters; they are read from
    module globals.

    Parameters:
    - `data`: source handed to ``word_lists`` / ``word_lists_no_keywords``.
    - `list_keys`: keywords used to select data when ``use_keywords`` is true.
    - `num_topics`, `iterations`, `alpha`, `eta`: forwarded to ``LdaModel``.
    - `embeddings`: when true (and ``use_keywords`` is true), the keyword list
      is supplemented with the ``top`` most similar words of the model loaded
      from ``model_path``. The supplemented list is only written to the
      report; the data word list is built before it is created.
    - `top`: number of similar words requested per keyword.
    - `output_path`: the report is written to ``output_path + '.output'``.
    - `use_keywords`: if false, LDA is performed on all data.

    Returns: the trained ``LdaModel``.
    """
    # Size of the keyword list the data selection is based on, taken before any
    # embedding-based expansion.
    n_keywords = len(list_keys) if list_keys is not None else 0

    # The context manager guarantees the report is flushed and closed, also
    # when one of the steps below raises.
    with open(output_path + '.output', 'w') as report:
        report.write("Generating {} topics from {} initial keywords \n".format(
            num_topics, n_keywords))
        report.write(
            "LDA model parameters:\n(1) alpha {}\n(2) eta {}\n(3) running {} iterations. \n"
            .format(alpha, eta, iterations))

        if use_keywords:  # if false, LDA is performed on all data (NOT Partial Data LDA)
            data_words = list(word_lists(data, list_keys))
            report.write("Standard set of keywords includes:\n" +
                         ', '.join(list_keys))
            if embeddings:
                display_log("Loading word embeddings")
                model = load_model(model_path)
                # Use the caller's `top`, not the module-level default.
                most_similar = grab_most_similar(list_keys, model=model, top=top)
                list_keys = add_similar(list_keys, most_similar)
                report.write("Supplemented keyword list includes:\n" +
                             ', '.join(list_keys))
                report.write('\n')
                report.write(
                    "Top {} most similar words added from word emeddings (if found) \n"
                    .format(top))
        else:
            # Use the caller's `data`, not the module-level default path.
            data_words = list(word_lists_no_keywords(data))

        display_log("Created data word list of size {}".format(str(
            len(data_words))))

        # generate bigrams
        if bigrams:
            data_words = make_bigrams(data_words)
            display_log("Created bigrams word list")
            report.write("Topic integrates bigrams.\n\n")

        # create dictionary
        id2word = corpora.Dictionary(data_words)
        display_log("Created dictionary")

        # bag-of-words corpus
        corpus = [id2word.doc2bow(text) for text in data_words]
        display_log("Created corpus")

        # LDA model
        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             random_state=100,
                             update_every=1,
                             chunksize=60,
                             passes=25,
                             alpha=alpha,
                             eta=eta,
                             iterations=iterations)
        display_log("Created LDA model")

        topic_header = ["Topic " + str(i + 1) for i in range(num_topics)]
        # After the transpose, index 0 holds the terms and index 1 the scores,
        # each with one column per topic.
        topic_array = np.array(
            [lda_model.show_topic(i) for i in range(num_topics)]).T
        report.write("Topics\n-----------------------\n")
        report.write(
            tabulate(topic_array[0], headers=topic_header, tablefmt='github'))
        report.write("\n\n")
        report.write("Similarity Scores\n-----------------------\n")
        report.write(
            tabulate(topic_array[1], headers=topic_header, tablefmt='github'))
        report.write("\n\n")

        display_log("printed table into output file " + output_path)

        # One (term, score) column pair per topic, laid out side by side.
        df_all = pd.DataFrame()
        topics_transposed = topic_array.T
        for i in range(num_topics):
            new = pd.DataFrame(topics_transposed[i],
                               columns=['Topic ' + str(i), 'score'])
            df_all = pd.concat([df_all, new], axis=1)
        df_all.to_csv(output_csv, index=False, encoding='utf-16')
        display_log("Exported topics and scores into csv file " + output_csv +
                    '.csv')

        # coherence for LDA-PDs
        if coherence:
            coherence_model_lda = CoherenceModel(model=lda_model,
                                                 texts=data_words,
                                                 dictionary=id2word,
                                                 coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()

            report.write(
                "Coherence and Preplexity Scores\n-----------------------\n")
            report.write(
                "LDA-PD Model with {} keywords: \n Perplexity: {} \n Coherence: {}"
                .format(n_keywords, lda_model.log_perplexity(corpus),
                        coherence_lda))
            display_log("Coherence and Perplexity calculated, see " +
                        output_path + '.output')

    display_log("Log saved in " + output_path + '.log')
    display_log("Output saved in " + output_path + '.output')
    display_log("Topics saved in " + output_path + '.csv')

    return lda_model
Esempio n. 11
0
			run_id = "ldaU_K{K}_a{alpha_frac}-K_b{beta}_iter{iter}.gensim".format(K=num_topics, alpha_frac=alpha_frac, beta=beta, iter=num_iterations)
			print run_id

			output_file = output_file_template.format(run_id=run_id)

			# Train and save
			print 'Training...'
			model = LdaModel(corpus, 
				alpha=alpha, eta=beta,
				id2word=dictionary, num_topics=num_topics, iterations=num_iterations
			)
			# model = LdaMulticore(corpus, 
			# 	alpha=alpha, eta=beta,
			# 	id2word=dictionary, num_topics=num_topics, iterations=num_iterations, workers=2
			# )
			print 'Done training.'
			model.save(output_file)

			# Print top 10 words in topics, if desired
			if print_topics:
				topics = model.show_topics(num_topics=100, formatted=False)
				for topic in topics:
					for tup in topic[1]:
						print tup[0] + ": " + str(tup[1])
					print '\n'

			# Evaluate perplexity
			ll = model.log_perplexity(test_corpus)
			print "LL:   "+str(ll)
			print "Perp: "+str(np.exp2(-ll))
                    lda_model.update(corpus=train_data,
                                     decay=learning_decay,
                                     iterations=valid_iter)

                train_s.append(
                    CoherenceModel(model=lda_model,
                                   corpus=train_data,
                                   dictionary=dictionary,
                                   coherence='u_mass').get_coherence())
                test_s.append(
                    CoherenceModel(model=lda_model,
                                   corpus=test_data,
                                   dictionary=dictionary,
                                   coherence='u_mass').get_coherence())

                train_p.append(lda_model.log_perplexity(train_data))
                test_p.append(lda_model.log_perplexity(test_data))

            train_scores.append(train_s)
            test_scores.append(test_s)
            train_perplexities.append(train_p)
            test_perplexities.append(test_p)

            print "train_scores: ", train_scores[-1], " test_scores: ", test_scores[-1], \
                " train_perplexities: ", train_perplexities[-1], " test_perplexities: ", test_perplexities[-1]

        dict_num_topic[str(n_component) + '_topics'] = {
            "max_iter": max_iter,
            "valid_iter": valid_iter,
            "train_scores": train_scores,
            "test_scores": test_scores,
Esempio n. 13
0
            output_file = output_file_template.format(run_id=run_id)

            # Train and save
            print 'Training...'
            model = LdaModel(corpus,
                             alpha=alpha,
                             eta=beta,
                             id2word=dictionary,
                             num_topics=num_topics,
                             iterations=num_iterations)
            # model = LdaMulticore(corpus,
            # 	alpha=alpha, eta=beta,
            # 	id2word=dictionary, num_topics=num_topics, iterations=num_iterations, workers=2
            # )
            print 'Done training.'
            model.save(output_file)

            # Print top 10 words in topics, if desired
            if print_topics:
                topics = model.show_topics(num_topics=100, formatted=False)
                for topic in topics:
                    for tup in topic[1]:
                        print tup[0] + ": " + str(tup[1])
                    print '\n'

            # Evaluate perplexity
            ll = model.log_perplexity(test_corpus)
            print "LL:   " + str(ll)
            print "Perp: " + str(np.exp2(-ll))
Esempio n. 14
0
    # Build LDA model
    print('Training LDA model...')
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=options.num_topics,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=options.iterations,
        alpha='auto',
        per_word_topics=True
    )
    print('...done')

    print('Saving model...')
    model.save(model_path)
    print('...done')

    print('Topics found:')
    for i in range(options.num_topics):
        print(i, ' -> ', model.print_topic(i))
    doc_lda = model[corpus]

    # Compute Perplexity
    print('Perplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda)
Esempio n. 15
0
# Delimiters: space, ", ", newline, ": " and literal parentheses.
# The parentheses must be escaped. Unescaped, "(|)" is a group that matches the
# empty string, and from Python 3.7 on re.split() splits on empty matches, so
# every document fell apart into single characters and produced no tokens.
_TOKEN_SPLIT_RE = re.compile(r' |, |\n|: |\(|\)')

for idx, doc in enumerate(allmydocs):
    # if idx > num_docs:
    #     break
    doc = doc.lower()
    doc = _TOKEN_SPLIT_RE.split(doc)
    tokens = []
    for words in doc:
        # keep alphabetic characters only
        cleaned = ''.join([i for i in words if i.isalpha()])
        # drop stop words and anything shorter than three characters
        if cleaned not in stop_words and 2 < len(cleaned):
            tokens.append(cleaned)
    cleaned_docs.append(tokens)

# Create a corpus from a list of texts
common_dictionary = Dictionary(cleaned_docs)
common_corpus = [common_dictionary.doc2bow(text) for text in cleaned_docs]
random.shuffle(common_corpus)
# 80/20 split of the shuffled corpus.
# NOTE(review): `train` and `test` are not used below; the model is both
# trained and evaluated on the full corpus. Confirm whether that is intended.
train = common_corpus[:int(len(common_corpus)*0.8)]
test = common_corpus[int(len(common_corpus)*0.8):]

lda = LdaModel(common_corpus, num_topics=25, iterations=10000, eval_every=2, chunksize=10000, passes=10)


perplex = lda.log_perplexity(common_corpus)
print('perplex', perplex)

# Save model to disk.
temp_file = datapath("model")
lda.save(temp_file)

Esempio n. 16
0
# Show topic 1, limited to 10 terms.
pprint(model_lda.show_topic(1, topn=10))


# In[89]:


# Show 5 topics with 10 words each.
pprint(model_lda.show_topics(num_topics=5, num_words=10))


# ### Evaluate - model #1

# In[91]:


# Value returned by log_perplexity for the training corpus.
# The bare expression on the last line displays it as the notebook cell output.
perplexity = model_lda.log_perplexity(corpus_train)
perplexity


# In[92]:


# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param
# Calculate the c_v coherence metric; the bare expression on the last line
# displays it as the notebook cell output.
coherence = CoherenceModel(model=model_lda, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1


# In[94]:
Esempio n. 17
0
def find_best_model_log_perp(n_topic_range,
                             texts,
                             id2word,
                             corpus,
                             threshold=None,
                             random_state=42,
                             plot=True,
                             verbose=False):
    """
    Searches for the best model in a given range by log perplexity value

    One gensim LDA model is trained per value in `n_topic_range`. The model with
    the smallest `log_perplexity(corpus)` is selected, unless a model falls
    below `threshold` first, in which case the search stops at that model.

    NOTE(review): gensim describes `log_perplexity` as a per-word likelihood
    bound with perplexity = 2^(-bound), so a smaller value here would mean a
    higher perplexity. Confirm that minimising it is the intended criterion.

    Parameters:
    - `n_topic_range`
        a non-empty `range` of values for the `num_topics` parameter of a gensim LDA model to try
    - `texts`
        a list of documents broken into words (not used by the search itself)
    - `id2word`
        a dictionary containing word encodings
    - `corpus`
        the result of mapping each word in `texts` to its value in `id2word`
    - `random_state`
        a random state for use in a gensim LDA model
    - `threshold`
        a float that specifies a log perplexity value that if reached will cause the function to return early
    - `plot`
        a boolean specifying whether or not to plot log perplexity values against each `num_topics` value
    - `verbose`
        a boolean specifying whether or not to print updates

    Returns: a tuple containing the best model, the list of all models attempted, and a list of all log perplexity values obtained, respectively.

    Raises: `ValueError` if `n_topic_range` is empty.
    """

    if len(n_topic_range) == 0:
        raise ValueError("n_topic_range must contain at least one value")

    models = []
    perp_vals = []

    # Span used for the progress percentage. Falls back to 1 so that a range
    # whose largest value equals its start does not divide by zero.
    diff = (max(n_topic_range) - n_topic_range.start) or 1

    for n_topics in n_topic_range:

        # Print percentage progress
        if verbose:
            print(
                str(round(100 * (n_topics - n_topic_range.start) / diff, 1)) +
                "% done")

        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=n_topics,
                             random_state=random_state,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)

        p = lda_model.log_perplexity(corpus)

        models.append(lda_model)
        perp_vals.append(p)

        # This check has to run for every model: placed after the loop it could
        # only ever look at the last one and never return early.
        if threshold is not None and p < threshold:
            if verbose:
                print('Returning early with a log perplexity value of ' +
                      str(p))

            if plot:
                # The portion of the range that was actually iterated through
                actual_range = range(n_topic_range.start,
                                     n_topics + n_topic_range.step,
                                     n_topic_range.step)
                plt.plot(actual_range, perp_vals, 'b')
                plt.show()

            return lda_model, models, perp_vals

    if plot:
        plt.plot(n_topic_range, perp_vals, 'b')
        plt.show()

    return models[np.argmin(perp_vals)], models, perp_vals