Esempio n. 1
0
def _lda_tuner_evaluate(X, topics, dtp, twp):
    """Fit one batch LDA model on a fresh 50/50 split and score the held-out half.

    :return: ``(score, perplexity)`` of the fitted model on the test half.
    """
    # NOTE(review): the split is unseeded, so every evaluation uses a different
    # train/test partition; scores of different settings are therefore not
    # measured on the same data.  Kept as in the original.
    X_train, X_test = train_test_split(X, test_size=0.5)
    lda = LatentDirichletAllocation(n_components=topics,
                                    doc_topic_prior=dtp,
                                    topic_word_prior=twp,
                                    learning_method='batch',
                                    random_state=42,
                                    max_iter=20)
    lda.fit(X_train)
    return lda.score(X_test), lda.perplexity(X_test)


def lda_tuner(ingroup_otu, best_models):
    """Grid-search the LDA document-topic and topic-word priors.

    Every (doc_topic_prior, topic_word_prior) pair of the hard-coded grids is
    evaluated for each topic count, followed by one extra run with both priors
    set to ``1 / n_topics``.  One result dict per evaluation is appended to
    ``best_models`` (keys ``n``, ``dtp``, ``twp``, ``score``, ``perp``).

    :param ingroup_otu: object whose ``.values`` is the matrix to model.
    :param best_models: list that is extended in place.
    :return: the same ``best_models`` list.
    """
    best_score = -np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter += 1
                this_score, this_perplexity = _lda_tuner_evaluate(
                    X, topics, dtp, twp)
                if this_score > best_score:
                    best_score = this_score
                    print("New Max Likelihood: {}".format(best_score))

                print("#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(
                    eval_counter, topics, dtp, twp,
                    this_score, this_perplexity))

                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score,
                                    'perp': this_perplexity})

        # After the grid, also try the symmetric 1/K priors for this topic
        # count (the original ran this inside the last grid iteration).
        symmetric_prior = 1. / topics
        eval_counter += 1
        this_score, this_perplexity = _lda_tuner_evaluate(
            X, topics, symmetric_prior, symmetric_prior)
        if this_score > best_score:
            best_score = this_score
            print("New Max Likelihood: {}".format(best_score))

        print("#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(
            eval_counter, topics, symmetric_prior, symmetric_prior,
            this_score, this_perplexity))

        best_models.append({'n': topics, 'dtp': symmetric_prior,
                            'twp': symmetric_prior, 'score': this_score,
                            'perp': this_perplexity})
    return best_models
def test_perplexity_input_format():
    """Perplexity must be the same for sparse input and its dense copy."""
    n_components, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method='batch',
        total_samples=100,
        random_state=0,
    )
    model.fit(X)

    # Evaluate the same fitted model on both representations of X.
    sparse_perplexity = model.perplexity(X)
    dense_perplexity = model.perplexity(X.toarray())
    assert_almost_equal(sparse_perplexity, dense_perplexity)
Esempio n. 3
0
def plot_perplexity_iter(A_tfidf, num_topics):
    """Plot the perplexity of online LDA against the number of EM updates.

    A fresh model is fitted on ``A_tfidf`` for ``max_iter`` = 1..5 and its
    perplexity on that same matrix is recorded.  The perplexities are saved to
    ``./data/perplexity_iter.npy`` and the figure to
    ``./figures/perplexity_iter.png``.

    :param A_tfidf: document-term matrix used for fitting and evaluation.
    :param num_topics: number of LDA topics.
    """
    print("computing perplexity vs iter...")
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1, max_iter + 1):
        lda = LatentDirichletAllocation(n_components=num_topics,
                                        max_iter=sweep,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        # x-axis value: mini-batch EM updates performed during this fit
        em_iter.append(lda.n_batch_iter_)

    np.save('./data/perplexity_iter.npy', perplexity)

    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
Esempio n. 4
0
def plot_perplexity_topics(A_tfidf):
    """Plot the perplexity of online LDA against the number of topics K.

    Five topic counts, log-spaced between 10 and 100, are each fitted on
    ``A_tfidf`` and evaluated on that same matrix.  The perplexities and the
    topic counts are saved under ``./data/`` and the figure to
    ``./figures/perplexity_topics.png``.

    :param A_tfidf: document-term matrix used for fitting and evaluation.
    """
    print("computing perplexity vs K...")
    max_iter = 5  # based on plot_perplexity_iter()
    # np.int was removed from NumPy; the builtin int is the same dtype request.
    num_topics = np.logspace(1, 2, 5).astype(int)
    perplexity = []
    for k in num_topics:
        lda = LatentDirichletAllocation(n_components=k,
                                        max_iter=max_iter,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("K= %d, elapsed time: %.4f sec" % (k, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))

    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)

    f = plt.figure()
    plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
Esempio n. 5
0
def plot_perplexity_batch(A_tfidf, num_docs):
    """Plot online-LDA perplexity against EM updates for several batch sizes.

    For every mini-batch size in 64, 128, ..., 1024 and every ``max_iter`` in
    1..5 a fresh 10-topic model is fitted on ``A_tfidf`` and evaluated on that
    same matrix.  Results are saved to ``./data/perplexity.npy`` and
    ``./data/em_iter.npy``; the figure goes to
    ``./figures/perplexity_batch.png``.

    :param A_tfidf: document-term matrix used for fitting and evaluation.
    :param num_docs: unused; kept for interface compatibility.
    """
    print("computing perplexity vs batch size...")
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    perplexity = np.zeros((len(batch_size), max_iter))
    em_iter = np.zeros((len(batch_size), max_iter))
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1, max_iter + 1)):
            lda = LatentDirichletAllocation(n_components=num_topics,
                                            max_iter=sweep,
                                            learning_method='online',
                                            batch_size=mini_batch,
                                            random_state=0,
                                            n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  # online VB
            toc = time()
            print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
            perplexity[ii, jj] = lda.perplexity(A_tfidf)
            em_iter[ii, jj] = lda.n_batch_iter_

    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)

    f = plt.figure()
    for mb in range(len(batch_size)):
        # one curve per batch size, drawn in a random RGB colour
        plt.plot(em_iter[mb, :], perplexity[mb, :],
                 color=np.random.rand(3,), marker='o', lw=2.0,
                 label='mini_batch: ' + str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
Esempio n. 6
0
def test_lda_perplexity():
    """Perplexity after ten iterations must not exceed that after one.

    Checked for both the 'online' and the 'batch' learning method, with and
    without sub-sampling.  Uses the ``n_components`` keyword and the
    ``perplexity(X, sub_sampling=...)`` signature, like the other tests in
    this file; the older ``n_topics`` / ``doc_topic_distr`` arguments were
    removed from scikit-learn.
    """
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit(X)
        perp_1 = lda_1.perplexity(X, sub_sampling=False)

        lda_2.fit(X)
        perp_2 = lda_2.perplexity(X, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
def test_lda_score_perplexity():
    """perplexity(X) must equal exp(-score(X) / sum(X.data))."""
    n_components, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=10,
        random_state=0,
    )
    model.fit(X)
    reported_perplexity = model.perplexity(X, sub_sampling=False)

    # Rebuild the perplexity by hand from the log-likelihood score.
    log_likelihood = model.score(X)
    per_word_bound = log_likelihood / np.sum(X.data)
    derived_perplexity = np.exp(-1. * per_word_bound)
    assert_almost_equal(reported_perplexity, derived_perplexity)
def test_lda_perplexity(method):
    """Perplexity after ten iterations must not exceed that after one.

    ``method`` is forwarded as the ``learning_method`` of both models.
    """
    n_components, X = _build_sparse_mtx()
    shared_params = dict(n_components=n_components,
                         learning_method=method,
                         total_samples=100,
                         random_state=0)
    short_run = LatentDirichletAllocation(max_iter=1, **shared_params)
    long_run = LatentDirichletAllocation(max_iter=10, **shared_params)

    short_run.fit(X)
    short_perplexity = short_run.perplexity(X, sub_sampling=False)

    long_run.fit(X)
    long_perplexity = long_run.perplexity(X, sub_sampling=False)
    assert_greater_equal(short_perplexity, long_perplexity)

    # Same comparison with sub-sampling enabled.
    assert_greater_equal(short_run.perplexity(X, sub_sampling=True),
                         long_run.perplexity(X, sub_sampling=True))
def test_lda_fit_perplexity():
    """The ``bound_`` left by fit must match perplexity() on the train set."""
    n_components, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method='batch',
        random_state=0,
        evaluate_every=1,
    )
    model.fit(X)

    fit_time_perplexity = model.bound_          # stored by fit
    recomputed_perplexity = model.perplexity(X)  # computed on demand

    assert_almost_equal(fit_time_perplexity, recomputed_perplexity)
Esempio n. 10
0
def test_topic_ks(text, ck=80, min_k=40):
    """Print LDA log-likelihood and perplexity for a range of topic counts.

    The documents are vectorized with an English-stop-word ``CountVectorizer``
    and one model is fitted and evaluated on that matrix for every
    ``K`` in ``range(min_k, ck)``.

    :param text: list of documents (strings).
    :param ck: exclusive upper bound of the topic counts to try.
    :param min_k: first topic count to try; the default of 40 reproduces the
        previously hard-coded lower bound.
    """
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform(text)

    print("testing Ks...")
    for number_topics in range(min_k, ck):
        print("K =", number_topics)
        lda = LDA(n_components=number_topics, n_jobs=-1)
        lda.fit(count_data)

        # Log Likelihood: Higher the better
        print("---> Log Likelihood: ", lda.score(count_data))

        # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
        print("---> Perplexity: ", lda.perplexity(count_data))
Esempio n. 11
0
def test_lda_fit_perplexity():
    """Compare the perplexity recorded during fit with perplexity(X)."""
    n_components, X = _build_sparse_mtx()
    estimator_params = {
        'n_components': n_components,
        'max_iter': 1,
        'learning_method': 'batch',
        'random_state': 0,
        'evaluate_every': 1,
    }
    fitted = LatentDirichletAllocation(**estimator_params)
    fitted.fit(X)

    # Value left in bound_ at the end of fit ...
    during_fit = fitted.bound_
    # ... versus the perplexity method applied to the training data.
    after_fit = fitted.perplexity(X)

    assert_almost_equal(during_fit, after_fit)
 def __init__(self, X, features, Klist=None, random_state=0):
     """Fit one LDA model per topic count and record its quality metrics.

     For every ``k`` in ``Klist`` a model is fitted on ``X``; the fitted
     model, its perplexity on ``X`` and its log-likelihood score on ``X`` are
     appended to ``self.lda``, ``self.perplex`` and ``self.score``.

     :param X: matrix the models are fitted and evaluated on.
     :param features: stored unchanged as ``self.features``.
     :param Klist: topic counts to try; defaults to ``1..9``.
     :param random_state: seed forwarded to every model.
     """
     # A list default in the signature would be one object shared by every
     # instance through self.Klist; build a fresh list per call instead.
     if Klist is None:
         Klist = list(range(1, 10))
     self.Klist = Klist
     self.features = features
     self.random_state = random_state
     self.X = X
     self.lda = []
     self.perplex = []
     self.score = []
     for k in Klist:
         lda = LatentDirichletAllocation(n_components=k,
                                         random_state=random_state)
         lda.fit(X)
         self.lda.append(lda)
         px = lda.perplexity(X)
         ll = lda.score(X)
         self.perplex.append(px)
         self.score.append(ll)
         print('K = %i, perplex = %f, log-like = %f' % (k, px, ll))
Esempio n. 13
0
def lda_analysis(tf, tf_vectorizer):
    """
    Fit a two-topic online LDA model, then print its perplexity and top words.

    :param tf: document-term matrix the model is fitted and evaluated on.
    :param tf_vectorizer: vectorizer supplying the feature names for the
        columns of ``tf`` (presumably the one that produced it).
    :return: the fitted ``LatentDirichletAllocation`` model.
    """
    # Number of topics
    n_topics = 2

    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=100,
                                    learning_method='online',
                                    learning_offset=50,
                                    random_state=0)
    lda.fit(tf)

    # lda.components_ has one row per topic and one column per feature.

    # Perplexity on the training matrix
    print(u'困惑度:')
    print(lda.perplexity(tf, sub_sampling=False))

    # Topic -> keyword distribution
    def print_top_words(model, tf_feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print('Topic #%d:' % topic_idx)
            # argsort is ascending; the reversed slice takes the n_top_words
            # largest weights, highest first.
            print(' '.join([
                tf_feature_names[i]
                for i in topic.argsort()[:-n_top_words - 1:-1]
            ]))
            print("")

    # Print the top 20 keywords of every topic.
    n_top_words = 20
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

    return lda
Esempio n. 14
0
def lda(data):
    """Fit one batch LDA model per topic count and keep the lowest-perplexity one.

    Relies on module-level ``n_topics`` (candidate topic counts),
    ``perplexityLst`` (a list indexable by position in ``n_topics``, written in
    place) and ``n_top_words`` — all defined outside this function.  The
    vectorizer, all models and the best model are dumped with joblib under
    ``model/``.

    :param data: iterable of raw documents passed to ``CountVectorizer``.
    :return: ``(best_model, tf_vectorizer)``.
    """
    tf_ModelPath = os.path.join('model', 'tfVector.model')  # term-frequency vectorizer
    lda_ModelPath = os.path.join('model', 'ldaModels.model')  # all fitted LDA models
    bestModelPath = os.path.join('model', 'bestLDAModel.model')
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,)
    tf = tf_vectorizer.fit_transform(data)

    lda_models = []
    for idx, n_topic in enumerate(n_topics):
        # Named ``model`` so the local does not shadow this function's name.
        model = LatentDirichletAllocation(n_components=n_topic,
                                          max_iter=8000,
                                          learning_method='batch',
                                          evaluate_every=200,
                                          perp_tol=0.01)
        t0 = time()
        model.fit(tf)
        perplexityLst[idx] = model.perplexity(tf)
        lda_models.append(model)
        # Per-model progress report.  These lines used to sit after the loop,
        # so only the last model was ever reported.
        print("# of Topic: %d, " % n_topic, end=' ')
        print("done in %0.3fs, N_iter %d, " % ((time() - t0), model.n_iter_), end=' ')
        print("Perplexity Score %0.3f" % perplexityLst[idx])
    print("残差数组结果为:", perplexityLst)

    # Pick the model with the lowest perplexity
    best_index = perplexityLst.index(min(perplexityLst))
    best_n_topic = n_topics[best_index]
    best_model = lda_models[best_index]
    print("Best # of Topic: ", best_n_topic)
    print("Best Model: ")

    # Persist the vectorizer and the model of every topic count for later use
    joblib.dump(tf_vectorizer, tf_ModelPath)
    joblib.dump(lda_models, lda_ModelPath)
    joblib.dump(best_model, bestModelPath)

    # Print the topic-word distribution of the best model
    print("#########Topic-Word Distribution#########")
    tf_vectorizer._validate_vocabulary()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(best_model, tf_feature_names, n_top_words)
    return best_model, tf_vectorizer
Esempio n. 15
0
def train_lda():
    """Fit a 13-topic batch LDA model and print its top words and perplexity.

    Documents come from ``get_docLst()``; the vectorizer is loaded with joblib
    from a hard-coded path.  Nothing is returned.
    """
    # sklearn.externals.joblib was removed in scikit-learn 0.23; fall back to
    # the standalone joblib package when the vendored copy is gone.
    try:
        from sklearn.externals import joblib
    except ImportError:
        import joblib
    from sklearn.decomposition import LatentDirichletAllocation

    tf_ModelPath = r'E:\能搜\tf_model.pkl'
    docLst = get_docLst()
    tf_vectorizer = joblib.load(tf_ModelPath)
    # NOTE(review): fit_transform re-fits the vectorizer that was just loaded;
    # if the pickled vocabulary is meant to be reused, this should probably be
    # transform() — confirm against how the pickle is produced.
    tf = tf_vectorizer.fit_transform(docLst)

    n_topics = 13
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=300,
                                    learning_method='batch')
    lda.fit(tf)  # tf is the document-word sparse matrix

    n_top_words = 20
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    print('lda:', lda.perplexity(tf))
Esempio n. 16
0
def lda_train():
    """Fit a 20-topic batch LDA model on the vectors from ``load_data_vector``.

    Prints the document-topic distribution, the top 20 words per topic and
    the perplexity on the training matrix.  Nothing is returned.
    """
    tf, count_vec = load_data_vector()
    n_topics = 20
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=10,
                                    learning_method='batch',
                                    random_state=0,
                                    perp_tol=0.01,
                                    topic_word_prior=0.2,
                                    n_jobs=-1)
    lda.fit(tf)
    doc_topic_dist = lda.transform(tf)
    print(doc_topic_dist)

    n_top_words = 20
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    if hasattr(count_vec, 'get_feature_names_out'):
        tf_feature_names = list(count_vec.get_feature_names_out())
    else:
        tf_feature_names = count_vec.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

    print(lda.perplexity(tf))
Esempio n. 17
0
def lda_model(mat):
    """Fit a 50-topic batch LDA model and return the document-topic matrix.

    The perplexity on ``mat`` and the shape of the result are printed, and the
    document-topic matrix is dumped to ``doc_topic_result.dat``.

    :param mat: document-word matrix.
    :return: document-topic matrix produced by ``fit_transform``.
    """
    print('开始训练lda模型')
    n_topic = 50
    lda = LatentDirichletAllocation(n_components=n_topic,
                                    max_iter=1000,
                                    learning_method='batch')
    docres = lda.fit_transform(mat)  # document-topic matrix

    # Perplexity is a float; concatenating it to the label directly raised
    # TypeError, so convert it first.
    print('困惑度为:' + str(lda.perplexity(mat)))

    print(len(docres))  # number of documents
    print(len(docres[0]))  # number of topics
    print('lda模型训练结束')

    # Persist the document-topic matrix (not the model itself).
    docres.dump('doc_topic_result.dat')

    return docres
Esempio n. 18
0
def k_grid_search(X, test_size=0.25, gridval=(10, 100, 10), n_iter=30, seed=23):
    """Score online LDA on a held-out split for a range of topic counts.

    :param X: document-term matrix; split once into train and test parts.
    :param test_size: forwarded to ``train_test_split``.
    :param gridval: ``(start, stop, step)`` of the topic counts, as for
        ``range``.  An immutable tuple is used as the default so it cannot be
        shared and modified between calls; lists are still accepted.
    :param n_iter: ``max_iter`` of every model.
    :param seed: random state of both the split and the models.
    :return: ``(topic_counts, log_likelihoods, perplexities)`` — three lists
        of equal length, the last two measured on the test split.
    """
    X_train, X_test = train_test_split(
        X, test_size=test_size, random_state=seed
        )
    grid = range(gridval[0], gridval[1], gridval[2])
    loglik = []
    perplex = []
    for k in grid:
        print("Estimating model at k: {}".format(k))
        lda = LatentDirichletAllocation(
            n_components=k, max_iter=n_iter, learning_method='online',
            learning_offset=50., random_state=seed, n_jobs=6
            )
        lda.fit(X_train)
        loglik.append(lda.score(X_test))
        perplex.append(lda.perplexity(X_test))

    return list(grid), loglik, perplex
Esempio n. 19
0
class Model:
    """Thin wrapper around a batch ``LatentDirichletAllocation`` model.

    ``V`` is the doc-term matrix, ``W`` the doc-topic matrix and ``H`` the
    topic-term matrix (``components_`` of the fitted model).
    """

    def __init__(self, V, K=None, train=True):
        """
        V: doc-term matrix (n docs x n terms)
        K: number of topics (n topics)
        train: when true, create a new unfitted LDA model; otherwise the
            model is expected to be supplied later through ``load``.
        """
        self.V = V
        self.K = K
        # Defined up front so the attributes exist even when train is False.
        self.model = None
        self.W = None
        self.H = None
        if train:
            # ``n_topics`` was renamed to ``n_components`` in scikit-learn and
            # the old keyword has since been removed.
            self.model = LatentDirichletAllocation(n_components=self.K, max_iter=25, learning_method='batch')

    def train(self):
        """
        Fit the model on V and store the factors.

        W,H: factorization W*H, W is doc-topic, H is topic-term
        """
        self.W = self.model.fit_transform(self.V)
        self.H = self.model.components_

    def predict(self):
        """
        Store the doc-topic matrix (W) of V under the current model in self.W.
        """
        self.W = self.model.transform(self.V)

    def load(self, filename):
        """Load the LDA model by unpickling and refresh H from it."""
        # NOTE: pickle.load executes code from the file; only load trusted files.
        with open(filename, 'rb') as fid:
            self.model = pickle.load(fid)
        self.H = self.model.components_

    def save(self, filename):
        """Save the LDA model by pickling."""
        with open(filename, 'wb') as fid:
            pickle.dump(self.model, fid)

    def calculate_perplexity(self):
        """
        :return: perplexity of model for this dataset
        """
        # The doc-topic matrix is no longer accepted by perplexity(); passing
        # self.W positionally would land in the ``sub_sampling`` slot.
        return self.model.perplexity(self.V)
Esempio n. 20
0
def plot_perplexity_batch(A_tfidf, num_docs):
    """Plot online-LDA perplexity against EM updates for several batch sizes.

    For every mini-batch size in 64, 128, ..., 1024 and every ``max_iter`` in
    1..5 a fresh 10-topic model is fitted on ``A_tfidf`` and evaluated on that
    same matrix.  Results are saved to ``./data/perplexity.npy`` and
    ``./data/em_iter.npy``; the figure goes to
    ``./figures/perplexity_batch.png``.

    :param A_tfidf: document-term matrix used for fitting and evaluation.
    :param num_docs: unused; kept for interface compatibility.
    """
    print("computing perplexity vs batch size...")
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    perplexity = np.zeros((len(batch_size), max_iter))
    em_iter = np.zeros((len(batch_size), max_iter))
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1, max_iter + 1)):
            lda = LatentDirichletAllocation(n_components=num_topics,
                                            max_iter=sweep,
                                            learning_method='online',
                                            batch_size=mini_batch,
                                            random_state=0,
                                            n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  # online VB
            toc = time()
            print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
            perplexity[ii, jj] = lda.perplexity(A_tfidf)
            em_iter[ii, jj] = lda.n_batch_iter_

    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)

    f = plt.figure()
    for mb in range(len(batch_size)):
        # one curve per batch size, drawn in a random RGB colour
        plt.plot(em_iter[mb, :],
                 perplexity[mb, :],
                 color=np.random.rand(3, ),
                 marker='o',
                 lw=2.0,
                 label='mini_batch: ' + str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
Esempio n. 21
0
def build_topics(use_spacy=True):
    """Fit a 100-topic online LDA model on the project corpus and pickle it.

    Stop words and documents come from project helpers; the documents are
    turned into raw term counts (at most 1000 features), the model is fitted,
    its perplexity printed, the topics handed to ``process_topics`` and the
    model and count matrix written to ``lda_model.pickle`` / ``tf.pickle``.

    :param use_spacy: forwarded to ``build_data_samples``.
    """
    greek_stopwords = build_greek_stoplist()
    data_samples, indices = build_data_samples(use_spacy=use_spacy)
    greek_stopwords, _ = build_gg_stoplist(data_samples, greek_stopwords)

    # Initial Parameters
    no_features = 1000  # Number of features
    n_samples = len(data_samples)  # Len of data samples
    no_top_words = 100  # Number of top words in each topic
    n_components = 100  # Number of topics
    # How many correlations under each topic
    no_top_data_samples = math.ceil(n_samples / n_components)

    # LDA can only use raw term counts for LDA because it is a probabilistic
    # graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=no_features,
                                    stop_words=greek_stopwords)
    tf = tf_vectorizer.fit_transform(data_samples)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    lda_model = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10,
                                          learning_method='online',
                                          learning_offset=50.,
                                          verbose=1,
                                          # leave one core free, but never
                                          # ask for 0 jobs on a 1-core host
                                          n_jobs=max(1, cpu_count() - 1),
                                          random_state=0)
    lda_model.fit(tf)

    print("Best Perplexity Score: ", lda_model.perplexity(tf))

    lda_W = lda_model.transform(tf)
    lda_H = lda_model.components_

    # Return value is not used here; the call is kept for whatever the helper
    # does with the topics.
    process_topics(
        lda_H, lda_W, tf_feature_names, data_samples, no_top_words,
        no_top_data_samples, indices)

    # Context managers make sure both files are flushed and closed.
    with open('lda_model.pickle', 'wb') as model_file:
        pickle.dump(lda_model, model_file)
    with open('tf.pickle', 'wb') as tf_file:
        pickle.dump(tf, tf_file)
Esempio n. 22
0
 def run_perplexity_grid_search(self):
     """Fit one LDA model per topic count and tabulate the perplexities.

     Topic counts run from ``self.min_n_topics`` to ``self.max_n_topics``
     inclusive; each model is fitted and evaluated on ``self.tfid_vector``.

     :return: DataFrame with columns ``n_topics`` and ``perplexity``.
     """
     topic_counts = range(self.min_n_topics, self.max_n_topics + 1)
     total_fits = len(topic_counts)
     perplexities = []
     for position, topic_count in enumerate(topic_counts, start=1):
         print_timestamp_message(
             f'Starting lda fit iteration {position} of {total_fits}')
         model = LatentDirichletAllocation(
             n_components=topic_count,
             max_iter=self.max_iter,
             learning_method=self.learning_method,
             random_state=self.random_state)
         model = model.fit(self.tfid_vector)
         perplexities.append(model.perplexity(self.tfid_vector))
     columns = {
         'n_topics': list(topic_counts),
         'perplexity': perplexities,
     }
     return pd.DataFrame(columns)
def fit_topic_model(tweets: List[np.ndarray], n_components: int, n_words: int, vocab: List[str], trials: int):
    """Fit LDA ``trials`` times, keep the lowest-perplexity model, print its topics.

    :param tweets: document-term data passed to ``fit`` and ``perplexity``.
    :param n_components: number of topics.
    :param n_words: number of top words printed per topic.
    :param vocab: vocabulary; ``vocab[i]`` is the word of feature column ``i``.
    :param trials: number of independent fits; must be at least 1.
    :raises ValueError: if ``trials`` is smaller than 1.
    """
    if trials < 1:
        raise ValueError('trials must be at least 1, got {0}'.format(trials))

    best_model = None
    # Infinity guarantees that the first trial always becomes the best one,
    # however large its perplexity is.
    best_perplexity = float('inf')

    for _ in range(trials):
        lda = LatentDirichletAllocation(n_components=n_components)
        lda.fit(tweets)

        perplexity = lda.perplexity(tweets)
        if best_model is None or perplexity < best_perplexity:
            best_perplexity = perplexity
            best_model = lda

    # Report the best perplexity, not the one of the last trial.
    print('Best Perplexity: {0}'.format(best_perplexity))

    for index, component in enumerate(best_model.components_):
        # indices of the n_words largest weights, highest first
        top_indices = np.argsort(component)[::-1][:n_words]
        topic_words = [vocab[i] for i in top_indices]

        print('Topic {0}: {1}'.format(index, ' '.join(topic_words)))
Esempio n. 24
0
def test_topic_ks(text, ck=20, number_words=10):  #text is a list of documents
    """Print LDA perplexity, and its change, for topic counts 1 .. ck-1.

    Every document in ``text`` is cleaned IN PLACE first: typographic quotes,
    dashes and newlines are normalised, punctuation becomes spaces, the text
    is lower-cased and words found in the module-level ``stops`` are dropped.
    ``number_words`` is not used.
    """
    print("cleaning and vectorizing....")

    punctuation_to_space = str.maketrans(string.punctuation,
                                         ' ' * len(string.punctuation))
    for position, document in enumerate(text):
        cleaned = document.replace('‘', '\'').replace('’', '\'').replace(
            '“', '"').replace('”', '"').replace('—', '-').replace('\n', ' ')
        cleaned = cleaned.translate(punctuation_to_space).lower()

        kept_words = [
            token for token in cleaned.split(" ") if token not in stops
        ]
        text[position] = ' '.join(kept_words)

    count_data = CountVectorizer(stop_words='english').fit_transform(text)

    print("Testing Numbers of Topics (k)")
    candidate_ks = range(1, ck)
    previous_perplexity = 0

    print("{:<3}\t{:<7}\t{:<7}".format('k:', 'perplexity:', 'delta:'))
    for number_topics in candidate_ks:
        model = LDA(n_components=number_topics, n_jobs=-1)
        model.fit(count_data)

        current_perplexity = model.perplexity(count_data)
        # delta is the change relative to the previous k (0 for the first)
        delta = current_perplexity - previous_perplexity
        print("{:<3}\t{:<7.3f}\t{:<7.3f}".format(number_topics,
                                                 current_perplexity, delta))
        previous_perplexity = current_perplexity
def lda_build(data, savepath, n_topic):
    """
    Train an LDA model on top of an existing, pickled vectorizer.

    The vectorizer is loaded from ``MODELS\\tfidf\\tfidf.pk`` (a count
    vectorizer saved as ``cv.pk`` in the same folder could be loaded
    instead), ``data`` is tokenised with ``fenci`` and transformed, and the
    fitted model is pickled to ``savepath``.  The shape of the feature matrix
    and the perplexity on it are printed.

    :param data: raw documents.
    :param savepath: file the fitted LDA model is pickled to.
    :param n_topic: number of topics.
    :return: None
    """
    # Context manager closes the pickle file; a bare open() leaked the handle.
    with open("MODELS\\tfidf\\tfidf.pk", "rb") as vectorizer_file:
        tv = pickle.load(vectorizer_file)
    data = fenci(data)
    data_tfidf = tv.transform(data)
    data_tfidf = data_tfidf.toarray()
    print(data_tfidf.shape)
    lda = LatentDirichletAllocation(n_components=n_topic,
                                    max_iter=1000,
                                    verbose=True)
    lda.fit(data_tfidf)

    with open(savepath, "wb") as f:
        pickle.dump(lda, f)
    print(lda.perplexity(data_tfidf))
def Proceeding_LDA(n_component, ngram_tf_train):
    """Fit an online LDA model and return it with its training outputs.

    :param n_component: number of topics.
    :param ngram_tf_train: term-frequency matrix used for fitting.
    :return: ``(lda, lda_train, lda_train_perplexity)`` — the fitted model,
        the document-topic matrix of the training data and the perplexity on
        the training data.
    """
    print("Fitting LDA models with tf features,")
    print(" n_components = %d" % n_component)

    lda = LatentDirichletAllocation(
        n_components=n_component,
        learning_method='online',
        random_state=0,
        )
    # fit_transform fits the model itself; the separate fit() that used to
    # precede it trained the same model a second time for no benefit (with the
    # fixed random_state both fits start from the same state).
    lda_train = lda.fit_transform(ngram_tf_train)
    lda_train_perplexity = lda.perplexity(ngram_tf_train)

    # NOTE(review): this prints the shape of the input matrix, not of
    # lda_train — confirm which one was intended.
    print("lda_train:", type(lda_train), np.shape(ngram_tf_train))
    print("lda_train_perplexity:", lda_train_perplexity)

    return lda, lda_train, lda_train_perplexity
Esempio n. 27
0
def lda_decomp(t,
               n_components,
               learning_method="online",
               learning_offset=10.0,
               max_iter=20,
               random_state=1):
    """Fit LDA on ``t`` and return the model with the transformed matrix.

    The log-likelihood score and perplexity were previously computed here but
    never used or returned; each costs a full pass over ``t``, so they are no
    longer evaluated.  Call ``lda.score(t)`` / ``lda.perplexity(t)`` on the
    returned model when they are needed.

    :param t: document-term matrix.
    :param n_components: number of topics.
    :param learning_method: forwarded to ``LatentDirichletAllocation``.
    :param learning_offset: forwarded to ``LatentDirichletAllocation``.
    :param max_iter: forwarded to ``LatentDirichletAllocation``.
    :param random_state: forwarded to ``LatentDirichletAllocation``.
    :return: ``(lda, t_lda)`` — fitted model and ``lda.transform(t)``.
    """
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=max_iter,
                                    learning_method=learning_method,
                                    learning_offset=learning_offset,
                                    random_state=random_state).fit(t)
    t_lda = lda.transform(t)
    return (lda, t_lda)
Esempio n. 28
0
def _print_top_words(model, features, n_words=10):
    """Print, per topic of ``model``, the ``n_words`` highest-weighted features."""
    for topic_weights in model.components_:
        # argsort is ascending, so reverse it to get the largest weights first
        top_indices = np.argsort(topic_weights)[::-1][:n_words]
        print([features[i] for i in top_indices])


def train_topic_models(file):
    """Fit 5- and 10-topic LDA models, print their top words and perplexities.

    The corpus is built with ``dtm(file, 10000)``; its column labels are used
    as the vocabulary.  After the two top-word listings, the perplexity of a
    model with 2..10 topics is printed, each fitted and evaluated on the
    whole corpus.

    :param file: path handed to ``dtm``.
    """
    corpus = dtm(file, 10000)
    features = corpus.columns.values
    lda_5 = LatentDirichletAllocation(n_components=5,
                                      max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0).fit(corpus)

    # This model used to be built with 5 topics as well.
    lda_10 = LatentDirichletAllocation(n_components=10,
                                       max_iter=5,
                                       learning_method='online',
                                       learning_offset=50.,
                                       random_state=0).fit(corpus)

    print("Top 10 word for 5 topic model")
    _print_top_words(lda_5, features)

    print("Top 10 word for 10 topic model")
    _print_top_words(lda_10, features)

    for i in range(2, 11):
        lda = LatentDirichletAllocation(n_components=i,
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0).fit(corpus)
        print("Perplexity{}:{}".format(i, lda.perplexity(corpus)))
Esempio n. 29
0
def run_lda(documents,
            feature_names,
            saveFileDir,
            topic_nums=10,
            top_words_nums=20):
    """Fit online LDA and write its results as text files in ``saveFileDir``.

    All files share the prefix ``LDA_TopWords_Topic<topic_nums>``: the top
    words per topic (``.txt``, written by ``save_topics``), the topic-word
    matrix, the document-topic matrix, the most probable topic per document
    and the perplexity on ``documents``.

    :param documents: document-term matrix.
    :param feature_names: vocabulary forwarded to ``save_topics``.
    :param saveFileDir: output directory.
    :param topic_nums: number of topics.
    :param top_words_nums: number of top words saved per topic.
    """
    lda = LatentDirichletAllocation(n_components=topic_nums,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit(documents)

    saveFileHeader = "%s/LDA_TopWords_Topic%s" % (saveFileDir, topic_nums)
    # Start from a clean top-words file
    saveFile = "%s.txt" % (saveFileHeader)
    if os.path.exists(saveFile):
        os.remove(saveFile)

    # Save topic top words
    save_topics(lda, feature_names, saveFile, topic_nums, top_words_nums)

    # Save topic-words matrix
    np.savetxt("%s_Topic_Words_matrix.txt" % (saveFileHeader),
               lda.components_,
               fmt="%.6f")

    # Save documents-topics matrix and the arg-max topic of each document
    documents_topics = lda.transform(documents)
    np.savetxt("%s_Document_Topics_matrix.txt" % (saveFileHeader),
               documents_topics,
               fmt="%.6f")
    np.savetxt("%s_Document_Topic.txt" % (saveFileHeader),
               np.argmax(documents_topics,
                         axis=1).reshape(len(documents_topics), 1),
               fmt="%d")

    # Save perplexity (second value; the first entry is a constant -1)
    np.savetxt("%s_perplexity.txt" % (saveFileHeader),
               [-1, lda.perplexity(documents)],
               fmt="%.6f")
Esempio n. 30
0
def plot_perplexity_topics(A_tfidf):
    """Plot training perplexity of online LDA against the number of topics.

    Fits one model per topic count in ``np.logspace(1, 2, 5)`` truncated to
    integers, records ``perplexity(A_tfidf)`` for each, saves the raw numbers
    under ``./data/`` and the figure under ``./figures/``.

    Args:
        A_tfidf: Matrix passed to ``fit`` and ``perplexity`` (presumably a
            document-term matrix -- confirm with caller).

    Returns:
        None. Results are printed, plotted and written to disk.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("computing perplexity vs K...")
    max_iter = 5  # based on plot_perplexity_iter()
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; the builtin yields the same dtype.
    num_topics = np.logspace(1, 2, 5).astype(int)
    perplexity = []
    for k in num_topics:
        # `n_components` is the current name of the former `n_topics`
        # keyword (renamed in scikit-learn 0.19, old name removed in 0.21).
        lda = LatentDirichletAllocation(n_components=k,
                                        max_iter=max_iter,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("K= %d, elapsed time: %.4f sec" % (k, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))

    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)

    f = plt.figure()
    plt.plot(num_topics,
             perplexity,
             color='b',
             marker='o',
             lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
def LDA_SK(data_vectorized, vectorizer):
    """Fit a 10-topic online LDA model and return its document-topic output.

    Also prints the log-likelihood score and the perplexity of the fitted
    model on the same data.

    Args:
        data_vectorized: Input passed to ``fit_transform``, ``score`` and
            ``perplexity`` (presumably a document-term matrix -- confirm).
        vectorizer: Unused in this function; kept so existing callers keep
            working.

    Returns:
        The value returned by ``lda_model.fit_transform(data_vectorized)``.
    """
    # Only `n_components` is passed. The original also passed the deprecated
    # alias `n_topics=10`; that keyword was removed in scikit-learn 0.21, so
    # supplying both raises TypeError on current releases. Both carried the
    # same value, so dropping the alias does not change the fitted model.
    lda_model = LatentDirichletAllocation(batch_size=128,
                                          doc_topic_prior=None,
                                          evaluate_every=-1,
                                          learning_decay=0.7,
                                          learning_method='online',
                                          learning_offset=10.0,
                                          max_doc_update_iter=100,
                                          max_iter=10,
                                          mean_change_tol=0.001,
                                          n_components=10,
                                          n_jobs=-1,
                                          perp_tol=0.1,
                                          random_state=100,
                                          topic_word_prior=None,
                                          total_samples=1000000.0,
                                          verbose=0)

    lda_output = lda_model.fit_transform(data_vectorized)

    # Log likelihood: higher is better.
    print("Log Likelihood: ", lda_model.score(data_vectorized))

    # Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))
    return lda_output
Esempio n. 32
0
def cluster_sk_latent_dirichlet_allocation(content):
    """Run scikit-learn LDA on ``content['data']`` and wrap the result as JSON.

    ``content`` supplies ``n_components``, ``learning_method``,
    ``learning_decay``, ``learning_offset``, ``mean_change_tol`` and ``data``.
    The JSON payload carries the transformed data, the fitted components,
    both iteration counters, the perplexity and the score, and is handed to
    ``httpWrapper``.
    """
    estimator = LatentDirichletAllocation(
        n_components=content['n_components'],
        doc_topic_prior=None,
        topic_word_prior=None,
        learning_method=content['learning_method'],
        learning_decay=content['learning_decay'],
        learning_offset=content['learning_offset'],
        max_iter=10,
        batch_size=128,
        mean_change_tol=content['mean_change_tol'],
        n_jobs=-1)

    # Fit first, then project the same samples onto the learned topics.
    fitted = estimator.fit(content['data'])
    doc_topics = fitted.transform(content['data'])

    # Keys are added in the same order as the response has always used.
    payload = {}
    payload['result'] = doc_topics.tolist()
    payload['components'] = estimator.components_.tolist()
    payload['batchIter'] = estimator.n_batch_iter_
    payload['nIter'] = estimator.n_iter_
    payload['perplexity'] = estimator.perplexity(content['data'])
    payload['score'] = estimator.score(content['data'])

    body = json.dumps(payload, ignore_nan=True)
    return httpWrapper(body)
Esempio n. 33
0
class DMFVI(Model):
    """Topic model backed by scikit-learn's ``LatentDirichletAllocation``.

    ``self.topic_dim`` and ``self.reader`` are read but not set in this class;
    presumably the ``Model`` base class provides them -- confirm.
    """
    MODEL_NAME = "dmfvi"
    # Only these keys are forwarded to LatentDirichletAllocation; values from
    # the user-supplied cfg override the defaults below.
    _default_cfg = {
        "learning_method": "batch",
        "max_iter": 10,
        "batch_size": 128,
        "perp_tol": 0.1,
        "evaluate_every": 10
    }

    def __init__(self, cfg, train_cfg):
        """Merge ``cfg`` over the defaults and build the underlying LDA model."""
        super(DMFVI, self).__init__(cfg, train_cfg)

        # Deep-copy so the class-level defaults are never mutated.
        self.cfg = copy.deepcopy(self._default_cfg)
        self.cfg.update(cfg)
        # Drop any cfg key that is not a known LDA option.
        model_kwargs = {k: v for k, v in self.cfg.iteritems() if k in self._default_cfg}
        self.model = LatentDirichletAllocation(n_components=self.topic_dim, verbose=2, **model_kwargs)
        print("DMFVI: Use model configration:\n{}".format("\n".join("\t{:30}: {}".format(k, v) for k, v in sorted(model_kwargs.iteritems(), key=lambda item: item[0]))))

    def perplexity(self, x):
        """Return the model's perplexity on ``x`` (converted with ``np.array``)."""
        return self.model.perplexity(np.array(x))

    def topic_prop(self, x):
        """Return ``self.model.transform(x)``."""
        return self.model.transform(x)

    @property
    def topic_components(self):
        """The fitted model's ``components_`` attribute."""
        return self.model.components_

    def train(self):
        """Fit the model on the reader's "train" split, skipping empty items."""
        train_data = self.reader.get_data_from_type("train")
        # Items equal to [] are skipped before being encoded with reader.onehot.
        self.model.fit(np.array([self.reader.onehot(data) for data in train_data if data != []]))
        print ("{}: trained for {} epochs; {} EM iterations.".format(datetime.now(), self.model.n_iter_, self.model.n_batch_iter_))

    def save(self, path):
        """Pickle the fitted model to ``path``.

        The file is opened in binary mode and closed deterministically; the
        previous text-mode handle was never closed, so data could stay
        unflushed, and text mode is not portable for pickle data.
        """
        with open(path, "wb") as model_file:
            cPickle.dump(self.model, model_file)

    def load(self, path):
        """Replace ``self.model`` with the model pickled at ``path``.

        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        with open(path, "rb") as model_file:
            self.model = cPickle.load(model_file)
Esempio n. 34
0
def run_multiple_LDA(biom_data, file_name, n_com_list):
    '''Return a list of LDA models, one per community count in n_com_list.

    Takes ``biom_data.matrix_data``, transposes it and casts it to int, then
    fits a batch scikit-learn LDA model for every entry of ``n_com_list``.
    For each model the training perplexity and the elapsed wall-clock time
    are printed, and the model is pickled (appended in order) to
    ``file_name``.

    Args:
        biom_data: Object exposing ``matrix_data`` (presumably a biom-format
            table with a sparse matrix -- confirm with caller).
        file_name: Path of the pickle file. It is created or truncated even
            when ``n_com_list`` is empty.
        n_com_list: Iterable of topic/community counts.

    Returns:
        List of fitted models in the order of ``n_com_list``.
    '''
    models = []
    SampleX = biom_data.matrix_data.transpose().astype('int')
    # The context manager guarantees the pickle stream is flushed and closed,
    # also when a fit raises; the handle used to be left open.
    with open(file_name, 'wb') as f:
        for i in n_com_list:
            starttime = time.time()
            model = LatentDirichletAllocation(n_components=i,
                                              learning_method='batch',
                                              max_iter=100,
                                              evaluate_every=10,
                                              max_doc_update_iter=100)
            model.fit(SampleX)
            print('perplexity', model.perplexity(SampleX))
            endtime = time.time()
            print(endtime - starttime)
            pickle.dump(model, f)
            models.append(model)
    return models
Esempio n. 35
0
def plot_perplexity_iter(A_tfidf, num_topics):
    """Plot training perplexity of online LDA against the EM iteration count.

    Fits a fresh model for ``max_iter = 1 .. 5`` sweeps, records
    ``perplexity(A_tfidf)`` and the model's ``n_batch_iter_`` for each, saves
    the perplexities to ``./data/perplexity_iter.npy`` and the figure to
    ``./figures/perplexity_iter.png``.

    Args:
        A_tfidf: Matrix passed to ``fit`` and ``perplexity`` (presumably a
            document-term matrix -- confirm with caller).
        num_topics: Number of topics for every fitted model.

    Returns:
        None. Results are printed, plotted and written to disk.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("computing perplexity vs iter...")
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1, max_iter + 1):
        # `n_components` is the current name of the former `n_topics`
        # keyword (renamed in scikit-learn 0.19, old name removed in 0.21).
        lda = LatentDirichletAllocation(n_components=num_topics,
                                        max_iter=sweep,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        # x-axis of the plot: mini-batch EM iterations actually performed.
        em_iter.append(lda.n_batch_iter_)

    np.save('./data/perplexity_iter.npy', perplexity)

    f = plt.figure()
    plt.plot(em_iter,
             perplexity,
             color='b',
             marker='o',
             lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
Esempio n. 36
0
File: lda.py Progetto: dpakpdl/NLP
def analyser(data):
    """Vectorize ``data``, fit a 20-topic online LDA model and print diagnostics.

    Printed, in order: the document-topic output of ``fit_transform``, the
    log-likelihood score, the perplexity, and the model parameters.
    Returns None.
    """
    # Only the second element returned by get_vectorized_data is used.
    vectorized = get_vectorized_data(data)[1]

    lda_settings = dict(
        n_components=20,            # number of topics
        max_iter=10,                # max learning iterations
        learning_method='online',
        random_state=100,           # seed
        batch_size=128,             # docs per learning iteration
        evaluate_every=-1,          # do not evaluate perplexity during fit
        n_jobs=-1,                  # use all available CPUs
    )
    model = LatentDirichletAllocation(**lda_settings)

    doc_topic = model.fit_transform(vectorized)
    print(doc_topic)

    # Higher log-likelihood is better.
    log_likelihood = model.score(vectorized)
    print("Log Likelihood: ", log_likelihood)

    # Lower perplexity is better: exp(-1. * log-likelihood per word).
    model_perplexity = model.perplexity(vectorized)
    print("Perplexity: ", model_perplexity)

    # Dump the full parameter set of the estimator.
    pprint(model.get_params())
Esempio n. 37
0
def Choosing_n_components(n_features, var_n_components, ngram_range,
                          train_data, stop_words):
    """Return the training perplexity of online LDA for each topic count.

    For every value in ``var_n_components`` the training texts are
    vectorized with ``CountVectorizer`` and an online LDA model with
    ``random_state=0`` is fitted on the counts; its perplexity on the same
    counts is collected.

    Args:
        n_features: ``max_features`` for the ``CountVectorizer``.
        var_n_components: Iterable of topic counts to evaluate.
        ngram_range: ``ngram_range`` for the ``CountVectorizer``.
        train_data: Sequence of raw documents (sliced and passed to
            ``CountVectorizer.fit_transform``).
        stop_words: ``stop_words`` for the ``CountVectorizer``.

    Returns:
        List of perplexities, one per entry of ``var_n_components``, in order.
    """
    # The whole data set is used for training; the commented-out 0.7 factor
    # and the test-split lines are the hold-out variant.
    # split_ratio = int(len(train_data)*0.7)
    split_ratio = int(len(train_data) * 1)
    perplexities = []

    for i in var_n_components:
        print("\n Start LDA iteration with var_n_components")
        n_components = i
        ngram_tf = CountVectorizer(stop_words=stop_words,
                                   ngram_range=ngram_range,
                                   max_features=n_features)
        ngram_tf_train = ngram_tf.fit_transform(train_data[:split_ratio])
        # ngram_tf_test = ngram_tf.transform(train_data[split_ratio:])

        print("ngram_tf_train_fit_transformed:", type(ngram_tf_train),
              "np.shape:", np.shape(ngram_tf_train))
        print(
            "Fitting LDA models with tf features,",
            "n_components = %d, n_features = %d" % (n_components, n_features))

        lda = LatentDirichletAllocation(n_components=n_components,
                                        learning_method='online',
                                        random_state=0)

        # fit_transform already fits the model. The separate lda.fit() call
        # that used to precede it trained a model that was immediately
        # re-initialised (fixed integer seed) and thrown away, doubling the
        # run time without changing the result.
        lda_train = lda.fit_transform(ngram_tf_train)
        print("lda_train_data:", np.shape(lda_train))
        # lda_test = lda.transform(ngram_tf_test)
        # print("lda_test:", type(lda_test), "np.shape:", np.shape(ngram_tf_test))

        lda_train_perplexity = lda.perplexity(ngram_tf_train)
        perplexities.append(lda_train_perplexity)
        print("lda_train_perplexity:", lda_train_perplexity)

    return perplexities
# Count vectorizer: tokens are runs of at least three ASCII letters/digits.
vectorizer = CountVectorizer(stop_words=stop_words,token_pattern='[a-zA-Z0-9]{3,}',)

# Use a list of the full documents as the input, not the tokens
data_vectorized=vectorizer.fit_transform(tlj['Reviews'])

# Build sklearn LDA model
# NOTE(review): per the scikit-learn docs `batch_size` is only used by online
# learning, so it presumably has no effect with learning_method='batch'.
skl_lda_model = LatentDirichletAllocation(n_components=20,        # Let's start on the higher end of topics
                                         max_iter=10,
                                         learning_method='batch',
                                         random_state=100,
                                         batch_size=128,
                                         evaluate_every= -1,  # Don't compute perplexity with every iteration
                                         n_jobs  = -1         # Use all available CPUs
                                         )

# Fit model and time the fit
start_time = time.time()
skl_lda_model.fit(data_vectorized)
end_time = time.time()

# Print metrics and params
print("Model Fit Time:", end_time-start_time)
print("Log-Likelihood: ", skl_lda_model.score(data_vectorized))
print("Perplexity: ", skl_lda_model.perplexity(data_vectorized))
# get_params must be *called*; without the parentheses pprint only showed the
# bound-method object instead of the parameter dictionary.
pprint(skl_lda_model.get_params())

# Save results; each file is closed (and therefore flushed) as soon as its
# object has been written instead of leaving the handles open.
with open('../../../data/pickles/lda/lda_skl_default_vectorizer.pkl', 'wb') as pickle_file:
    pickle.dump(vectorizer, pickle_file)
with open('../../../data/pickles/lda/lda_skl_default_data_vectorized.pkl', 'wb') as pickle_file:
    pickle.dump(data_vectorized, pickle_file)
with open('../../../data/pickles/lda/lda_skl_default_model.pkl', 'wb') as pickle_file:
    pickle.dump(skl_lda_model, pickle_file)
Esempio n. 39
0
# Notebook-style exploration (Python 2 syntax): sweep the number of LDA topics
# and record score and perplexity for each. Bare expressions are display
# cells and have no effect when run as a script.
vectorizer.get_feature_names()

# Dense DataFrame of the vectorized corpus.
# NOTE(review): the feature-name list is wrapped in another list before being
# passed as `columns`; this looks unintended -- confirm.
vect_df = pd.DataFrame(X.toarray(), columns=[vectorizer.get_feature_names()])
vect_df.shape
vect_df.head()

# Topic counts to evaluate: 1 .. 19.
lda_range= range(1,20)
lda_eval = []

for n in lda_range:
    # NOTE(review): `n_topics` was renamed `n_components` in scikit-learn 0.19
    # and removed in 0.21 -- confirm the targeted version.
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(vect_df)
    # Both metrics are computed on the training data itself.
    score = lda.score(vect_df)
    perplexity = lda.perplexity(vect_df)
    print n,score,perplexity
    lda_eval.append({'topics':n,'score':score,'perplexity':perplexity})

# Dump the collected results, one dict per topic count.
for item in lda_eval:
    print item

# Refit with 5 topics on all CPUs; no random_state is set here, so results
# are not reproducible between runs.
lda = LatentDirichletAllocation(n_topics=5, n_jobs=-1)


# Document-topic output for every row of vect_df.
topics = lda.fit_transform(vect_df)
lda.perplexity(vect_df)
lda.score(vect_df)
# Inspect row 2545: its topic output and, from `df`, its `text` value.
# NOTE(review): `.ix` was removed in pandas 1.0 -- confirm the pandas version.
topics[2545]
df.ix[2545].text
Esempio n. 40
0
# Notebook-style script: fit a 10-topic online LDA on `corpusVect`, print the
# top words per topic, then prepare the same pipeline for the titles.
# NOTE(review): n_features is assigned but not used in the visible lines.
n_features = 1000
n_topics = 10
n_top_words = 20

# NOTE(review): the `n_topics` keyword was renamed `n_components` in
# scikit-learn 0.19 and removed in 0.21 -- confirm the targeted version.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

lda.fit(corpusVect)

tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


# Bare expressions: only displayed in an interactive session.
lda.score(corpusVect)
lda.perplexity(corpusVect)

#### Titles

corp2 = dataWeek.title
# NOTE(review): fit() is called on one throwaway instance and transform() on
# another new instance, so nothing fitted is reused here -- confirm whether
# CleanTextTransformer is stateless.
CleanTextTransformer().fit(corp2)
corpCTT2 = CleanTextTransformer().transform(corp2)

# Re-fits the shared `vectorizer` on the cleaned titles, replacing whatever
# vocabulary it had learned before.
corpCTTvect = vectorizer.fit_transform(corpCTT2)
corpusTitlesVect = pd.DataFrame(corpCTTvect.todense(),columns=vectorizer.get_feature_names())

# Second model with the same settings; it is only constructed here, not
# fitted within the visible lines.
lda2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
for n in range(2,16):
Esempio n. 41
0
    f = plt.figure()
    plt.matshow(topics, cmap = 'gray')   
    plt.gca().set_aspect('auto')
    plt.title('learned topic matrix')
    plt.ylabel('topics')
    plt.xlabel('dictionary')
    plt.show()
    f.savefig('./figures/topic.png')
     
    #topic proportions matrix: D x K
    #note: np.sum(H, axis=1) is not 1
    H = lda_vb.transform(A_tfidf_sp)
    
    f = plt.figure()
    plt.matshow(H, cmap = 'gray')   
    plt.gca().set_aspect('auto')
    plt.show()
    plt.title('topic proportions')
    plt.xlabel('topics')
    plt.ylabel('documents')
    f.savefig('./figures/proportions.png')
                
    #compute perplexity
    print "perplexity: %.2f" % lda_vb.perplexity(A_tfidf_sp)    
    plot_perplexity_iter(A_tfidf_sp, num_topics)
    plot_perplexity_topics(A_tfidf_sp)
    plot_perplexity_batch(A_tfidf_sp, A_tfidf_sp.shape[0])

    print "LDA topics:"
    display_topics(lda_vb, tfidf_dict, 20)
            
        for i in range(int(max_iter / valid_iter)):
            train_s = []
            test_s = []
            train_p = []
            test_p = []

            print '\ntraining ', i * valid_iter + 1, '-th iteration'

            for train_index, test_index in splited_index:
                train_data, test_data = dataset[train_index], dataset[test_index]
                lda_model.partial_fit(train_data)

                train_s.append(lda_model.score(train_data))
                test_s.append(lda_model.score(test_data))

                train_p.append(lda_model.perplexity(train_data))
                test_p.append(lda_model.perplexity(test_data))

            train_scores.append(train_s)
            test_scores.append(test_s)
            train_perplexities.append(train_p)
            test_perplexities.append(test_p)

            print "train_scores: ", train_scores[i], " test_scores: ", test_scores[i], " train_perplexities: ", train_perplexities[i], " test_perplexities: ", test_perplexities[i]


        dict_num_topic[str(n_component) + '_topics'] = {
            "max_iter": max_iter, "valid_iter": valid_iter,
            "train_scores": train_scores, "test_scores": test_scores,
            "train_perplexities": train_perplexities, "test_perplexities": test_perplexities
        }
Esempio n. 43
0
    max_iter=10,  # Max learning iterations
    random_state=100,  # Random state (seed)
    learning_method='online',
    batch_size=128,  # No of docs in each iter
    evaluate_every=-1,  # Compute perplexity every n iters
    n_jobs=-1)  # Use all available CPUs

# Fit the model and keep the document-topic output.
lda_output = lda_model.fit_transform(samples)
print(lda_model)

# Diagnose model performance with perplexity and log-likelihood.
# Both lines use a single pre-formatted argument so they print the same text
# on Python 2 and Python 3 (the original mixed a print statement with a
# two-argument print call, which cannot both work under one interpreter).
# The two spaces after the colon reproduce the original output exactly.
# Log likelihood: higher is better.
print("Log Likelihood:  %s" % lda_model.score(samples))

# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity:  %s" % lda_model.perplexity(samples))

# See model parameters
pprint(lda_model.get_params())

# Perform GridSearch for the best LDA model
# Define the search grid
search_params = {
    'n_components': [6, 7, 8, 9],  # candidate topic counts
    'learning_decay': [0.5, 0.7, 0.9],
    'max_iter': [6, 7, 8, 9],
    'random_state': [2018]
}

# Init the model with default parameters; the grid above supplies the values.
lda = LatentDirichletAllocation()
Esempio n. 44
0
# Notebook-style script (Python 2 syntax): vectorize the blog article bodies,
# sweep the number of LDA topics and scatter-plot perplexity against it.
tf = tf_vectorizer.fit_transform(blogs.article_body)



lda_eval2 = []

# Topic counts to evaluate.
ldaRANGE = [9,10,11,12,13,14,15,16,17,18,19,20,30,40,50,60,70,80,90,100,150,200,300]

for n in ldaRANGE:
    # NOTE(review): `n_topics` was renamed `n_components` in scikit-learn 0.19
    # and removed in 0.21 -- confirm the targeted version.
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    # Both metrics are computed on the training matrix itself.
    score = lda.score(tf)
    perplexity = lda.perplexity(tf)
    print n,score,perplexity
    lda_eval2.append({'topics':n,'score':score,'perplexity':perplexity})

# Dump the collected results, one dict per topic count.
for item in lda_eval2:
    print item

# One row per evaluated topic count; columns come from the dict keys.
lda_eval22 = pd.DataFrame(lda_eval2)

# Bare expression: only displayed in an interactive session.
lda_eval22

import matplotlib.pyplot as plt

lda_eval22
# Perplexity versus number of topics.
plt.style.use('ggplot')
plt.scatter(lda_eval22['topics'],lda_eval22['perplexity'])