Ejemplo n.º 1
0
def perform_experiment(args):
    """Run several seeded launches of ``optimizer`` and save the results.

    The function accepts a single tuple so it can be passed directly to
    ``Pool.map``. The tuple is unpacked inside the body because tuple
    parameters in a ``def`` signature are a syntax error on Python 3
    (PEP 3113); unpacking here works on both Python 2 and Python 3.

    Args:
        args: A 7-tuple ``(train_n_dw_matrix, test_n_dw_matrix, optimizer,
            T, samples, init_iters, output_path)`` where

            * ``train_n_dw_matrix`` is given to the callback, to the
              initial sampling step and to ``optimizer.run``;
            * ``test_n_dw_matrix`` is given to the callback only;
            * ``optimizer`` is run on the phi/theta produced by the
              initial sampling step;
            * ``T`` is forwarded to ``experiments.default_sample``;
            * ``samples`` is the number of seeds (``0 .. samples - 1``);
            * ``init_iters`` is the number of ``Trivial``-regularized
              iterations used by the initial optimizer;
            * ``output_path`` is forwarded to the callback's
              ``save_results``.
    """
    (
        train_n_dw_matrix, test_n_dw_matrix, optimizer,
        T, samples, init_iters, output_path
    ) = args

    init_optimizer = default.Optimizer([regularizers.Trivial()] * init_iters)
    callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix
    )
    # Both optimizers share one callback object, so the initial iterations
    # and the main iterations are recorded into the same launch.
    init_optimizer.iteration_callback = callback
    optimizer.iteration_callback = callback
    for seed in range(samples):
        print(seed)
        # finish_launch=False: the launch is closed explicitly below, after
        # the main optimizer has also run.
        plsa_phi, plsa_theta = experiments.default_sample(
            train_n_dw_matrix=train_n_dw_matrix,
            T=T,
            seed=seed,
            optimizer=init_optimizer,
            finish_launch=False,
        )
        optimizer.run(train_n_dw_matrix, plsa_phi, plsa_theta)
        if optimizer.iteration_callback:
            optimizer.iteration_callback.finish_launch()

    optimizer.iteration_callback.save_results(output_path)
Ejemplo n.º 2
0
def basic_experiment(data_list, tau_list, num_topics):
    """Run ``perform_experiment`` for every (data file, tau) combination.

    Args:
        data_list: Paths of the saved (pickled) n_dw matrices.
        tau_list: Values passed as ``tau``; it should contain ``None`` so
            that results can be compared with the default behaviour.
        num_topics: Not used by this function.
            NOTE(review): the fourth positional argument of
            ``perform_experiment`` is hard-coded to 100 below; confirm
            whether ``num_topics`` was meant to be passed there.
    """
    regularization_list = [regularizers.Trivial()] * ITERS_COUNT
    for data_addr in data_list:
        # NOTE: pickle.load can execute arbitrary code; only feed this
        # function data files from a trusted source.
        with open(data_addr, 'rb') as f:
            data = pickle.load(f)
        # The original generation step produces a plain list, hence the
        # conversion through a dense array to a CSR matrix.
        data = sparse.csr_matrix(np.array(data))
        for t in tau_list:
            perform_experiment(
                data, None, default.Optimizer(regularization_list), 100,
                SAMPLES,
                output_path='exp_{}_{}.pkl'.format(t, data_addr),
                tau=t,
                path_phi_output='expphi_{}_{}.pkl'.format(t, data_addr)
            )
Ejemplo n.º 3
0
def perform_experiment(train_n_dw_matrix, test_n_dw_matrix, T, num_2_token):
    """Compare a gensim ``LdaModel`` with the default optimizer per seed.

    For each seed in a fixed list, a gensim model and a model trained by
    ``default.Optimizer`` are fitted on the training matrix. Both
    phi/theta pairs are passed to the same metrics callback (index 0 for
    the default optimizer, index 1 for gensim) and the collected values
    are printed.

    Args:
        train_n_dw_matrix: Training matrix; iterated row by row, each row
            must expose ``indices`` and ``data``, and the matrix must have
            a two-element ``shape``.
        test_n_dw_matrix: Test matrix forwarded to the metrics callback.
        T: Number of topics.
        num_2_token: Mapping passed to gensim as ``id2word``.
    """
    # Materialise each document as a list of (index, value) pairs. On
    # Python 3 a bare zip() is a one-shot iterator and would be empty
    # after the first pass, but the corpus is consumed several times
    # below (LdaModel, get_theta, log_perplexity).
    train_corpus = [
        list(zip(row.indices, row.data)) for row in train_n_dw_matrix
    ]

    for seed in [42, 7, 777, 12]:
        model = LdaModel(train_corpus,
                         alpha='auto',
                         id2word=num_2_token,
                         num_topics=T,
                         iterations=500,
                         random_state=seed)
        gensim_phi = exp_common.get_phi(model)
        gensim_theta = exp_common.get_theta(train_corpus, model)
        print('gensim perplexity')
        print(np.exp(-model.log_perplexity(train_corpus)))

        # Same seed as gensim: phi starts from uniform random counters,
        # theta from all-ones counters.
        D, W = train_n_dw_matrix.shape
        random_gen = np.random.RandomState(seed)
        phi = common.get_prob_matrix_by_counters(
            random_gen.uniform(size=(T, W)).astype(np.float64))
        theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T)).astype(np.float64))
        phi, theta = default.Optimizer([regularizers.Additive(0.1, 0.)] * 100,
                                       verbose=False).run(
                                           train_n_dw_matrix, phi, theta)

        callback = experiments.default_callback(
            train_n_dw_matrix=train_n_dw_matrix,
            test_n_dw_matrix=test_n_dw_matrix,
            top_pmi_sizes=[5, 10, 20, 30],
            top_avg_jaccard_sizes=[10, 50, 100, 200],
            measure_time=True)
        callback.start_launch()
        # Slot 0 holds the default-optimizer model, slot 1 the gensim model.
        callback(0, phi, theta)
        callback(1, gensim_phi, gensim_theta)

        print('artm')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[0]))

        print('gensim')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[1]))
Ejemplo n.º 4
0
            topic_0_indices.append(index)
        elif target == 1:
            topic_1_indices.append(index)

    thetaless_rels = []
    lda_rels = []
    for balance in range(10, 201, 10):
        print(balance)
        n_dw_matrix = _n_dw_matrix[topic_0_indices +
                                   topic_1_indices * balance, :]
        regularization_list = [regularizers.Additive(-0.1, 0.)] * 100
        lda_phi, lda_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=default.Optimizer(regularization_list, verbose=False))
        thetaless_phi, thetaless_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=thetaless.Optimizer(regularization_list, verbose=False))
        # print(np.argmax(thetaless_theta[:len(topic_0_indices), :2], axis=1).mean())
        # print(np.argmax(thetaless_theta[len(topic_0_indices):, :2], axis=1).mean())
        # print('!')
        # for topic_set in metrics.get_top_words(thetaless_phi, 10):
        #     print('\n\t'.join(map(num_2_token.get, topic_set)))
        #     print()
        # for topic_set in metrics.get_top_words(thetaless_phi, 5):
        #     print('\n\t'.join(map(num_2_token.get, topic_set)))
        #     print()
        print('lda')
Ejemplo n.º 5
0
    # Keep only the first two values returned for the eight selected
    # 20 Newsgroups categories and use them as the train and test matrices.
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        [
            'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
            'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
            'sci.space'
        ],
        train_proportion=0.8)[:2]

    # One argument tuple per (T, theta_alpha, optimizer kind) combination.
    args_list = list()
    for T in [10, 25]:
        # NOTE(review): 0.1 appears twice in this list, so the 0.1 runs are
        # queued twice and target the same output paths; confirm whether a
        # different third value was intended.
        for theta_alpha in [0.1, 0.01, 0.1]:
            # The same regularizer instance is repeated for every iteration.
            regularization_list = [regularizers.Additive(0, theta_alpha)
                                   ] * ITERS_COUNT
            # Default optimizer.
            args_list.append(
                (train_n_dw_matrix, test_n_dw_matrix,
                 default.Optimizer(regularization_list), T, SAMPLES,
                 '20news_experiment/20news_{}t_default_{}_{}.pkl'.format(
                     T, 0., theta_alpha)))
            # Thetaless optimizer.
            args_list.append(
                (train_n_dw_matrix, test_n_dw_matrix,
                 thetaless.Optimizer(regularization_list), T, SAMPLES,
                 '20news_experiment/20news_{}t_thetaless_{}_{}.pkl'.format(
                     T, 0., theta_alpha)))
            # Transfer-thetaless optimizer.
            args_list.append(
                (train_n_dw_matrix, test_n_dw_matrix,
                 transfer_thetaless.Optimizer(regularization_list), T, SAMPLES,
                 '20news_experiment/20news_{}t_transfer_thetaless_{}_{}.pkl'.
                 format(T, 0., theta_alpha)))

    #manager.perform_experiment(args_list[0])
    #manager.perform_experiment(args_list[1])
Ejemplo n.º 6
0
from pyartm.optimizations import default

import manager

if __name__ == '__main__':
    # Train/test matrices: the first two values returned for the eight
    # selected 20 Newsgroups categories.
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        [
            'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
            'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
            'sci.space'
        ],
        train_proportion=0.8)[:2]

    output_template = '20news_experiment/20news_{}t_{}_{}.pkl'
    tau_values = [1e7, 1e8, 1.5e8, 2e8, 2.5e8, 3e8, 3.5e8, 4e8, 4.5e8, 5e8]

    # One argument tuple per (T, tau, use_old_phi) combination.
    args_list = []
    for T in [10, 30]:
        for tau in tau_values:
            for use_old_phi in [True]:  # [False, True]
                # Decorrelator combined with an additive regularizer; the
                # same combined instance is reused for all 500 iterations.
                combined = regularizers.Combination(
                    regularizers.Decorrelator(tau, use_old_phi),
                    regularizers.Additive(-0.01, -0.01),
                )
                regularization_list = [combined] * 500
                experiment_args = (
                    train_n_dw_matrix, test_n_dw_matrix,
                    default.Optimizer(regularization_list), T, 10,
                    output_template.format(T, int(tau), use_old_phi))
                args_list.append(experiment_args)

    # Run the queued experiments across 8 worker processes.
    Pool(processes=8).map(manager.perform_experiment, args_list)
Ejemplo n.º 7
0
def get_optimizer(phi_alpha, iters_count):
    """Build a default optimizer with one Additive regularizer per iteration.

    Args:
        phi_alpha: First argument of ``regularizers.Additive``; the second
            argument is fixed at ``0.``.
        iters_count: Length of the regularization list.

    Returns:
        A ``default.Optimizer`` whose regularization list repeats the same
        ``Additive`` instance ``iters_count`` times.
    """
    additive = regularizers.Additive(phi_alpha, 0.)
    schedule = [additive] * iters_count
    return default.Optimizer(schedule)
Ejemplo n.º 8
0
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        [
            'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
            'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
            'sci.space'
        ],
        train_proportion=0.8)[:2]

    args_list = list()
    for T in [10, 25]:
        plsa_list = [regularizers.Trivial()] * ITERS_COUNT
        sparse_lda_list = [regularizers.Additive(-1, 0.)] * ITERS_COUNT

        args_list.append(
            (train_n_dw_matrix, test_n_dw_matrix,
             default.Optimizer(sparse_lda_list), T, SAMPLES, INIT_ITERS,
             '20news_experiment/20news_{}t_post_lda.pkl'.format(T)))
        args_list.append(
            (train_n_dw_matrix, test_n_dw_matrix,
             obd.Optimizer(
                 plsa_list,
                 gamma_tw_min_delta=1,
             ), T, SAMPLES, INIT_ITERS,
             '20news_experiment/20news_{}t_post_obd_limited.pkl'.format(T)))
        args_list.append(
            (train_n_dw_matrix, test_n_dw_matrix,
             naive_obd.Optimizer(
                 plsa_list,
                 gamma_tw_min_delta=1,
             ), T, SAMPLES, INIT_ITERS,
             '20news_experiment/20news_{}t_post_naive_obd_limited.pkl'.format(
Ejemplo n.º 9
0
def perform_doc_experiment(args):
    """Measure per-iteration test accuracy of an SVM trained on theta.

    The function accepts a single tuple so it can be used with
    ``Pool.map``. The tuple is unpacked inside the body because tuple
    parameters in a ``def`` signature are a syntax error on Python 3
    (PEP 3113); unpacking here works on both Python 2 and Python 3.

    For every seed a model is sampled on the training matrix, an SVC is
    fitted on the resulting theta, and three optimizers (default with
    ``const_phi=False``, default with ``const_phi=True``, thetaless) are
    run on the test matrix. After each of their iterations the accuracy of
    the SVC on the current theta is recorded. All results are stored via
    ``callbacks.save_results``.

    Args:
        args: An 8-tuple ``(n_dw_matrix_doc_train, doc_targets_doc_train,
            n_dw_matrix_doc_test, doc_targets_doc_test, optimizer, T,
            samples, output_path)``. ``optimizer`` trains the model on the
            training matrix, and the first 10 entries of its
            ``regularization_list`` configure the three test-time
            optimizers. ``samples`` is the number of seeds.
    """
    (n_dw_matrix_doc_train, doc_targets_doc_train, n_dw_matrix_doc_test,
     doc_targets_doc_test, optimizer, T, samples, output_path) = args

    D, _ = n_dw_matrix_doc_test.shape
    svm_train_score = metrics.create_svm_score_function(doc_targets_doc_train,
                                                        verbose=False)
    test_regularizers = optimizer.regularization_list[:10]
    opt_plsa_not_const_phi = default.Optimizer(
        regularization_list=test_regularizers, const_phi=False)
    opt_plsa_const_phi = default.Optimizer(
        regularization_list=test_regularizers, const_phi=True)
    opt_artm_thetaless = thetaless.Optimizer(
        regularization_list=test_regularizers)

    def make_recorder(classifier, scores):
        # Build an iteration callback that appends the classifier's test
        # accuracy on the current theta to `scores`. The classifier is
        # bound explicitly so the callback never sees a later seed's model.
        def record(it, phi, theta):
            scores.append(
                accuracy_score(classifier.predict(theta),
                               doc_targets_doc_test))
        return record

    res_plsa_not_const_phi = []
    res_plsa_const_phi = []
    res_artm_thetaless = []
    cv_fold_scores = []
    cv_test_scores = []

    for seed in range(samples):
        print(seed)
        phi, theta = experiments.default_sample(n_dw_matrix_doc_train, T, seed,
                                                optimizer)
        (best_C, best_gamma, cv_fold_score,
         cv_test_score) = svm_train_score(theta)
        cv_fold_scores.append(cv_fold_score)
        cv_test_scores.append(cv_test_score)

        print('Fold score: {}\tTest score: {}'.format(cv_fold_score,
                                                      cv_test_score))
        algo = SVC(C=best_C, gamma=best_gamma).fit(theta,
                                                   doc_targets_doc_train)
        # Every test-time optimizer starts from the same all-ones counters.
        init_theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T), dtype=np.float64))

        plsa_not_const_phi = []
        plsa_const_phi = []
        artm_thetaless = []

        opt_plsa_not_const_phi.iteration_callback = make_recorder(
            algo, plsa_not_const_phi)
        opt_plsa_const_phi.iteration_callback = make_recorder(
            algo, plsa_const_phi)
        opt_artm_thetaless.iteration_callback = make_recorder(
            algo, artm_thetaless)

        for opt in [
                opt_plsa_not_const_phi, opt_plsa_const_phi, opt_artm_thetaless
        ]:
            opt.run(n_dw_matrix_doc_test, phi, init_theta)

        res_plsa_not_const_phi.append(plsa_not_const_phi)
        res_plsa_const_phi.append(plsa_const_phi)
        res_artm_thetaless.append(artm_thetaless)

    callbacks.save_results(
        {
            'res_plsa_not_const_phi': res_plsa_not_const_phi,
            'res_plsa_const_phi': res_plsa_const_phi,
            'res_artm_thetaless': res_artm_thetaless,
            'cv_fold_scores': cv_fold_scores,
            'cv_test_scores': cv_test_scores
        }, output_path)
from multiprocessing import Pool

from pyartm_datasets import main_cases
from pyartm import regularizers
from pyartm.optimizations import default

import manager

if __name__ == '__main__':
    # Only the first value returned for the four sci.* categories is used.
    train_n_dw_matrix = main_cases.get_20newsgroups(
        ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'])[0]

    T = 10
    path_template = 'alpha_exp/20news_{}t_{}_{}.pkl'
    phi_alphas = [-10**(-i) for i in range(30)]

    # One argument tuple per (phi_alpha, theta_alpha) pair.
    args_list = []
    for phi_alpha in phi_alphas:
        for theta_alpha in [-0.1, 0., 0.1]:
            # The same Additive instance is repeated for all 100 iterations.
            additive = regularizers.Additive(phi_alpha, theta_alpha)
            regularization_list = [additive] * 100
            experiment_args = (
                train_n_dw_matrix,
                default.Optimizer(regularization_list),
                T,
                100,
                path_template.format(T, phi_alpha, theta_alpha))
            args_list.append(experiment_args)

    # Run the queued experiments across 5 worker processes.
    Pool(processes=5).map(manager.perform_alpha_dependency_experiment,
                          args_list)