def perform_experiment(train_n_dw_matrix, test_n_dw_matrix, T, num_2_token):
    """Fit gensim LDA and ``default.Optimizer`` per seed and print metrics.

    For every seed in a fixed list the function:

    1. trains a gensim ``LdaModel`` on the training corpus and prints
       ``exp(-model.log_perplexity(train_corpus))``;
    2. runs ``default.Optimizer`` with 100 ``regularizers.Additive(0.1, 0.)``
       steps, starting from a seeded random phi and an all-ones theta that
       are both passed through ``common.get_prob_matrix_by_counters``;
    3. hands both (phi, theta) pairs to ``experiments.default_callback`` and
       prints every entry of ``callback.launch_result`` for each of them.

    Args:
        train_n_dw_matrix: Training document-word matrix. It is iterated row
            by row and each row's ``indices`` / ``data`` are read, so it is
            presumably a ``scipy.sparse.csr_matrix`` -- TODO confirm.
        test_n_dw_matrix: Test document-word matrix, forwarded to the
            callback unchanged.
        T: Number of topics for both models.
        num_2_token: Mapping passed to gensim as ``id2word``.

    Returns:
        None. All results are printed to stdout.
    """
    # Each document must be a concrete list of (word_id, count) pairs.
    # On Python 3 a bare zip() is a one-shot iterator, so it would be
    # exhausted after the first pass over the corpus; every later pass
    # (further gensim iterations, get_theta, log_perplexity, and every seed
    # after the first) would silently see empty documents.
    train_corpus = [
        list(zip(row.indices, row.data)) for row in train_n_dw_matrix
    ]

    # The matrix shape does not depend on the seed, so read it once.
    D, W = train_n_dw_matrix.shape

    for seed in [42, 7, 777, 12]:
        model = LdaModel(train_corpus,
                         alpha='auto',
                         id2word=num_2_token,
                         num_topics=T,
                         iterations=500,
                         random_state=seed)
        gensim_phi = exp_common.get_phi(model)
        gensim_theta = exp_common.get_theta(train_corpus, model)
        print('gensim perplexity')
        print(np.exp(-model.log_perplexity(train_corpus)))

        random_gen = np.random.RandomState(seed)
        phi = common.get_prob_matrix_by_counters(
            random_gen.uniform(size=(T, W)).astype(np.float64))
        theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T)).astype(np.float64))
        phi, theta = default.Optimizer([regularizers.Additive(0.1, 0.)] * 100,
                                       verbose=False).run(
                                           train_n_dw_matrix, phi, theta)

        callback = experiments.default_callback(
            train_n_dw_matrix=train_n_dw_matrix,
            test_n_dw_matrix=test_n_dw_matrix,
            top_pmi_sizes=[5, 10, 20, 30],
            top_avg_jaccard_sizes=[10, 50, 100, 200],
            measure_time=True)
        callback.start_launch()
        # Slot 0 holds the optimizer's matrices, slot 1 holds gensim's.
        callback(0, phi, theta)
        callback(1, gensim_phi, gensim_theta)

        for label, slot in (('artm', 0), ('gensim', 1)):
            print(label)
            for name, values in callback.launch_result.items():
                print('\t{}: {}'.format(name, values[slot]))
Example #2
0
from pyartm import regularizers
from pyartm_datasets import main_cases
from pyartm.optimizations import timed_default

import manager

if __name__ == '__main__':
    # Newsgroup categories that make up the corpus for this run.
    categories = [
        'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
        'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
        'sci.space'
    ]
    # Only the first element of the loader's result is used here.
    n_dw_matrix = main_cases.get_20newsgroups(categories)[0]

    # 100 entries that all reference one Additive(0., 0.) regularizer.
    schedule = [regularizers.Additive(0., 0.)] * 100
    optimizer = timed_default.Optimizer(
        regularization_list=schedule,
        return_counters=True)

    # NOTE(review): the trailing 10 and 100 are positional arguments whose
    # meaning is defined by manager.perform_experiment -- confirm there.
    manager.perform_experiment(n_dw_matrix, optimizer, 10, 100)
Example #3
0
ITERS_COUNT = 100
SAMPLES = 5

if __name__ == '__main__':
    # Build the argument tuples for a grid of 20 Newsgroups experiments:
    # for every topic count T and every theta_alpha, one run per optimizer
    # (default, thetaless, transfer_thetaless), each with its own .pkl path.
    # Only the first two elements of the loader's result are kept.
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        [
            'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
            'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
            'sci.space'
        ],
        train_proportion=0.8)[:2]

    args_list = list()
    for T in [10, 25]:
        # NOTE(review): 0.1 is listed twice, so each (T, optimizer) pair is
        # queued twice with an identical output filename. One of the values
        # was presumably meant to differ -- confirm the intended grid.
        for theta_alpha in [0.1, 0.01, 0.1]:
            # The same Additive instance is referenced ITERS_COUNT times, and
            # this one list object is shared by all three optimizers below.
            regularization_list = [regularizers.Additive(0, theta_alpha)
                                   ] * ITERS_COUNT
            args_list.append(
                (train_n_dw_matrix, test_n_dw_matrix,
                 default.Optimizer(regularization_list), T, SAMPLES,
                 '20news_experiment/20news_{}t_default_{}_{}.pkl'.format(
                     T, 0., theta_alpha)))
            args_list.append(
                (train_n_dw_matrix, test_n_dw_matrix,
                 thetaless.Optimizer(regularization_list), T, SAMPLES,
                 '20news_experiment/20news_{}t_thetaless_{}_{}.pkl'.format(
                     T, 0., theta_alpha)))
            # NOTE(review): this call is cut off after the path template
            # below; the rest of the statement is missing from this file.
            args_list.append(
                (train_n_dw_matrix, test_n_dw_matrix,
                 transfer_thetaless.Optimizer(regularization_list), T, SAMPLES,
                 '20news_experiment/20news_{}t_transfer_thetaless_{}_{}.pkl'.
        'talk.politics.guns',
    ])
    topic_0_indices, topic_1_indices = [], []
    for index, target in enumerate(doc_targets):
        if target == 0:
            topic_0_indices.append(index)
        elif target == 1:
            topic_1_indices.append(index)

    thetaless_rels = []
    lda_rels = []
    for balance in range(10, 201, 10):
        print(balance)
        n_dw_matrix = _n_dw_matrix[topic_0_indices +
                                   topic_1_indices * balance, :]
        regularization_list = [regularizers.Additive(-0.1, 0.)] * 100
        lda_phi, lda_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=default.Optimizer(regularization_list, verbose=False))
        thetaless_phi, thetaless_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=thetaless.Optimizer(regularization_list, verbose=False))
        # print(np.argmax(thetaless_theta[:len(topic_0_indices), :2], axis=1).mean())
        # print(np.argmax(thetaless_theta[len(topic_0_indices):, :2], axis=1).mean())
        # print('!')
        # for topic_set in metrics.get_top_words(thetaless_phi, 10):
        #     print('\n\t'.join(map(num_2_token.get, topic_set)))
Example #5
0
import manager

ITERS_COUNT = 100
SAMPLES = 50

if __name__ == '__main__':
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_nips(
        train_proportion=0.8)[:2]

    args_list = list()
    for T in [20, 50]:
        for phi_alpha in [-0.1, 0., 0.1]:
            for theta_alpha in [-0.1, 0., 0.1]:
                regularization_list = [
                    regularizers.Additive(phi_alpha, theta_alpha)
                ] * ITERS_COUNT
                args_list.append(
                    (train_n_dw_matrix, test_n_dw_matrix,
                     default.Optimizer(regularization_list), T, SAMPLES,
                     'nips_experiment/NIPS_{}t_base_{}_{}.pkl'.format(
                         T, phi_alpha, theta_alpha)))
                args_list.append(
                    (train_n_dw_matrix, test_n_dw_matrix,
                     naive_thetaless.Optimizer(regularization_list), T,
                     SAMPLES,
                     'nips_experiment/NIPS_{}t_naive_{}_{}.pkl'.format(
                         T, phi_alpha, theta_alpha)))
                for use_B_cheat in [False, True]:
                    args_list.append(
                        (train_n_dw_matrix, test_n_dw_matrix,
Example #6
0
from pyartm.optimizations import default

import manager

if __name__ == '__main__':
    # Sweep the Decorrelator strength tau for two topic counts on the
    # 20 Newsgroups split and run every configuration in a worker pool.
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        [
            'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
            'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
            'sci.space'
        ],
        train_proportion=0.8)[:2]

    args_list = list()
    for T in [10, 30]:
        for tau in [1e7, 1e8, 1.5e8, 2e8, 2.5e8, 3e8, 3.5e8, 4e8, 4.5e8, 5e8]:
            for use_old_phi in [True]:  # [False, True]
                # 500 references to one combined regularizer: Decorrelator
                # with the current tau plus a fixed Additive(-0.01, -0.01).
                regularization_list = [
                    regularizers.Combination(
                        regularizers.Decorrelator(tau, use_old_phi),
                        regularizers.Additive(-0.01, -0.01),
                    )
                ] * 500
                # NOTE(review): the literal 10 sits where sibling scripts
                # pass SAMPLES -- presumably the sample count; confirm.
                args_list.append(
                    (train_n_dw_matrix, test_n_dw_matrix,
                     default.Optimizer(regularization_list), T, 10,
                     '20news_experiment/20news_{}t_{}_{}.pkl'.format(
                         T, int(tau), use_old_phi)))

    # Each tuple is handed to perform_experiment as a single argument.
    # The pool is closed and joined explicitly so worker processes are
    # released even when a task raises, instead of relying on garbage
    # collection at interpreter shutdown.
    pool = Pool(processes=8)
    try:
        pool.map(manager.perform_experiment, args_list)
    finally:
        pool.close()
        pool.join()
Example #7
0
def get_optimizer(phi_alpha, iters_count):
    """Return a ``default.Optimizer`` running ``iters_count`` Additive steps.

    The schedule holds ``iters_count`` references to a single
    ``regularizers.Additive(phi_alpha, 0.)`` instance.
    """
    step = regularizers.Additive(phi_alpha, 0.)
    # List repetition keeps every entry pointing at the same regularizer.
    schedule = [step] * iters_count
    return default.Optimizer(schedule)
Example #8
0
SAMPLES = 100
INIT_ITERS = 100

if __name__ == '__main__':
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        [
            'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
            'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
            'sci.space'
        ],
        train_proportion=0.8)[:2]

    args_list = list()
    for T in [10, 25]:
        plsa_list = [regularizers.Trivial()] * ITERS_COUNT
        sparse_lda_list = [regularizers.Additive(-1, 0.)] * ITERS_COUNT

        args_list.append(
            (train_n_dw_matrix, test_n_dw_matrix,
             default.Optimizer(sparse_lda_list), T, SAMPLES, INIT_ITERS,
             '20news_experiment/20news_{}t_post_lda.pkl'.format(T)))
        args_list.append(
            (train_n_dw_matrix, test_n_dw_matrix,
             obd.Optimizer(
                 plsa_list,
                 gamma_tw_min_delta=1,
             ), T, SAMPLES, INIT_ITERS,
             '20news_experiment/20news_{}t_post_obd_limited.pkl'.format(T)))
        args_list.append(
            (train_n_dw_matrix, test_n_dw_matrix,
             naive_obd.Optimizer(
Example #9
0
from pyartm import regularizers
from pyartm.optimizations import default
from pyartm.optimizations import thetaless
from pyartm.optimizations import naive_thetaless
from pyartm.optimizations import obd

if __name__ == '__main__':
    # train_n_dw_matrix = sparse.csr_matrix(np.array([
    #     [1, 1, 1, 0],
    #     [1, 0, 1, 1],
    #     [1, 1, 0, 1],
    # ]))
    train_n_dw_matrix = sparse.csr_matrix(
        np.random.RandomState(42).uniform(0, 1, size=(100, 1000)) < 0.3)
    regularization_list = [regularizers.Trivial()] * 500
    extra_opt = obd.Optimizer([regularizers.Additive(0, 0)] * 500,
                              verbose=False)

    for module in [default, thetaless, naive_thetaless]:
        print(module.__name__)
        optimizer = module.Optimizer(regularization_list, verbose=False)
        D, W = train_n_dw_matrix.shape
        T = 4
        random_gen = np.random.RandomState(47)
        phi_matrix = common.get_prob_matrix_by_counters(
            random_gen.uniform(size=(T, W)).astype(np.float64))
        theta_matrix = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T)).astype(np.float64))
        phi_matrix, theta_matrix = optimizer.run(train_n_dw_matrix, phi_matrix,
                                                 theta_matrix)
        mod_phi_matrix, mod_theta_matrix = extra_opt.run(