def perform_experiment(args):
    """Run several seeded launches of ``optimizer`` and save the metrics.

    The function takes ONE positional argument, a 7-tuple, so that it can
    be handed a ready-made argument tuple as a single value:

        (train_n_dw_matrix, test_n_dw_matrix, optimizer, T, samples,
         init_iters, output_path)

    For every seed in ``range(samples)`` an initial (phi, theta) pair is
    produced by ``experiments.default_sample`` using a ``default.Optimizer``
    with ``init_iters`` trivial regularizers, and then ``optimizer`` is run
    from that starting point. Both optimizers share a single callback, whose
    results are written to ``output_path`` at the end.

    The parameter used to be declared with Python 2 tuple-parameter syntax
    (``def f((a, b)):``), which is a syntax error on Python 3; unpacking
    inside the body keeps the one-tuple calling convention on both versions.
    """
    (
        train_n_dw_matrix, test_n_dw_matrix, optimizer,
        T, samples, init_iters, output_path
    ) = args

    init_optimizer = default.Optimizer([regularizers.Trivial()] * init_iters)
    callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix
    )
    # The same callback object is attached to both optimizers so that the
    # initialisation iterations and the main iterations are recorded together.
    init_optimizer.iteration_callback = callback
    optimizer.iteration_callback = callback

    for seed in range(samples):
        print(seed)
        # finish_launch=False is forwarded to default_sample; the launch is
        # finished explicitly below, after the main optimizer has run.
        plsa_phi, plsa_theta = experiments.default_sample(
            train_n_dw_matrix=train_n_dw_matrix,
            T=T,
            seed=seed,
            optimizer=init_optimizer,
            finish_launch=False,
        )
        optimizer.run(train_n_dw_matrix, plsa_phi, plsa_theta)
        if optimizer.iteration_callback:
            optimizer.iteration_callback.finish_launch()

    optimizer.iteration_callback.save_results(output_path)
def basic_experiment(data_list, tau_list, num_topics):
    """Run ``perform_experiment`` for every (dataset, tau) combination.

    data_list -- paths of the saved (pickled) n_dw matrices.
    tau_list -- tau values to try; it should contain None so that the
        results can be compared with the default behaviour.
    num_topics -- NOTE(review): not used by this function; the topic count
        passed on to ``perform_experiment`` is the literal 100. Confirm
        whether that is intended.
    """
    plsa_schedule = [regularizers.Trivial()] * ITERS_COUNT

    for data_addr in data_list:
        # NOTE: pickle.load executes arbitrary code from the file; only
        # point data_list at trusted files.
        with open(data_addr, 'rb') as source:
            raw_counts = pickle.load(source)
        # The original generation step produces a plain list, so it is
        # converted to a sparse matrix here.
        n_dw_matrix = sparse.csr_matrix(np.array(raw_counts))

        for t in tau_list:
            metrics_path = 'exp_{}_{}.pkl'.format(t, data_addr)
            phi_path = 'expphi_{}_{}.pkl'.format(t, data_addr)
            perform_experiment(
                n_dw_matrix,
                None,
                default.Optimizer(plsa_schedule),
                100,
                SAMPLES,
                output_path=metrics_path,
                tau=t,
                path_phi_output=phi_path
            )
def perform_experiment(train_n_dw_matrix, test_n_dw_matrix, T, num_2_token):
    """Print quality metrics of gensim LDA next to the default optimizer.

    For each of four fixed seeds a gensim ``LdaModel`` with ``T`` topics is
    trained on ``train_n_dw_matrix``, and a ``default.Optimizer`` with 100
    ``Additive(0.1, 0.)`` regularizers is run from a seeded random phi and a
    uniform theta. Both (phi, theta) pairs are passed to the same callback
    (iteration 0 = this library, iteration 1 = gensim) and every recorded
    metric is printed for each of them.

    train_n_dw_matrix -- sparse document-by-word matrix; its rows must
        expose ``.indices`` and ``.data``.
    test_n_dw_matrix -- forwarded to ``experiments.default_callback``.
    T -- number of topics.
    num_2_token -- passed to gensim as ``id2word``.
    """
    # Each document must be a real list: on Python 3 zip() returns a one-shot
    # iterator, and the corpus is traversed many times below (training,
    # get_theta, log_perplexity, and again for every seed). With bare zip
    # objects every pass after the first would see empty documents.
    train_corpus = [
        list(zip(row.indices, row.data)) for row in train_n_dw_matrix
    ]
    D, W = train_n_dw_matrix.shape

    for seed in [42, 7, 777, 12]:
        model = LdaModel(
            train_corpus,
            alpha='auto',
            id2word=num_2_token,
            num_topics=T,
            iterations=500,
            random_state=seed
        )
        gensim_phi = exp_common.get_phi(model)
        gensim_theta = exp_common.get_theta(train_corpus, model)
        print('gensim perplexity')
        # NOTE(review): gensim's own log output reports perplexity as
        # 2 ** (-bound); confirm that exp() is the intended base here.
        print(np.exp(-model.log_perplexity(train_corpus)))

        # Same seed as gensim for the random phi; theta starts uniform.
        random_gen = np.random.RandomState(seed)
        phi = common.get_prob_matrix_by_counters(
            random_gen.uniform(size=(T, W)).astype(np.float64))
        theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T)).astype(np.float64))
        phi, theta = default.Optimizer(
            [regularizers.Additive(0.1, 0.)] * 100,
            verbose=False
        ).run(train_n_dw_matrix, phi, theta)

        callback = experiments.default_callback(
            train_n_dw_matrix=train_n_dw_matrix,
            test_n_dw_matrix=test_n_dw_matrix,
            top_pmi_sizes=[5, 10, 20, 30],
            top_avg_jaccard_sizes=[10, 50, 100, 200],
            measure_time=True)
        callback.start_launch()
        # Slot 0 holds this library's result, slot 1 holds gensim's.
        callback(0, phi, theta)
        callback(1, gensim_phi, gensim_theta)

        print('artm')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[0]))
        print('gensim')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[1]))
topic_0_indices.append(index) elif target == 1: topic_1_indices.append(index) thetaless_rels = [] lda_rels = [] for balance in range(10, 201, 10): print(balance) n_dw_matrix = _n_dw_matrix[topic_0_indices + topic_1_indices * balance, :] regularization_list = [regularizers.Additive(-0.1, 0.)] * 100 lda_phi, lda_theta = experiments.default_sample( n_dw_matrix, T=2, seed=42, optimizer=default.Optimizer(regularization_list, verbose=False)) thetaless_phi, thetaless_theta = experiments.default_sample( n_dw_matrix, T=2, seed=42, optimizer=thetaless.Optimizer(regularization_list, verbose=False)) # print(np.argmax(thetaless_theta[:len(topic_0_indices), :2], axis=1).mean()) # print(np.argmax(thetaless_theta[len(topic_0_indices):, :2], axis=1).mean()) # print('!') # for topic_set in metrics.get_top_words(thetaless_phi, 10): # print('\n\t'.join(map(num_2_token.get, topic_set))) # print() # for topic_set in metrics.get_top_words(thetaless_phi, 5): # print('\n\t'.join(map(num_2_token.get, topic_set))) # print() print('lda')
# Train/test split of eight 20 Newsgroups categories (80% for training).
train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
    [
        'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
        'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
        'sci.space'
    ],
    train_proportion=0.8)[:2]

# Optimizer implementations to compare; the label becomes part of the
# output file name.
optimizer_modules = (
    ('default', default),
    ('thetaless', thetaless),
    ('transfer_thetaless', transfer_thetaless),
)

args_list = list()
for T in [10, 25]:
    # The grid used to be [0.1, 0.01, 0.1]: the repeated 0.1 scheduled the
    # same three experiments twice, writing to the same output paths. Each
    # value is now listed once; if a third distinct value was intended
    # (e.g. 0.001), add it here.
    for theta_alpha in [0.1, 0.01]:
        # One schedule per theta_alpha, shared by the three optimizers.
        regularization_list = [
            regularizers.Additive(0, theta_alpha)
        ] * ITERS_COUNT
        for optimizer_name, optimizer_module in optimizer_modules:
            args_list.append((
                train_n_dw_matrix, test_n_dw_matrix,
                optimizer_module.Optimizer(regularization_list),
                T, SAMPLES,
                '20news_experiment/20news_{}t_{}_{}_{}.pkl'.format(
                    T, optimizer_name, 0., theta_alpha)))

# Uncomment to run a single configuration in-process:
#manager.perform_experiment(args_list[0])
#manager.perform_experiment(args_list[1])
from pyartm.optimizations import default
import manager

if __name__ == '__main__':
    # Train/test split of eight 20 Newsgroups categories (80% for training).
    categories = [
        'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
        'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
        'sci.space'
    ]
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        categories, train_proportion=0.8)[:2]

    # Decorrelator coefficients to sweep.
    tau_grid = [1e7, 1e8, 1.5e8, 2e8, 2.5e8, 3e8, 3.5e8, 4e8, 4.5e8, 5e8]

    args_list = list()
    for T in [10, 30]:
        for tau in tau_grid:
            for use_old_phi in [True]:  # [False, True]
                # Decorrelation plus a small negative additive term on both
                # phi and theta, repeated for 500 iterations.
                combined = regularizers.Combination(
                    regularizers.Decorrelator(tau, use_old_phi),
                    regularizers.Additive(-0.01, -0.01),
                )
                output_path = '20news_experiment/20news_{}t_{}_{}.pkl'.format(
                    T, int(tau), use_old_phi)
                args_list.append((
                    train_n_dw_matrix, test_n_dw_matrix,
                    default.Optimizer([combined] * 500),
                    T, 10, output_path))

    # One worker task per configuration.
    Pool(processes=8).map(manager.perform_experiment, args_list)
def get_optimizer(phi_alpha, iters_count):
    """Build a default optimizer with an ``Additive(phi_alpha, 0.)`` schedule.

    The same regularizer instance is repeated ``iters_count`` times, i.e. one
    entry per iteration of the schedule.
    """
    phi_only_regularizer = regularizers.Additive(phi_alpha, 0.)
    schedule = [phi_only_regularizer] * iters_count
    return default.Optimizer(schedule)
train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups( [ 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space' ], train_proportion=0.8)[:2] args_list = list() for T in [10, 25]: plsa_list = [regularizers.Trivial()] * ITERS_COUNT sparse_lda_list = [regularizers.Additive(-1, 0.)] * ITERS_COUNT args_list.append( (train_n_dw_matrix, test_n_dw_matrix, default.Optimizer(sparse_lda_list), T, SAMPLES, INIT_ITERS, '20news_experiment/20news_{}t_post_lda.pkl'.format(T))) args_list.append( (train_n_dw_matrix, test_n_dw_matrix, obd.Optimizer( plsa_list, gamma_tw_min_delta=1, ), T, SAMPLES, INIT_ITERS, '20news_experiment/20news_{}t_post_obd_limited.pkl'.format(T))) args_list.append( (train_n_dw_matrix, test_n_dw_matrix, naive_obd.Optimizer( plsa_list, gamma_tw_min_delta=1, ), T, SAMPLES, INIT_ITERS, '20news_experiment/20news_{}t_post_naive_obd_limited.pkl'.format(
def _accuracy_recorder(classifier, targets, sink):
    """Return an iteration callback that appends test accuracy to ``sink``."""
    def record(it, phi, theta):
        sink.append(accuracy_score(classifier.predict(theta), targets))
    return record


def perform_doc_experiment(args):
    """Measure how well test-document thetas can be classified per iteration.

    The function takes ONE positional argument, an 8-tuple:

        (n_dw_matrix_doc_train, doc_targets_doc_train,
         n_dw_matrix_doc_test, doc_targets_doc_test,
         optimizer, T, samples, output_path)

    For every seed in ``range(samples)``:
      1. a model is fitted on the training documents with ``optimizer``;
      2. SVM hyper-parameters are selected on the training theta and an
         ``SVC`` is fitted on it;
      3. three optimizers (default with free phi, default with constant phi,
         thetaless), each limited to the first 10 regularizers of
         ``optimizer.regularization_list``, are run on the test documents
         from the trained phi and a uniform theta; after every iteration the
         SVC's accuracy on the current theta is recorded.

    All accuracy curves and the cross-validation scores are saved to
    ``output_path`` through ``callbacks.save_results``.

    The parameter used to be declared with Python 2 tuple-parameter syntax
    (``def f((a, b)):``), which is a syntax error on Python 3; unpacking
    inside the body keeps the one-tuple calling convention on both versions.
    """
    (
        n_dw_matrix_doc_train, doc_targets_doc_train,
        n_dw_matrix_doc_test, doc_targets_doc_test,
        optimizer, T, samples, output_path
    ) = args

    D, _ = n_dw_matrix_doc_test.shape
    svm_train_score = metrics.create_svm_score_function(
        doc_targets_doc_train, verbose=False)

    # Each optimizer gets its own copy of the first ten regularizers.
    opt_plsa_not_const_phi = default.Optimizer(
        regularization_list=optimizer.regularization_list[:10],
        const_phi=False)
    opt_plsa_const_phi = default.Optimizer(
        regularization_list=optimizer.regularization_list[:10],
        const_phi=True)
    opt_artm_thetaless = thetaless.Optimizer(
        regularization_list=optimizer.regularization_list[:10])

    res_plsa_not_const_phi = []
    res_plsa_const_phi = []
    res_artm_thetaless = []
    cv_fold_scores = []
    cv_test_scores = []

    for seed in range(samples):
        print(seed)
        phi, theta = experiments.default_sample(
            n_dw_matrix_doc_train, T, seed, optimizer)
        (best_C, best_gamma, cv_fold_score,
         cv_test_score) = svm_train_score(theta)
        cv_fold_scores.append(cv_fold_score)
        cv_test_scores.append(cv_test_score)
        print('Fold score: {}\tTest score: {}'.format(
            cv_fold_score, cv_test_score))

        algo = SVC(C=best_C, gamma=best_gamma).fit(
            theta, doc_targets_doc_train)

        # Uniform starting theta for the test documents.
        init_theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T), dtype=np.float64))

        # Fresh per-seed accuracy curves, one per optimizer.
        plsa_not_const_phi = []
        plsa_const_phi = []
        artm_thetaless = []
        launches = (
            (opt_plsa_not_const_phi, plsa_not_const_phi),
            (opt_plsa_const_phi, plsa_const_phi),
            (opt_artm_thetaless, artm_thetaless),
        )
        for opt, accuracies in launches:
            opt.iteration_callback = _accuracy_recorder(
                algo, doc_targets_doc_test, accuracies)
            opt.run(n_dw_matrix_doc_test, phi, init_theta)

        res_plsa_not_const_phi.append(plsa_not_const_phi)
        res_plsa_const_phi.append(plsa_const_phi)
        res_artm_thetaless.append(artm_thetaless)

    callbacks.save_results(
        {
            'res_plsa_not_const_phi': res_plsa_not_const_phi,
            'res_plsa_const_phi': res_plsa_const_phi,
            'res_artm_thetaless': res_artm_thetaless,
            'cv_fold_scores': cv_fold_scores,
            'cv_test_scores': cv_test_scores
        },
        output_path)
from multiprocessing import Pool
from pyartm_datasets import main_cases
from pyartm import regularizers
from pyartm.optimizations import default
import manager

if __name__ == '__main__':
    # Only the first element returned for the four sci.* categories is used.
    science_categories = [
        'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'
    ]
    train_n_dw_matrix = main_cases.get_20newsgroups(science_categories)[0]

    T = 10
    # phi_alpha sweeps -1, -0.1, -0.01, ... (30 values); theta_alpha takes
    # three values around zero.
    phi_alphas = [-10**(-i) for i in range(30)]
    theta_alphas = [-0.1, 0., 0.1]

    # One argument tuple per (phi_alpha, theta_alpha) pair, each with a
    # 100-iteration Additive schedule.
    # NOTE(review): the literal 100 in the tuple is presumably the number of
    # samples; confirm against manager.perform_alpha_dependency_experiment.
    args_list = [
        (
            train_n_dw_matrix,
            default.Optimizer(
                [regularizers.Additive(phi_alpha, theta_alpha)] * 100),
            T,
            100,
            'alpha_exp/20news_{}t_{}_{}.pkl'.format(
                T, phi_alpha, theta_alpha),
        )
        for phi_alpha in phi_alphas
        for theta_alpha in theta_alphas
    ]

    Pool(processes=5).map(
        manager.perform_alpha_dependency_experiment, args_list)