def run_gridsearch_single_core():
    """Grid-search ``regular_param`` and ``gamma`` sequentially on svmguide1.

    The train and test files of the ``svmguide1`` data set are read from
    ``data_dir()``, converted to dense arrays and stacked into a single
    sample set.  One ``OnlineDualSVRG`` model is fitted per parameter
    combination; the ``mistake_rate`` of each fit is printed, followed by
    the lowest rate and the parameters that produced it.

    Raises:
        Exception: if the train or the test file does not exist.
    """
    print("========== Tune parameters for IGKOL for classification ==========")

    name = 'svmguide1'
    num_features = 4
    train_path = os.path.join(data_dir(), name + '_train.libsvm')
    test_path = os.path.join(data_dir(), name + '_test.libsvm')
    print(train_path)
    print(test_path)

    # The train file is checked first, then the test file.
    for path in (train_path, test_path):
        if not os.path.exists(path):
            raise Exception('File not found')

    train_x, train_y = load_svmlight_file(train_path, n_features=num_features)
    test_x, test_y = load_svmlight_file(test_path, n_features=num_features)

    # load_svmlight_file returns sparse matrices; densify before stacking.
    x_total = np.vstack((train_x.toarray(), test_x.toarray()))
    y_total = np.concatenate((train_y, test_y))
    print("Number of total samples = {}".format(x_total.shape[0]))

    grid = {'regular_param': [0.0001, 0.00001],
            'gamma': [0.25, 0.5, 1, 2]}

    # Settings shared by every candidate; ``regular_param`` and ``gamma``
    # are supplied by the grid.
    fixed_settings = {
        'learning_rate_scale': 0.8,
        'rf_dim': 400,
        'num_epochs': 1,
        'freq_update_full_model': 100,
        'oracle': 'coverage',
        'core_max': 100,
        'coverage_radius': 0.9,
        'loss_func': 'logistic',
        'smooth_hinge_theta': 0.5,
        'smooth_hinge_tau': 0.5,
        'random_state': 3333,
    }

    # One (mistake_rate, params) pair per fitted candidate, in grid order.
    outcomes = []
    for combo in list(ParameterGrid(grid)):
        model = OnlineDualSVRG(**fixed_settings, **combo)
        model.fit(x_total, y_total)
        print('Params:', combo, 'Mistake rate:', model.mistake_rate)
        outcomes.append((model.mistake_rate, combo))

    rates = [rate for rate, _ in outcomes]
    best = np.argmin(np.array(rates))
    print('Best mistake rate: {}'.format(rates[best]))
    print('Best params: {}'.format(outcomes[best][1]))


def run_one_candidate(candidate_params):
    """Fit ``OnlineDualSVRG`` several times with one parameter combination.

    The train and test files of the module-level ``dataset`` are loaded
    once, converted to dense arrays and stacked; the model is then fitted
    ``num_runs`` times on that data and the results are averaged.

    Args:
        candidate_params: dict of keyword arguments forwarded to
            ``OnlineDualSVRG`` in addition to the fixed settings below.

    Returns:
        A tuple ``(mean mistake rate, mean training time, candidate_params)``.

    Raises:
        FileNotFoundError: if the train or the test file does not exist.
            (Subclass of ``Exception``, so existing handlers keep working.)
    """
    np.random.seed(random_seed())

    num_runs = 3
    n_features = 300

    train_file_name = os.path.join(data_dir(), dataset + '_train.libsvm')
    test_file_name = os.path.join(data_dir(), dataset + '_test.libsvm')
    for file_name in (train_file_name, test_file_name):
        if not os.path.exists(file_name):
            # Name the missing file: train and test would otherwise be
            # indistinguishable in the error output.
            raise FileNotFoundError('File not found: {}'.format(file_name))

    # The data does not depend on the run index, so read and densify it
    # once instead of once per run.  The same arrays are reused for every
    # fit, as run_gridsearch_single_core already does across candidates.
    x_train, y_train = load_svmlight_file(train_file_name,
                                          n_features=n_features)
    x_test, y_test = load_svmlight_file(test_file_name,
                                        n_features=n_features)
    x_total = np.vstack((x_train.toarray(), x_test.toarray()))
    y_total = np.concatenate((y_train, y_test))
    num_samples = x_total.shape[0]

    mistake_rate_sum = 0
    train_time_sum = 0
    for ri in range(num_runs):
        print('----------------------------------')
        print('Run #{0}:'.format(ri + 1))
        print('Num total samples: {}'.format(num_samples))

        # ``regular_param`` and ``gamma`` are expected to arrive through
        # ``candidate_params``.
        # NOTE(review): random_state is fixed, so the runs may be identical
        # unless the model draws on another source of randomness - confirm.
        clf = OnlineDualSVRG(
            learning_rate_scale=0.8,
            rf_dim=4000,
            num_epochs=1,
            freq_update_full_model=100,
            oracle=oracle,
            core_max=100,
            coverage_radius=40.0,
            loss_func=loss_func,
            smooth_hinge_theta=0.5,
            smooth_hinge_tau=0.5,
            random_state=3333,
            **candidate_params,
        )

        print('Running ...')
        clf.fit(x_total, y_total)
        print('Mistake rate: {0:.2f}%, Training time: {1} seconds'.format(
            clf.mistake_rate * 100, int(clf.train_time)))

        mistake_rate_sum += clf.mistake_rate
        train_time_sum += clf.train_time

    return (mistake_rate_sum / num_runs,
            train_time_sum / num_runs,
            candidate_params)


def run_grid_search_multicore():
    """Evaluate every parameter combination in a pool of worker processes.

    If the train file of the module-level ``dataset`` is missing,
    ``get_file`` is called with the origin and MD5 hash from ``data_info``
    (presumably downloading and unpacking the data set - confirm against
    ``get_file``).  Each combination is then submitted to
    ``run_one_candidate``; successful results are passed to ``log_result``
    (presumably filling ``mistake_rate_lst``, ``run_param_lst`` and
    ``time_lst`` - confirm against ``log_result``).  When more than one
    combination was evaluated, the best one is printed.
    """
    if not os.path.exists(os.path.join(data_dir(), dataset + '_train.libsvm')):
        dataset_info = data_info(dataset)
        get_file(dataset, origin=dataset_info['origin'], untar=True,
                 md5_hash=dataset_info['md5_hash'])

    params = {'regular_param': [3.8639876352395671e-06],
              'gamma': [0.015625]}
    candidate_params_lst = list(ParameterGrid(params))

    def report_failure(error):
        # Without an error callback apply_async drops worker exceptions
        # silently; print them so that a failed candidate is visible.
        print('Candidate run failed: {!r}'.format(error))

    # The context manager terminates the workers even if submission raises.
    with mp.Pool(processes=20) as pool:  # maximum number of workers
        for candidate_params in candidate_params_lst:
            pool.apply_async(run_one_candidate, args=(candidate_params,),
                             callback=log_result,
                             error_callback=report_failure)
        pool.close()
        pool.join()

    if len(candidate_params_lst) > 1:
        print("========== FINAL RESULT ==========")
        if len(mistake_rate_lst) == 0:
            # Every candidate failed; argmin on an empty array would raise.
            print('No candidate finished successfully.')
            return
        idx_best = np.argmin(np.array(mistake_rate_lst))
        print('Data set: {}'.format(dataset))
        # Report the oracle actually passed to the model by
        # run_one_candidate instead of a hard-coded name.
        print('Oracle: {}'.format(oracle))
        print('Loss func: {}'.format(loss_func))
        print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
        print('Best params: {}'.format(run_param_lst[idx_best]))
        print('Time per candidate param: {}'.format(time_lst[idx_best]))