import os
import sys
import pickle
import multiprocessing as mp

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import ParameterGrid

# Repo-local helpers assumed to be importable from the package:
# data_dir, data_info, remote_data_dir, get_file, shuffle,
# parse_arguments, extract_param, run_one_candidate.
# `dataset` and `loss_func` are assumed module-level settings of this demo;
# the shared result lists and the `log_result` callback are sketched below.


def run_grid_search_multicore():
    # Download the dataset if it is not already cached locally.
    if not os.path.exists(os.path.join(data_dir(), dataset + '_train.libsvm')):
        dataset_info = data_info(dataset)
        get_file(dataset, origin=dataset_info['origin'],
                 untar=True, md5_hash=dataset_info['md5_hash'])

    params = {'regular_param': [3.8639876352395671e-06],
              'gamma': [0.015625]}
    candidate_params_lst = list(ParameterGrid(params))

    pool = mp.Pool(processes=20)  # cap on the number of worker processes
    result_lst = []
    for candidate_params in candidate_params_lst:
        result = pool.apply_async(run_one_candidate,
                                  args=(candidate_params,),
                                  callback=log_result)
        result_lst.append(result)
    # Sequential fallback, useful for debugging:
    # for candidate_params in candidate_params_lst:
    #     run_one_candidate(candidate_params)
    for result in result_lst:
        result.get()  # re-raise any exception from a worker
    pool.close()
    pool.join()

    if len(candidate_params_lst) > 1:
        print("========== FINAL RESULT ==========")
        idx_best = np.argmin(np.array(mistake_rate_lst))
        print('Data set: {}'.format(dataset))
        print('Oracle: {}'.format('budget'))
        print('Loss func: {}'.format(loss_func))
        print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
        print('Best params: {}'.format(run_param_lst[idx_best]))
        print('Time per candidate param: {}'.format(time_lst[idx_best]))
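# The grid-search drivers in this module report through module-level lists
# that the `log_result` callback fills in the parent process. The original
# definitions are not part of this file; the following is a minimal sketch of
# the assumed wiring, with hypothetical result-dict field names:
mistake_rate_lst, train_acc_lst, test_acc_lst = [], [], []
testid_lst, run_param_lst, time_lst = [], [], []


def log_result(result):
    # Callback invoked in the parent process as each candidate finishes.
    # Every key below is an assumption about what run_one_candidate returns.
    mistake_rate_lst.append(result.get('mistake_rate'))
    train_acc_lst.append(result.get('train_acc'))
    test_acc_lst.append(result.get('test_acc'))
    testid_lst.append(result.get('testid'))
    run_param_lst.append(result.get('params'))
    time_lst.append(result.get('time'))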
def load_mnist(shuffle_data=True, randseed='default'):
    train_path = get_file("mnist_train.libsvm",
                          origin=remote_data_dir() + "/mnist_train.libsvm",
                          cache_subdir="demo")
    test_path = get_file("mnist_test.libsvm",
                         origin=remote_data_dir() + "/mnist_test.libsvm",
                         cache_subdir="demo")

    x_train, y_train = load_svmlight_file(train_path, n_features=784)
    x_test, y_test = load_svmlight_file(test_path, n_features=784)
    # Densify and scale pixel values to [0, 1].
    x_train = x_train.toarray() / 255.0
    x_test = x_test.toarray() / 255.0

    if shuffle_data:
        shuffle(x_train, y_train, randseed=randseed)

    return (x_train, y_train), (x_test, y_test)
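# `shuffle` above is a repo-local helper, not defined in this file. A minimal
# sketch of what it presumably does, assuming an in-place paired permutation
# of features and labels (the name and seeding convention are hypothetical):
def _shuffle_sketch(x, y, randseed='default'):
    rng = np.random.RandomState(None if randseed == 'default' else randseed)
    idx = rng.permutation(len(y))
    x[:], y[:] = x[idx], y[idx]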
def load_cifar10(shuffle_data=True, randseed='default'):
    train_path = get_file("cifar10_5k_train.pkl",
                          origin=remote_data_dir() + "/cifar10_5k_train.pkl",
                          cache_subdir="demo")
    test_path = get_file("cifar10_1k_test.pkl",
                         origin=remote_data_dir() + "/cifar10_1k_test.pkl",
                         cache_subdir="demo")

    with open(train_path, "rb") as f:
        tmp = pickle.load(f)
    x_train, y_train = tmp['data'] / 255.0, tmp['labels']
    with open(test_path, "rb") as f:
        tmp = pickle.load(f)
    x_test, y_test = tmp['data'] / 255.0, tmp['labels']

    if shuffle_data:
        shuffle(x_train, y_train, randseed=randseed)

    return (x_train, y_train), (x_test, y_test)
def load_housing(shuffle_data=True, randseed='default'):
    train_path = get_file("housing_scale_train.libsvm",
                          origin=remote_data_dir() + "/housing_scale_train.libsvm",
                          cache_subdir="demo")
    test_path = get_file("housing_scale_test.libsvm",
                         origin=remote_data_dir() + "/housing_scale_test.libsvm",
                         cache_subdir="demo")

    x_train, y_train = load_svmlight_file(train_path, n_features=13)
    x_test, y_test = load_svmlight_file(test_path, n_features=13)
    x_train = x_train.toarray()
    x_test = x_test.toarray()

    if shuffle_data:
        shuffle(x_train, y_train, randseed=randseed)

    return (x_train, y_train), (x_test, y_test)
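# A small smoke test for the three loaders above (hypothetical, not part of
# the original module). Each loader returns dense (x_train, y_train),
# (x_test, y_test) tuples, with MNIST and CIFAR-10 pixels scaled to [0, 1]:
def _demo_loaders():
    for load_fn in (load_mnist, load_cifar10, load_housing):
        (x_train, y_train), (x_test, y_test) = load_fn(shuffle_data=False)
        print(load_fn.__name__, x_train.shape, x_test.shape)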
# Full-featured grid-search driver; the demo variant above hard-codes its
# settings.
def run_grid_search_multicore(
        create_obj_func, params_gridsearch, attribute_names, dataset=None,
        num_workers=4, file_config=None, num_runs=3, cross=0,
        num_features=None, full_dataset=None, keep_vars=None, ind_test=None,
        max_torrance=1, online=False):
    if keep_vars is None:
        keep_vars = []
    if dataset is None:
        if len(sys.argv) > 2:
            dataset = sys.argv[2]
        else:
            raise Exception('Dataset not specified')

    # Command-line overrides take precedence over the keyword defaults.
    params_gridsearch = parse_arguments(params_gridsearch, True)
    file_config, params_gridsearch = extract_param('file_config', file_config, params_gridsearch)
    num_workers, params_gridsearch = extract_param('num_workers', num_workers, params_gridsearch)
    num_runs, params_gridsearch = extract_param('num_runs', num_runs, params_gridsearch)
    cross, params_gridsearch = extract_param('cross', cross, params_gridsearch)
    num_features, params_gridsearch = extract_param('num_features', num_features, params_gridsearch)
    full_dataset, params_gridsearch = extract_param('full_dataset', full_dataset, params_gridsearch)
    ind_test, params_gridsearch = extract_param('ind_test', ind_test, params_gridsearch)
    max_torrance, params_gridsearch = extract_param('max_torrance', max_torrance, params_gridsearch)

    # An independent test set, if requested, defaults to the (full) dataset.
    if ind_test is not None:
        ind_test = dataset if full_dataset is None else full_dataset
    if full_dataset is None:
        full_dataset = dataset

    # Download the dataset if it is not already cached locally.
    if not os.path.exists(os.path.join(data_dir(), full_dataset + '_train.libsvm')):
        dataset_info = data_info(full_dataset)
        get_file(full_dataset, origin=dataset_info['origin'],
                 untar=True, md5_hash=dataset_info['md5_hash'])

    candidate_params_lst = list(ParameterGrid(params_gridsearch))
    grid_search = len(candidate_params_lst) > 1

    pool = mp.Pool(num_workers)  # cap on the number of worker processes
    result_lst = []
    for candidate_params in candidate_params_lst:
        result = pool.apply_async(
            run_one_candidate,
            args=(create_obj_func, candidate_params, dataset, attribute_names,
                  file_config, num_runs, cross, num_features, keep_vars,
                  ind_test, grid_search, max_torrance, online),
            callback=log_result
        )
        result_lst.append(result)

    for result in result_lst:
        result.get()  # re-raise any exception from a worker
    pool.close()
    pool.join()

    if len(candidate_params_lst) > 1:
        print("========== FINAL RESULT ==========")
        if online:
            idx_best = np.argmin(np.array(mistake_rate_lst))
        else:
            idx_best = np.argmax(np.array(test_acc_lst))
        print('Data set: {}'.format(dataset))
        print('Best testid: {}'.format(testid_lst[idx_best]))
        if online:
            print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
        else:
            print('Best err on training set: {}'.format(1 - train_acc_lst[idx_best]))
            print('Best err on valid set: {}'.format(1 - test_acc_lst[idx_best]))
        print('Best params: {}'.format(run_param_lst[idx_best]))

        if cross > 0:
            print('Re-running the best candidate')
            num_runs_for_best = max(num_runs, 3)
            best_result = run_one_candidate(
                create_obj_func, run_param_lst[idx_best], full_dataset,
                attribute_names, file_config, num_runs_for_best, cross=0,
                num_features=num_features, keep_vars=keep_vars, online=online)
            # best_result['gridsearch_time'] = np.sum(np.array(time_lst))
            log_result(best_result)
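# A hypothetical end-to-end invocation of run_grid_search_multicore. The model
# factory and attribute_names below are illustrative assumptions, not the
# package's documented API; the factory lives at module level so the pool can
# pickle it:
def _create_svm(params):
    # Maps one grid point to an estimator (hypothetical mapping).
    from sklearn.svm import SVC
    return SVC(C=1.0 / params['regular_param'], gamma=params['gamma'])


def _demo_grid_search():
    run_grid_search_multicore(
        create_obj_func=_create_svm,
        params_gridsearch={'regular_param': [1e-6, 1e-4],
                           'gamma': [0.015625, 0.0625]},
        attribute_names=['mistake_rate', 'train_time'],
        dataset='mnist',
        num_workers=4,
    )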