import multiprocessing as mp
import os

import numpy as np
from sklearn.model_selection import ParameterGrid

# `dataset`, `loss_func`, and the result lists `mistake_rate_lst`,
# `run_param_lst`, and `time_lst`, as well as the helpers `data_dir`,
# `data_info`, `get_file`, `run_one_candidate`, and `log_result`, are
# defined at module level in the surrounding code.
def run_grid_search_multicore():
    # download and unpack the dataset on first use
    if not os.path.exists(os.path.join(data_dir(), dataset + '_train.libsvm')):
        dataset_info = data_info(dataset)
        get_file(dataset, origin=dataset_info['origin'], untar=True, md5_hash=dataset_info['md5_hash'])

    # a one-point grid here; list more values per key to actually search
    params = {'regular_param': [3.8639876352395671e-06],
              'gamma': [0.015625]}
    candidate_params_lst = list(ParameterGrid(params))

    pool = mp.Pool(processes=20)  # cap the number of worker processes
    for candidate_params in candidate_params_lst:
        # results are delivered through the `log_result` callback; note that
        # without calling .get() on the AsyncResult, worker exceptions are
        # silently dropped
        pool.apply_async(run_one_candidate, args=(candidate_params,), callback=log_result)
    # serial fallback, useful for debugging:
    # for candidate_params in candidate_params_lst:
    #     run_one_candidate(candidate_params)

    pool.close()
    pool.join()

    if len(candidate_params_lst) > 1:
        print("========== FINAL RESULT ==========")
        idx_best = np.argmin(np.array(mistake_rate_lst))
        print('Data set: {}'.format(dataset))
        print('Oracle: {}'.format('budget'))
        print('Loss func: {}'.format(loss_func))
        print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
        print('Best params: {}'.format(run_param_lst[idx_best]))
        print('Time per candidate param: {}'.format(time_lst[idx_best]))
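For reference, a minimal self-contained sketch of the apply_async/callback pattern used above; evaluate_candidate and collect are illustrative stand-ins for run_one_candidate and log_result, and the parameter values are made up:

import multiprocessing as mp

from sklearn.model_selection import ParameterGrid

results = []

def evaluate_candidate(params):
    # stand-in for run_one_candidate: score one hyperparameter setting
    return params, params['regular_param'] * params['gamma']

def collect(result):
    # stand-in for log_result; callbacks run serially in the parent process
    results.append(result)

if __name__ == '__main__':
    grid = list(ParameterGrid({'regular_param': [1e-6, 1e-3], 'gamma': [0.1, 1.0]}))
    pool = mp.Pool(processes=2)
    for candidate in grid:
        pool.apply_async(evaluate_candidate, args=(candidate,), callback=collect)
    pool.close()
    pool.join()
    best_params, best_score = min(results, key=lambda r: r[1])
    print('best params:', best_params)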
Example 2
from sklearn.datasets import load_svmlight_file

# `get_file`, `remote_data_dir`, and `shuffle` are helpers from the
# surrounding module; `get_file` downloads to a local cache and returns
# the cached path.
def load_mnist(shuffle_data=True, randseed='default'):
    train_path = get_file("mnist_train.libsvm", origin=remote_data_dir() + "/mnist_train.libsvm",
                          cache_subdir="demo")
    test_path = get_file("mnist_test.libsvm", origin=remote_data_dir() + "/mnist_test.libsvm",
                         cache_subdir="demo")

    x_train, y_train = load_svmlight_file(train_path, n_features=784)
    x_test, y_test = load_svmlight_file(test_path, n_features=784)
    x_train = x_train.toarray() / 255.0  # densify and scale pixels into [0, 1]
    x_test = x_test.toarray() / 255.0

    if shuffle_data:
        shuffle(x_train, y_train, randseed=randseed)  # shuffles both arrays in place

    return (x_train, y_train), (x_test, y_test)
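Typical usage of the loader above (a sketch; the exact shapes depend on the hosted files, and an integer randseed is assumed to be accepted by the module's shuffle helper):

(x_train, y_train), (x_test, y_test) = load_mnist(shuffle_data=True, randseed=42)
print(x_train.shape, x_test.shape)    # e.g. (60000, 784) and (10000, 784) for the full MNIST split
print(x_train.min(), x_train.max())   # pixel values are scaled into [0, 1]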
Example 3
import pickle

def load_cifar10(shuffle_data=True, randseed='default'):
    train_path = get_file("cifar10_5k_train.pkl",
                          origin=remote_data_dir() + "/cifar10_5k_train.pkl",
                          cache_subdir="demo")
    test_path = get_file("cifar10_1k_test.pkl",
                         origin=remote_data_dir() + "/cifar10_1k_test.pkl",
                         cache_subdir="demo")

    with open(train_path, "rb") as f:
        tmp = pickle.load(f)
    x_train, y_train = tmp['data'] / 255.0, tmp['labels']
    with open(test_path, "rb") as f:
        tmp = pickle.load(f)
    x_test, y_test = tmp['data'] / 255.0, tmp['labels']

    if shuffle_data:
        shuffle(x_train, y_train, randseed=randseed)

    return (x_train, y_train), (x_test, y_test)
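The pickle files above are read as a dict with 'data' and 'labels' keys; a minimal sketch of writing and reading that layout (toy file name and sizes, not the hosted data):

import pickle

import numpy as np

with open('toy_train.pkl', 'wb') as f:
    pickle.dump({'data': np.random.randint(0, 256, size=(8, 3072)),
                 'labels': np.arange(8) % 10}, f)

with open('toy_train.pkl', 'rb') as f:
    tmp = pickle.load(f)
x, y = tmp['data'] / 255.0, tmp['labels']
print(x.shape, y.shape)  # (8, 3072) (8,)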
Example 4
def load_housing(shuffle_data=True, randseed='default'):
    train_path = get_file("housing_scale_train.libsvm",
                          origin=remote_data_dir() + "/housing_scale_train.libsvm",
                          cache_subdir="demo")
    test_path = get_file("housing_scale_test.libsvm",
                         origin=remote_data_dir() + "/housing_scale_test.libsvm",
                         cache_subdir="demo")

    x_train, y_train = load_svmlight_file(train_path, n_features=13)
    x_test, y_test = load_svmlight_file(test_path, n_features=13)
    x_train = x_train.toarray()  # the '_scale' files are presumably already feature-scaled
    x_test = x_test.toarray()

    if shuffle_data:
        shuffle(x_train, y_train, randseed=randseed)

    return (x_train, y_train), (x_test, y_test)
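The shuffle helper these loaders call is not shown on this page. Since its return value is discarded, it presumably permutes both arrays in place; a plausible sketch, assuming randseed is either 'default' or an integer seed (an assumption, not the library's actual code):

import numpy as np

def shuffle(x, y, randseed='default'):
    # permute samples and labels together, in place
    rng = np.random.RandomState(None if randseed == 'default' else randseed)
    perm = rng.permutation(len(y))
    x[:] = x[perm]
    y[:] = y[perm]

x = np.arange(12.0).reshape(6, 2)
y = np.arange(6.0)
shuffle(x, y, randseed=0)
print((x[:, 0] == 2 * y).all())  # True: rows stay paired with their labels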
Example 5
import multiprocessing as mp
import os
import sys

import numpy as np
from sklearn.model_selection import ParameterGrid

# The result lists (`testid_lst`, `mistake_rate_lst`, `train_acc_lst`,
# `test_acc_lst`, `run_param_lst`) and the `online` flag are module-level
# state populated by `log_result` in the surrounding code.
def run_grid_search_multicore(
        create_obj_func, params_gridsearch, attribute_names, dataset=None, num_workers=4, file_config=None,
        num_runs=3, cross=0, num_features=None, full_dataset=None, keep_vars=None, ind_test=None, max_torrance=1):
    if keep_vars is None:
        keep_vars = []  # avoid a shared mutable default argument
    if dataset is None:
        if len(sys.argv) > 2:
            dataset = sys.argv[2]
        else:
            raise ValueError('Dataset not specified')

    params_gridsearch = parse_arguments(params_gridsearch, True)
    # print(params_gridsearch)
    # pull run-configuration overrides out of the grid before it is handed
    # to ParameterGrid
    file_config, params_gridsearch = extract_param('file_config', file_config, params_gridsearch)
    num_workers, params_gridsearch = extract_param('num_workers', num_workers, params_gridsearch)
    num_runs, params_gridsearch = extract_param('num_runs', num_runs, params_gridsearch)
    cross, params_gridsearch = extract_param('cross', cross, params_gridsearch)
    num_features, params_gridsearch = extract_param('num_features', num_features, params_gridsearch)
    full_dataset, params_gridsearch = extract_param('full_dataset', full_dataset, params_gridsearch)
    ind_test, params_gridsearch = extract_param('ind_test', ind_test, params_gridsearch)
    max_torrance, params_gridsearch = extract_param('max_torrance', max_torrance, params_gridsearch)
    if ind_test is not None:
        if full_dataset is None:
            ind_test = dataset
        else:
            ind_test = full_dataset

    if full_dataset is None:
        full_dataset = dataset

    if not os.path.exists(os.path.join(data_dir(), full_dataset + '_train.libsvm')):
        dataset_info = data_info(full_dataset)
        get_file(full_dataset, origin=dataset_info['origin'], untar=True, md5_hash=dataset_info['md5_hash'])

    candidate_params_lst = list(ParameterGrid(params_gridsearch))
    grid_search = len(candidate_params_lst) > 1

    pool = mp.Pool(num_workers)  # cap the number of worker processes
    result_lst = []
    for candidate_params in candidate_params_lst:
        result = pool.apply_async(
            run_one_candidate,
            args=(
                create_obj_func, candidate_params, dataset, attribute_names, file_config, num_runs, cross,
                num_features, keep_vars, ind_test, grid_search, max_torrance, online),
            callback=log_result
        )
        result_lst.append(result)

    for result in result_lst:
        result.get()  # propagate any exception raised in a worker
    pool.close()
    pool.join()

    if len(candidate_params_lst) > 1:
        print("========== FINAL RESULT ==========")
        if online:
            idx_best = np.argmin(np.array(mistake_rate_lst))
        else:
            idx_best = np.argmax(np.array(test_acc_lst))
        print('Data set: {}'.format(dataset))
        print('Best testid: {}'.format(testid_lst[idx_best]))
        if online:
            print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
        else:
            print('Best err on training set: {}'.format(1 - train_acc_lst[idx_best]))
            print('Best err on valid set: {}'.format(1 - test_acc_lst[idx_best]))
        print('Best params: {}'.format(run_param_lst[idx_best]))

        if cross > 0:
            print('Run the best one')
            num_runs_for_best = max(num_runs, 3)  # rerun the winner with at least 3 runs
            best_result = run_one_candidate(
                create_obj_func, run_param_lst[idx_best], full_dataset, attribute_names, file_config, num_runs_for_best,
                cross=0, num_features=num_features, keep_vars=keep_vars, online=online)
            # best_result['gridsearch_time'] = np.sum(np.array(time_lst))
            log_result(best_result)
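extract_param is not shown here; from its call sites it takes a name, a default, and the parsed grid dict, and returns the (possibly overridden) value together with the remaining grid. A hypothetical sketch:

def extract_param(name, default, params_gridsearch):
    # pop `name` out of the grid dict so it is not swept by ParameterGrid;
    # the single-element-list unwrap assumes parse_arguments wraps values
    # in lists (an assumption, not the library's actual code)
    value = default
    if name in params_gridsearch:
        value = params_gridsearch.pop(name)
        if isinstance(value, list) and len(value) == 1:
            value = value[0]
    return value, params_gridsearch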