# Example #1: single-core grid search
def run_gridsearch_single_core():
    """Grid-search IGKOL hyper-parameters sequentially on a single core.

    Loads the svmguide1 train/test splits, concatenates them into one
    stream (online setting), fits an OnlineDualSVRG per candidate in the
    parameter grid, and prints the candidate with the lowest mistake rate.
    Raises Exception if either libsvm file is missing.
    """
    print("========== Tune parameters for IGKOL for classification ==========")

    data_name = 'svmguide1'
    n_features = 4

    train_file_name = os.path.join(data_dir(), data_name + '_train.libsvm')
    test_file_name = os.path.join(data_dir(), data_name + '_test.libsvm')

    print(train_file_name)
    print(test_file_name)

    # Fail early, and say *which* file is missing.
    for file_name in (train_file_name, test_file_name):
        if not os.path.exists(file_name):
            raise Exception('File not found: {}'.format(file_name))

    x_train, y_train = load_svmlight_file(train_file_name,
                                          n_features=n_features)
    x_test, y_test = load_svmlight_file(test_file_name, n_features=n_features)

    # load_svmlight_file returns sparse matrices; densify for the learner.
    x_train = x_train.toarray()
    x_test = x_test.toarray()

    # Online setting: tune on the full stream (train + test concatenated).
    x_total = np.vstack((x_train, x_test))
    y_total = np.concatenate((y_train, y_test))

    print("Number of total samples = {}".format(x_total.shape[0]))

    params = {'regular_param': [0.0001, 0.00001], 'gamma': [0.25, 0.5, 1, 2]}

    candidate_params_lst = list(ParameterGrid(params))
    mistake_rate_lst = []
    run_param_lst = []
    for candidate_params in candidate_params_lst:
        clf = OnlineDualSVRG(
            learning_rate_scale=0.8,
            rf_dim=400,
            num_epochs=1,
            freq_update_full_model=100,
            oracle='coverage',
            core_max=100,
            coverage_radius=0.9,
            loss_func='logistic',
            smooth_hinge_theta=0.5,
            smooth_hinge_tau=0.5,
            random_state=3333,
            **candidate_params,  # grid supplies regular_param and gamma
        )
        clf.fit(x_total, y_total)
        print('Params:', candidate_params, 'Mistake rate:', clf.mistake_rate)

        mistake_rate_lst.append(clf.mistake_rate)
        run_param_lst.append(candidate_params)

    # Lower online mistake rate is better.
    idx_best = np.argmin(np.array(mistake_rate_lst))
    print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
    print('Best params: {}'.format(run_param_lst[idx_best]))
# Example #2: multi-core grid search
def run_one_candidate(candidate_params):
    """Evaluate one hyper-parameter candidate, averaged over several runs.

    Intended as a multiprocessing worker: seeds numpy, loads the dataset
    named by the module-level `dataset`, then fits an OnlineDualSVRG
    `num_runs` times on the concatenated train+test stream.

    :param candidate_params: dict of OnlineDualSVRG keyword overrides
        (e.g. regular_param, gamma) produced by ParameterGrid.
    :return: (mean mistake rate, mean training time in seconds,
        candidate_params) so the pool callback can log which candidate
        the averages belong to.
    :raises Exception: if either libsvm file is missing.
    """
    np.random.seed(random_seed())
    num_runs = 3

    # The data is identical across runs, so load it once, not per run.
    data_name = dataset
    n_features = 300

    train_file_name = os.path.join(data_dir(), data_name + '_train.libsvm')
    test_file_name = os.path.join(data_dir(), data_name + '_test.libsvm')

    # Fail early, and say *which* file is missing.
    for file_name in (train_file_name, test_file_name):
        if not os.path.exists(file_name):
            raise Exception('File not found: {}'.format(file_name))

    x_train, y_train = load_svmlight_file(train_file_name,
                                          n_features=n_features)
    x_test, y_test = load_svmlight_file(test_file_name,
                                        n_features=n_features)

    # load_svmlight_file returns sparse matrices; densify for the learner.
    x_train = x_train.toarray()
    x_test = x_test.toarray()

    # Online setting: evaluate on the full stream (train + test).
    x_total = np.vstack((x_train, x_test))
    y_total = np.concatenate((y_train, y_test))

    print('Num total samples: {}'.format(x_total.shape[0]))

    mistake_rate_avg = 0
    train_time_avg = 0
    for ri in range(num_runs):
        print('----------------------------------')
        print('Run #{0}:'.format(ri + 1))

        clf = OnlineDualSVRG(
            learning_rate_scale=0.8,
            rf_dim=4000,
            num_epochs=1,
            freq_update_full_model=100,
            oracle=oracle,
            core_max=100,
            coverage_radius=40.0,
            loss_func=loss_func,
            smooth_hinge_theta=0.5,
            smooth_hinge_tau=0.5,
            random_state=3333,
            **candidate_params,  # grid supplies regular_param and gamma
        )
        print('Running ...')
        clf.fit(x_total, y_total)
        print('Mistake rate: {0:.2f}%, Training time: {1} seconds'.format(
            clf.mistake_rate * 100, int(clf.train_time)))
        mistake_rate_avg += clf.mistake_rate
        train_time_avg += clf.train_time
    return mistake_rate_avg / num_runs, train_time_avg / num_runs, candidate_params
def run_grid_search_multicore():
    """Run the hyper-parameter grid search with a multiprocessing pool.

    Downloads the dataset on first use, fans out one `run_one_candidate`
    task per grid point, and — when more than one candidate was tried —
    prints the best result. Results are collected by the `log_result`
    callback into the module-level mistake_rate_lst / run_param_lst /
    time_lst (defined elsewhere in this file).
    """
    # Fetch and unpack the dataset if it is not cached locally yet.
    if not os.path.exists(os.path.join(data_dir(), dataset + '_train.libsvm')):
        dataset_info = data_info(dataset)
        get_file(dataset, origin=dataset_info['origin'], untar=True,
                 md5_hash=dataset_info['md5_hash'])

    params = {'regular_param': [3.8639876352395671e-06],
              'gamma': [0.015625]}
    candidate_params_lst = list(ParameterGrid(params))

    pool = mp.Pool(processes=20)  # maximum number of worker processes
    for candidate_params in candidate_params_lst:
        # log_result appends each worker's (rate, time, params) result.
        pool.apply_async(run_one_candidate, args=(candidate_params,),
                         callback=log_result)

    pool.close()
    pool.join()

    if len(candidate_params_lst) > 1:
        print("========== FINAL RESULT ==========")
        idx_best = np.argmin(np.array(mistake_rate_lst))
        print('Data set: {}'.format(dataset))
        # Report the oracle the workers actually used; it was previously
        # hard-coded to 'budget' and could misreport the configuration.
        print('Oracle: {}'.format(oracle))
        print('Loss func: {}'.format(loss_func))
        print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
        print('Best params: {}'.format(run_param_lst[idx_best]))
        print('Time per candidate param: {}'.format(time_lst[idx_best]))