Example #1
import os

import numpy as np
from sklearn.datasets import make_regression
# mae_score is assumed here to be plain mean absolute error
from sklearn.metrics import mean_absolute_error as mae_score
from sklearn.model_selection import train_test_split

import regbm


def main():
    rand_state = 12
    cpt_file = os.path.join('checkpoints', 'test.txt')
    cpt_file2 = os.path.join('checkpoints', 'test2.txt')
    os.makedirs('checkpoints', exist_ok=True)  # make sure the checkpoint directory exists
    # make dataset
    x_all, y_all = make_regression(n_samples=1000,
                                   n_features=3,
                                   n_informative=3,
                                   n_targets=1,
                                   shuffle=True,
                                   random_state=rand_state)
    # split
    x_tr, x_test, y_tr, y_test = train_test_split(x_all,
                                                  y_all,
                                                  test_size=0.2,
                                                  random_state=rand_state)
    # fit
    model = regbm.Boosting(min_bins=256,
                           max_bins=256,
                           no_early_stopping=True,
                           thread_cnt=1)
    model.fit(x_train=x_tr,
              y_train=y_tr,
              x_valid=x_test,
              y_valid=y_test,
              tree_count=2,
              tree_depth=2,
              feature_fold_size=1.0,
              learning_rate=0.5,
              random_state=rand_state)
    preds = model.predict(x_test)
    mae = mae_score(y_test, preds)
    print(f"MAE: {mae}")
    model.save_model(cpt_file)
    loaded = regbm.Boosting(filename=cpt_file, thread_cnt=1)
    loaded.save_model(cpt_file2)
    preds = loaded.predict(x_test)
    mae_new = mae_score(y_test, preds)
    print(f"Saved & loaded MAE: {mae}")
    print(f"Test passed: {np.isclose(mae, mae_new)}")
    print("Finish")
Example #2
def refit_jt(params_file, x_train, x_valid, y_train, y_valid, random_seed):
    # read params from file
    params_jt = json_load_utf8(params_file)
    # fit model
    ctor_options, fit_options = split_options(params_jt)
    model = regbm.Boosting(**ctor_options)
    start_time = time.time()
    model.fit(x_train=x_train,
              y_train=y_train,
              x_valid=x_valid,
              y_valid=y_valid,
              **fit_options)
    fit_time = time.time() - start_time
    return model, fit_time
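
The helpers json_load_utf8 and split_options are not defined in these examples. Below is a minimal sketch; the constructor key set is inferred from Examples #1 and #4, so both the key set and the helper bodies are assumptions rather than part of regbm:

import json

# assumed constructor-only keys, inferred from Examples #1 and #4
CTOR_KEYS = {'min_bins', 'max_bins', 'patience', 'no_early_stopping',
             'thread_cnt'}

def json_load_utf8(path):
    # load a UTF-8 encoded JSON parameters file into a dict
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def split_options(params):
    # split a flat parameter dict into Boosting(...) kwargs and fit(...) kwargs
    ctor_options = {k: v for k, v in params.items() if k in CTOR_KEYS}
    fit_options = {k: v for k, v in params.items() if k not in CTOR_KEYS}
    return ctor_options, fit_options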
Example #3
def regbm_tuned_mae(x_tr_val, y_tr_val, x_test, y_test, best_params,
                    preds_dict):
    ctor_options, fit_options = split_options(best_params)
    model = regbm.Boosting(**ctor_options)
    history = model.fit(x_train=x_tr_val,
                        y_train=y_tr_val,
                        x_valid=x_test,
                        y_valid=y_test,
                        **fit_options)
    preds = model.predict(x_test)
    if not np.isfinite(preds).all():  # discard runs with NaN or infinite predictions
        return None, None
    mae = mae_score(y_test, preds)
    preds_dict["regbm"] = preds
    return mae, np.std(np.abs(preds - y_test))
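
A possible call site for regbm_tuned_mae, assuming best_params comes from tune_regbm in Example #5 (all names here are illustrative):

preds_dict = {}
mae, mae_std = regbm_tuned_mae(x_tr_val, y_tr_val, x_test, y_test,
                               best_params, preds_dict)
if mae is not None:
    print(f"regbm MAE: {mae:.4f} (std of absolute errors: {mae_std:.4f})")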
Example #4
def fit_wrapper():
    # positional constructor arguments: min_bins, max_bins, patience,
    # then (presumably) no_early_stopping=False and the thread count
    model = regbm.Boosting(model_options['min_bins'],
                           model_options['max_bins'],
                           model_options['patience'], False,
                           THREAD_COUNT)
    start_time = time.time()  # record the start time to measure fit duration
    history = model.fit(
        x_train, y_train, x_valid, y_valid,
        model_options['tree_count'], model_options['tree_depth'],
        model_options['feature_fold_size'],
        model_options['learning_rate'], model_options['reg'],
        model_options['es_delta'], model_options['batch_part'],
        model_options['random_batches'],
        model_options['random_hist_thresholds'],
        model_options['remove_regularization_later'])
    exec_time = time.time() - start_time
    if out_options['verbose'] >= 1:
        print(f"Fit time = {exec_time} seconds")
    return model, history
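
fit_wrapper closes over model_options, out_options, THREAD_COUNT and the data splits. A plausible setup with illustrative, untuned values (every value below is an assumption):

THREAD_COUNT = 4
model_options = {
    'min_bins': 16, 'max_bins': 256, 'patience': 5,
    'tree_count': 100, 'tree_depth': 6, 'feature_fold_size': 1.0,
    'learning_rate': 0.1, 'reg': 0.0, 'es_delta': 1e-5,
    'batch_part': 1.0, 'random_batches': False,
    'random_hist_thresholds': False, 'remove_regularization_later': False,
}
out_options = {'verbose': 1}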
Example #5
def tune_regbm(x_tr_val, y_tr_val, options_grid, random_state=12):
    keys_list = list(options_grid.keys())
    options_count = len(keys_list)
    cur_idx_each_option = [0] * options_count
    cur_prop_to_change = 0
    # the dictionary is the seed for the data frame
    # init the dictionary
    tuning_df = {keys_list[i]: [] for i in range(options_count)}
    tuning_df['MAE'] = []
    tuning_df['time'] = []
    cv_fit_number = get_fit_steps(options_grid)
    print(f"The number of iterations to tune JIT trees model: {cv_fit_number}")
    split_cnt_limit = 5  # K < 5 (in K Fold) => #valid < 0.2 #tr_val
    repeats_cnt = cv_fit_number // split_cnt_limit + 1  # repeats * splits >= cv_fit_number
    kf_cv = RepeatedKFold(n_splits=split_cnt_limit,
                          n_repeats=repeats_cnt,
                          random_state=random_state)  # init CV
    # iterate over splits, but stop once the whole grid has been covered
    iters_gone = 0
    for train_idxs, valid_idxs in kf_cv.split(x_tr_val):
        print(f"Current tuning iteration: {iters_gone + 1} / {cv_fit_number}")
        # get current options
        model_options = {
            keys_list[i]: options_grid[keys_list[i]][cur_idx_each_option[i]]
            for i in range(options_count)
        }

        # get train and test sets
        x_train, x_valid = x_tr_val[train_idxs], x_tr_val[valid_idxs]
        y_train, y_valid = y_tr_val[train_idxs], y_tr_val[valid_idxs]

        # fit
        ctor_options, fit_options = split_options(model_options)
        model = regbm.Boosting(**ctor_options)
        start_time = time.time()
        history = model.fit(x_train=x_train,
                            y_train=y_train,
                            x_valid=x_valid,
                            y_valid=y_valid,
                            **fit_options)
        exec_time = time.time() - start_time

        # evaluate
        preds = model.predict(x_valid)
        if np.isnan(preds).any() or np.isinf(preds).any():
            mae = np.inf
        else:
            try:
                mae = mae_score(y_valid, preds)
            except Exception:
                mae = np.inf

        # add to the dictionary (to save in the data frame later)
        for key in keys_list:
            tuning_df[key].append(model_options[key])
        tuning_df['MAE'].append(mae)
        tuning_df['time'].append(exec_time)

        # update the option indexes (odometer-style, lexicographic order)
        while (cur_prop_to_change < options_count
               and cur_idx_each_option[cur_prop_to_change] + 1
               >= len(options_grid[keys_list[cur_prop_to_change]])):
            # find the next changeable option
            cur_prop_to_change += 1
        if cur_prop_to_change >= options_count:
            # all option combinations have been seen, we can finish
            break
        for prev_prop in range(cur_prop_to_change):
            # reset all lower-order options to their first value
            cur_idx_each_option[prev_prop] = 0
        cur_idx_each_option[cur_prop_to_change] += 1  # increment the current option
        cur_prop_to_change = 0  # restart from the first option
        # update iterations counter
        iters_gone += 1
        if iters_gone >= cv_fit_number:
            break  # can finish tuning

    # return the resulting data frame
    tuning_df = pd.DataFrame(tuning_df)  # convert to DF
    best_idx = tuning_df['MAE'].idxmin()  # row with the minimum MAE score
    # extract that row as a flat parameter dict (it also carries the MAE and fit time)
    best_params = tuning_df.loc[best_idx].to_dict()
    # return the full protocol data frame and the best parameters dictionary
    return tuning_df, best_params
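
get_fit_steps is not defined above; since the loop walks the grid exhaustively in lexicographic order, it is presumably the size of the Cartesian product of the option lists. A hypothetical reconstruction:

def get_fit_steps(options_grid):
    # total number of combinations = product of the option-list lengths
    steps = 1
    for values in options_grid.values():
        steps *= len(values)
    return steps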