import os
import time

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold, train_test_split

import regbm

# mae_score, json_load_utf8, split_options and get_fit_steps are helper
# functions defined elsewhere in this repo.


def main():
    rand_state = 12
    os.makedirs('checkpoints', exist_ok=True)  # make sure the checkpoint dir exists
    cpt_file = os.path.join('checkpoints', 'test.txt')
    cpt_file2 = os.path.join('checkpoints', 'test2.txt')
    # make dataset
    x_all, y_all = make_regression(n_samples=1000, n_features=3,
                                   n_informative=3, n_targets=1,
                                   shuffle=True, random_state=rand_state)
    # split
    x_tr, x_test, y_tr, y_test = train_test_split(x_all, y_all,
                                                  test_size=0.2,
                                                  random_state=rand_state)
    # fit
    model = regbm.Boosting(min_bins=256, max_bins=256,
                           no_early_stopping=True, thread_cnt=1)
    model.fit(x_train=x_tr, y_train=y_tr, x_valid=x_test, y_valid=y_test,
              tree_count=2, tree_depth=2, feature_fold_size=1.0,
              learning_rate=0.5, random_state=rand_state)
    preds = model.predict(x_test)
    mae = mae_score(y_test, preds)
    print(f"MAE: {mae}")
    # save, reload and re-save the model, then check the score is reproduced
    model.save_model(cpt_file)
    loaded = regbm.Boosting(filename=cpt_file, thread_cnt=1)
    loaded.save_model(cpt_file2)
    preds = loaded.predict(x_test)
    mae_new = mae_score(y_test, preds)
    print(f"Saved & loaded MAE: {mae_new}")
    print(f"Test passed: {np.isclose(mae, mae_new)}")
    print("Finish")
def refit_jt(params_file, x_train, x_valid, y_train, y_valid, random_seed):
    # read params from file
    params_jt = json_load_utf8(params_file)
    # fit model
    ctor_options, fit_options = split_options(params_jt)
    model = regbm.Boosting(**ctor_options)
    start_time = time.time()
    model.fit(x_train=x_train, y_train=y_train,
              x_valid=x_valid, y_valid=y_valid, **fit_options)
    fit_time = time.time() - start_time
    return model, fit_time
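# `split_options` (used above and below) is defined elsewhere in this repo.
# For readers without the full source, here is a minimal sketch of what it is
# assumed to do: partition one flat parameter dict into regbm.Boosting
# constructor kwargs and fit kwargs. The constructor key set below is inferred
# from the calls in this file and is an assumption, not the library's
# definitive API:
def split_options_sketch(params):
    ctor_keys = {'min_bins', 'max_bins', 'patience',
                 'no_early_stopping', 'thread_cnt'}  # assumed ctor params
    ctor_options = {k: v for k, v in params.items() if k in ctor_keys}
    fit_options = {k: v for k, v in params.items() if k not in ctor_keys}
    return ctor_options, fit_options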
def regbm_tuned_mae(x_tr_val, y_tr_val, x_test, y_test, best_params, preds_dict):
    ctor_options, fit_options = split_options(best_params)
    model = regbm.Boosting(**ctor_options)
    model.fit(x_train=x_tr_val, y_train=y_tr_val,
              x_valid=x_test, y_valid=y_test, **fit_options)
    preds = model.predict(x_test)
    if (preds == np.inf).any():  # the model diverged, report no score
        return None, None
    mae = mae_score(y_test, preds)
    preds_dict["regbm"] = preds
    return mae, np.std(np.abs(preds - y_test))
def fit_wrapper():
    # model_options, out_options, THREAD_COUNT and the data splits are
    # module-level globals (an illustrative setup is sketched below)
    model = regbm.Boosting(model_options['min_bins'],
                           model_options['max_bins'],
                           model_options['patience'],
                           False,  # presumably no_early_stopping (cf. main())
                           THREAD_COUNT)
    start_time = time.time()  # get start time to count the time of execution
    history = model.fit(x_train, y_train, x_valid, y_valid,
                        model_options['tree_count'],
                        model_options['tree_depth'],
                        model_options['feature_fold_size'],
                        model_options['learning_rate'],
                        model_options['reg'],
                        model_options['es_delta'],
                        model_options['batch_part'],
                        model_options['random_batches'],
                        model_options['random_hist_thresholds'],
                        model_options['remove_regularization_later'])
    exec_time = time.time() - start_time
    if out_options['verbose'] >= 1:
        print(f"Fit time = {exec_time} seconds")
    return model, history
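# A hypothetical setup of the globals fit_wrapper() expects, for illustration
# only. The keys mirror the ones read above; every value is a placeholder,
# not a recommended default:
THREAD_COUNT = 1
out_options = {'verbose': 1}
model_options = {
    'min_bins': 16, 'max_bins': 256, 'patience': 4,
    'tree_count': 100, 'tree_depth': 4, 'feature_fold_size': 1.0,
    'learning_rate': 0.1, 'reg': 0.0, 'es_delta': 1e-5,
    'batch_part': 1.0, 'random_batches': False,
    'random_hist_thresholds': False, 'remove_regularization_later': False,
}
# x_train, y_train, x_valid, y_valid are likewise expected as globals,
# e.g. produced by train_test_split as in main().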
def tune_regbm(x_tr_val, y_tr_val, options_grid, random_state=12):
    keys_list = list(options_grid.keys())
    options_count = len(keys_list)
    cur_idx_each_option = [0] * options_count
    cur_prop_to_change = 0
    # the dictionary is the seed for the data frame; init it
    tuning_df = {keys_list[i]: [] for i in range(options_count)}
    tuning_df['MAE'] = []
    tuning_df['time'] = []
    cv_fit_number = get_fit_steps(options_grid)
    print(f"The number of iterations to tune the JIT trees model: {cv_fit_number}")
    split_cnt_limit = 5  # with K = 5 folds the validation part is 0.2 of x_tr_val
    repeats_cnt = cv_fit_number // split_cnt_limit + 1  # repeats * splits >= cv_fit_number
    kf_cv = RepeatedKFold(n_splits=split_cnt_limit, n_repeats=repeats_cnt,
                          random_state=random_state)  # init CV
    # iterate over splits, but stop once the whole grid has been explored
    iters_gone = 0
    for train_idxs, valid_idxs in kf_cv.split(x_tr_val):
        print(f"Current tuning iteration: {iters_gone + 1} / {cv_fit_number}")
        # get current options
        model_options = {
            keys_list[i]: options_grid[keys_list[i]][cur_idx_each_option[i]]
            for i in range(options_count)
        }
        # get train and validation sets
        x_train, x_valid = x_tr_val[train_idxs], x_tr_val[valid_idxs]
        y_train, y_valid = y_tr_val[train_idxs], y_tr_val[valid_idxs]
        # fit
        ctor_options, fit_options = split_options(model_options)
        model = regbm.Boosting(**ctor_options)
        start_time = time.time()
        model.fit(x_train=x_train, y_train=y_train,
                  x_valid=x_valid, y_valid=y_valid, **fit_options)
        exec_time = time.time() - start_time
        # evaluate
        preds = model.predict(x_valid)
        if np.isnan(preds).any() or (preds == np.inf).any():
            mae = np.inf
        else:
            try:
                mae = mae_score(y_valid, preds)
            except Exception:
                mae = np.inf
        # add to the dictionary (to save in the data frame later)
        for key in keys_list:
            tuning_df[key].append(model_options[key])
        tuning_df['MAE'].append(mae)
        tuning_df['time'].append(exec_time)
        # update the option indexes (odometer-style)
        while (cur_prop_to_change < options_count
               and cur_idx_each_option[cur_prop_to_change] + 1
               >= len(options_grid[keys_list[cur_prop_to_change]])):
            # find the next changeable option
            cur_prop_to_change += 1
        if cur_prop_to_change >= options_count:
            break  # we have seen all the options, can finish
        for prev_prop in range(cur_prop_to_change):
            # set all previous options back to 0
            cur_idx_each_option[prev_prop] = 0
        cur_idx_each_option[cur_prop_to_change] += 1  # increment the current option
        cur_prop_to_change = 0  # reduce index to the start (lexicographic order)
        # update the iterations counter
        iters_gone += 1
        if iters_gone >= cv_fit_number:
            break  # can finish tuning
    # build the resulting data frame
    tuning_df = pd.DataFrame(tuning_df)  # convert to DF
    best_idx = tuning_df['MAE'].idxmin()  # get the minimum by MAE score
    # convert the best row to a plain params dictionary (forget the index)
    best_params = tuning_df.iloc[best_idx].to_dict()
    # return the data frame (protocol) and the best parameters dictionary
    # (with MAE and exec time)
    return tuning_df, best_params
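# An illustrative end-to-end call to tune_regbm(). The grid below is
# hypothetical (kept small so the search finishes quickly) and assumes the
# parameter names used elsewhere in this file plus the split_options /
# get_fit_steps helpers behaving as sketched above:
def tune_example():
    x_all, y_all = make_regression(n_samples=500, n_features=3,
                                   random_state=12)
    options_grid = {
        'min_bins': [16], 'max_bins': [256], 'patience': [4],
        'no_early_stopping': [False], 'thread_cnt': [1],
        'tree_count': [50, 100], 'tree_depth': [2, 4],
        'feature_fold_size': [1.0], 'learning_rate': [0.1, 0.5],
    }
    protocol_df, best_params = tune_regbm(x_all, y_all, options_grid)
    print(protocol_df.sort_values('MAE').head())
    print(f"Best parameters (with MAE and fit time): {best_params}")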