def grid_search(parameters, X_train_res, y_train_res, X_test, y_test, useTrainCV=False):
    xgbmodel = XGBRegressor()
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    grid_search_xg = GridSearchCV(xgbmodel, parameters, scoring='roc_auc', n_jobs=1,
                                  cv=kfold, verbose=1)
    result_gcv_xgb = grid_search_xg.fit(X_train_res, y_train_res)
    best_params = result_gcv_xgb.best_params_
    print("Best params: %s" % (best_params))

    # rebuild using best params
    xg_reg = XGBRegressor(objective=best_params['objective'],
                          learning_rate=best_params['learning_rate'],
                          max_depth=best_params['max_depth'],
                          n_estimators=best_params['n_estimators'],
                          min_child_weight=best_params['min_child_weight'],
                          gamma=best_params['gamma'],
                          colsample_bytree=best_params['colsample_bytree'],
                          subsample=best_params['subsample'],
                          reg_alpha=best_params['reg_alpha'])

    if useTrainCV:
        xgb_param = xg_reg.get_xgb_params()
        xgtrain = DMatrix(X_train_res, label=y_train_res)
        cvresult = cv(xgb_param, xgtrain,
                      num_boost_round=xg_reg.get_params()['n_estimators'],
                      folds=kfold, metrics='auc', early_stopping_rounds=20)
        xg_reg.set_params(n_estimators=cvresult.shape[0])
        print("Best number of estimators: %i" % (cvresult.shape[0]))

    eval_set = [(X_test, y_test)]
    xg_reg.fit(X_train_res, y_train_res, eval_metric="error", eval_set=eval_set, verbose=False)

    y_pred_train = xg_reg.predict(X_train_res)
    #print("Accuracy train: %f" % (accuracy_score(y_train_res, y_pred_train)))
    #print("Recall train: %f" % (recall_score(y_train_res, y_pred_train)))
    #print("Precision train: %f" % (precision_score(y_train_res, y_pred_train)))
    print("AUC train: %f" % (roc_auc_score(y_train_res, y_pred_train)))

    y_pred = xg_reg.predict(X_test)
    #print("Accuracy test: %f" % (accuracy_score(y_test, y_pred)))
    #print("Recall test: %f" % (recall_score(y_test, y_pred)))
    #print("Precision test: %f" % (precision_score(y_test, y_pred)))
    print("AUC test: %f" % (roc_auc_score(y_test, y_pred)))
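# Hypothetical call sketch for grid_search() above (not part of the original source).
# The function re-reads every key below from best_params_, so the search grid has to
# supply all of them; the candidate values here are purely illustrative.
example_parameters = {
    'objective': ['reg:logistic'],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'n_estimators': [200, 400],
    'min_child_weight': [1, 5],
    'gamma': [0, 0.1],
    'colsample_bytree': [0.8],
    'subsample': [0.8],
    'reg_alpha': [0, 0.1],
}
# grid_search(example_parameters, X_train_res, y_train_res, X_test, y_test, useTrainCV=True)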
class XGBoostRegressor(Model):
    def create_model(self):
        self.xgb_regressor = XGBRegressor()

    def fit(self, train_x, train_y):
        self.xgb_regressor.fit(train_x, train_y)

    def set_config(self, config):
        self.xgb_regressor.set_params(**config)

    def predict(self, test_x):
        return self.xgb_regressor.predict(test_x)
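# Minimal usage sketch for the wrapper above (illustrative, not from the original
# source): the Model base class is assumed to be defined elsewhere, and the call
# order create_model() -> set_config() -> fit() -> predict() is inferred from it.
# wrapper = XGBoostRegressor()
# wrapper.create_model()
# wrapper.set_config({'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05})
# wrapper.fit(train_x, train_y)
# preds = wrapper.predict(test_x)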
def fit_gbm(data, fixed_gbm_params, variable_gbm_params):
    gbm_parameters = deepcopy(variable_gbm_params)
    gbm_parameters.update(fixed_gbm_params)
    gbm = XGBRegressor(objective="reg:gamma")
    gbm.set_params(**gbm_parameters)
    gbm.fit(
        data["X_train_encoded"],
        data["y_train"],
        early_stopping_rounds=30,
        eval_metric="mae",
        eval_set=[(data["X_test_encoded"], data["y_test"]),
                  (data["X_holdout_encoded"], data["y_holdout"])],
    )
    return gbm
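# Sketch of the dict layout fit_gbm() expects (illustrative, not from the original
# source); the key names come from the function body above, while the splits and the
# two parameter dicts are assumed.
# data = {
#     "X_train_encoded": X_train_encoded, "y_train": y_train,
#     "X_test_encoded": X_test_encoded, "y_test": y_test,
#     "X_holdout_encoded": X_holdout_encoded, "y_holdout": y_holdout,
# }
# gbm = fit_gbm(data,
#               fixed_gbm_params={"n_estimators": 500, "learning_rate": 0.05},
#               variable_gbm_params={"max_depth": 4, "subsample": 0.8})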
def create_model(model_type='xgb', model_params=None):
    if model_params is None:
        model_params = get_default_params(model_type=model_type)
    if model_type == 'lgb':
        model = lgb.LGBMRegressor()
        model.set_params(**model_params)
    if model_type == 'xgb':
        model = XGBRegressor()
        model.set_params(**model_params)
    return model
class XGBaseline(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.xgb_mean = XGBRegressor(**kwargs)

    def fit(self, X, y):
        self.xgb_mean.fit(X, y)
        errors = y - self.xgb_mean.predict(X)
        self.std = np.std(errors)
        return self

    def predict(self, X, y=None):
        pred_mean = self.xgb_mean.predict(X)
        pred_std = self.std * np.ones(len(pred_mean))
        return pred_mean, pred_std

    def get_params(self, deep=True):
        return self.xgb_mean.get_params()

    def set_params(self, **params):
        self.xgb_mean.set_params(**params)
        return self
class RegressionLearner:
    def __init__(self, **kwargs):
        self.estimator = XGBRegressor(**kwargs)
        self.fit_info = None

    # noinspection PyPep8Naming
    # pylint: disable-msg=too-many-arguments
    # pylint: disable-msg=too-many-locals
    # pylint: disable-msg=invalid-name
    def fit(self, X, y):
        # If there is no evaluation data, split some off.
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        if X.shape[0] < 10000:
            best_param = search_parameters(self.estimator, x_train, y_train)
            self.estimator.set_params(**best_param)
        self.estimator.fit(x_train, y_train,
                           eval_set=[(x_test, y_test)],
                           early_stopping_rounds=10,
                           verbose=False)
        y_train_pred = self.predict(x_train)
        train_r2 = sklearn.metrics.r2_score(y_train, y_train_pred)
        y_test_pred = self.predict(x_test)
        test_r2 = sklearn.metrics.r2_score(y_test, y_test_pred)
        self.fit_info = 'Train/Test R2: {:.2f}/{:.2f}'.format(train_r2, test_r2)
        return self

    def predict(self, x):
        return self.estimator.predict(x)
class XGBLogLikelihood(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.xgb_mean = XGBRegressor(**kwargs)
        kwargs["objective"] = ll_objective
        self.xgb_log_var = XGBRegressor(**kwargs)

    def fit(self, X, y):
        self.xgb_mean.fit(X, y)
        errors = y - self.xgb_mean.predict(X)
        self.xgb_log_var.fit(X, errors)
        return self

    def predict(self, X, y=None):
        pred_mean = self.xgb_mean.predict(X)
        pred_std = np.exp(self.xgb_log_var.predict(X) / 2)
        return pred_mean, pred_std

    def get_params(self, deep=True):
        return self.xgb_mean.get_params()

    def set_params(self, **params):
        self.xgb_mean.set_params(**params)
        self.xgb_log_var.set_params(**params)
        return self
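# Illustrative usage of XGBaseline / XGBLogLikelihood above (not from the original
# source). Both return a (mean, std) pair per sample: XGBaseline uses one constant
# std derived from the training residuals, while XGBLogLikelihood predicts a
# per-sample log-variance via the custom ll_objective, which is defined elsewhere
# in the original code and not shown here.
# X, y = make_regression(n_samples=500, n_features=10, noise=5.0, random_state=0)
# model = XGBaseline(n_estimators=100, max_depth=3).fit(X, y)
# mean, std = model.predict(X)   # std is the same value for every row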
def fun_xgb_fs(x, *args):
    X, y, flag, n_splits, random_seed = args
    clf = XGBRegressor(random_state=int(random_seed))
    n_samples, n_var = X.shape
    cr = {
        0: 'reg:squarederror',
        1: 'reg:logistic',
        2: 'binary:logistic',
    }
    #x=[0.1, 200, 5, 0.3, 2, 0.8, ]
    p = {
        'learning_rate': x[0],
        'n_estimators': int(round(x[1])),
        'max_depth': int(round(x[2])),
        'colsample_bytree': x[3],
        'min_child_weight': int(round(x[4])),
        'subsample': int(x[5] * 1000) / 1000,
        #'alpha': x[6],
        'objective': cr[0],
        #'presort': ps[0],
        #'max_iter': 1000,
    }
    clf.set_params(**p)

    #x[2::] = [1 if k > 0.5 else 0 for k in x[4::]]
    if len(x) <= 6:
        ft = np.array([1 for i in range(n_var)])
    else:
        ft = np.array([1 if k > 0.5 else 0 for k in x[2::]])
    ft = np.where(ft > 0.5)

    try:
        # print('Starting KFold', flag)
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=int(random_seed))
        # print('Finished KFold', flag)
        y_p = cross_val_predict(clf, X, np.ravel(y), cv=cv, n_jobs=1)
        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        r4 = -r2_score(y_p, y)
        #r = mean_squared_error(y, y_p) ** 0.5
        #r = -accuracy_score(y, y_p)
        #r = -f1_score(y, y_p, average='weighted')
        #r = -precision_score(y, y_p)
        #print(r, p)
    except Exception:
        y_p = [None]
        r = 1e12
        # keep the other error metrics defined if cross-validation fails
        r2 = r3 = r4 = 1e12

    #print(r, '\t', p)
    if flag == 'eval':
        return r
    else:
        clf.fit(X[:, ft].squeeze(), y)
        return {
            'Y_TRUE': y,
            'Y_PRED': y_p,
            'EST_PARAMS': p,
            'PARAMS': x,
            'EST_NAME': 'XGB',
            'ESTIMATOR': clf,
            'ACTIVE_VAR': ft,
            'DATA': X,
            'SEED': random_seed,
            'ERROR_TRAIN': {
                'RMSE': r,
                'MAPE': r2,
                'RRMSE': r3,
                'R2_SCORE': r4
            }
        }
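# Illustrative direct call to fun_xgb_fs() above (not from the original source): with a
# six-element x the full feature set is kept, and flag='eval' returns only the CV RMSE.
# x0 = [0.1, 200, 5, 0.8, 2, 0.8]  # lr, n_estimators, max_depth, colsample_bytree, min_child_weight, subsample
# rmse = fun_xgb_fs(x0, X, y, 'eval', 5, 42)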
thresholds = np.sort(model_.best_estimator_.feature_importances_)
print(thresholds)
print('=======================')
break

score_list = []
for i, thresh in enumerate(thresholds):
    selection = SelectFromModel(best_model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    print(select_x_train.shape)
    selection_model = XGBRegressor(n_jobs=8)
    selection_model.set_params(**model_.best_params_)
    selection_model.fit(select_x_train, y_train)
    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    score_list.append(score)
    print('Thresh=%.3f, n=%d, R2:%.2f%%' % (thresh, select_x_train.shape[1], score * 100))
    if i == 0:
        reduce_x_train = select_x_train
        reduce_x_test = select_x_test
"id": "colsample_bytree", "conditions": np.linspace(0.3, 1, 5) }, } boston = load_boston() X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target) scores = [] reg = XGBRegressor(silent=1, nthread=-1) thompson_parameters = ThompsonParameters(xbg_tuning_parameters, 80, 20) while thompson_parameters.hasNext(): params_obj = thompson_parameters.getParameters() cur_parameters = dict(params_obj["parameters"]) cur_parameters["max_depth"] = int(round(cur_parameters["max_depth"][0])) cur_parameters["subsample"] = cur_parameters["subsample"][0] print(cur_parameters["max_depth"], cur_parameters["subsample"]) reg.set_params(**cur_parameters) reg.fit(X_train, y_train) score = reg.score(X_test, y_test) thompson_parameters.setScore(params_obj, score) scores.append(score) print( zip(thompson_parameters.bayes_opt["max_depth"].X, thompson_parameters.bayes_opt["max_depth"].Y)) plt.plot(scores) plt.show()
def main():
    list_file_path = sorted(
        glob.glob(os.path.join(DATA_DIR, 'train_join_all_5_cl_qua/*gz')))
    df = pandas.read_csv(list_file_path[0], compression='gzip')
    df = df.fillna(0)

    data = df[LIST_FEATURE_COLUMN_NAME].values
    target = df[TARGET_COLUMN_NAME].values

    model = XGBRegressor(seed=0)
    """
    params = {'max_depth': [3, 5, 10],
              'learning_rate': [0.01, 0.1, 1],
              'min_child_weight': [0.01, 0.1, 1],
              'subsample': [0.1, 0.5, 1],
              'colsample_bytree': [0.3, 0.5, 1],
              }
    cv = GridSearchCV(model, params, scoring=bimbo_scoring, n_jobs=3, refit=False, verbose=10)
    cv.fit(data, target)
    """
    params = {
        'subsample': 1,
        'learning_rate': 0.1,
        'colsample_bytree': 0.5,
        'max_depth': 13,
        'min_child_weight': 0.01
    }
    logger.info('best_params: %s' % params)

    list_estimator = []
    flg = 0
    for i in range(1, len(list_file_path)):
        logger.info('%s: %s' % (i, list_file_path[i]))
        test_df = pandas.read_csv(list_file_path[i], compression='gzip')
        test_df = test_df.fillna(0)
        test_data = test_df[LIST_FEATURE_COLUMN_NAME].values
        test_target = test_df[TARGET_COLUMN_NAME].values

        if flg < 4:
            data = numpy.r_[data, test_data]
            target = numpy.r_[target, test_target]
            flg += 1
            continue
        else:
            flg = 0

        model = XGBRegressor(seed=0)
        model.set_params(**params)
        model.fit(data, target)
        list_estimator.append(model)

        if 1:
            predict = numpy.mean([est.predict(data) for est in list_estimator], axis=0)
            predict = numpy.where(predict < 0, 0, predict)
            score = bimbo_score_func(predict, target)
            logger.info('INSAMPLE score: %s' % score)

            predict = numpy.mean(
                [est.predict(test_data) for est in list_estimator], axis=0)
            predict = numpy.where(predict < 0, 0, predict)
            score = bimbo_score_func(predict, test_target)
            logger.info('score: %s' % score)

        # model.set_params(n_estimators=n_estimators)
        df = test_df
        data = test_data
        target = test_target

    with open('list_xgb_model_5_cl_qua.pkl', 'wb') as f:
        pickle.dump(list_estimator, f, -1)
#%%
# After searching for the best set of hyper-parameters, we now use all the data to
# train the model with cross-validation. We want to test which of classification,
# regression, or stacking works best.
cv = StratifiedKFold(5, random_state=model_random_state)
updated_dict = gridsearch.best_params_
updated_dict['learning_rate'] = .1
updated_dict['n_estimators'] = 800
updated_dict['min_child_weight'] = 50

#%%
print('===============XGboost regression with rounding===============')
xgbr = XGBRegressor(random_state=model_random_state, n_jobs=-1, early_stopping_rounds=80)
xgbr.set_params(**updated_dict)
xgbr_scores = evaluation.cv_scores(xgbr, X_train, y_train, cv=cv,
                                   scoring=quadratic_weighted_kappa_round,
                                   return_estimator=True)
# train mean of score: 0.6473931873941305
# train std of score: 0.001041262887225388
# test mean of score: 0.6063831053298916
# test std of score: 0.003201456042199307

#%%
print('===============XGboost regression with decision tree===============')
# Stacking overfits very easily, so we reduce the model complexity here.
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='reg:gamma',
                  nthread=4,
                  scale_pos_weight=1,
                  seed=27)

xgb_param = gb.get_xgb_params()
xgtrain = xgb.DMatrix(df[features].values, label=df['SPEED_AVG'].values)
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=gb.get_params()['n_estimators'],
                  nfold=10, metrics='mae', early_stopping_rounds=50)
gb.set_params(n_estimators=cvresult.shape[0])
gb.fit(x_train, y_train, eval_metric='mae')


def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


predictions = gb.predict(x_train)
print("MAE Score (Train): %f" % mean_absolute_error(y_train, predictions))
print("MAE Score (Test): %f" % cvresult['test-mae-mean'].tail(1))