test_size=0.2, random_state=99) #2. 모델 구성 model = XGBRegressor(n_estimators=1000, learning_rate=0.1) # n_estimators는 딥러닝의 epochs와 같음 #3. 훈련 model.fit(x_train, y_train, verbose=False, eval_metric="rmse", eval_set=[(x_train, y_train), (x_test, y_test)], early_stopping_rounds=20) # 딥러닝의 metrics가 있었음. 머신러닝의 지표는 rmse, mae, logloss, error(<=>acc), auc(정확도 acc의 친구) # error가 0.8이면 acc가 0.2 #4. 평가 result = model.evals_result() print("evals_result : \n", result) # evals_result : # {'validation_0': {'rmse': [22.09964, 20.094713, 18.289314]}, 'validation_1': {'rmse': [21.539825, 19.548641, 17.804596]}} # validation_0 == (x_train,y_train)의 결과 # validation_1 == (x_test, y_test)의 결과 #5. 예측 y_pred = model.predict(x_test) r2 = r2_score(y_pred, y_test) print("R2 : ", r2) # R2 : 0.823625251495531
# Alternative loader: x, y = load_boston(return_X_y=True)
datasets = load_boston()
x, y = datasets.data, datasets['target']

# 1. Data: hold out 20% of the rows, shuffled with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=66
)

# 2. Model
model = XGBRegressor(n_estimators=100, learning_rate=0.01, n_jobs=8)

# 3. Training: RMSE is recorded for every entry of eval_set after each
# boosting round (first entry -> 'validation_0', second -> 'validation_1').
model.fit(
    x_train,
    y_train,
    verbose=1,
    eval_metric='rmse',
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

# 4. Evaluation: the estimator's own score next to an explicit r2_score.
aaa = model.score(x_test, y_test)
print('aaa :', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 :', r2)

print('====================================')
# Per-round metric history collected during fit().
results = model.evals_result()
print(results)
# Fit a tiny model on the currently selected feature subset and report its R2.
# `thresh`, `selection_x_train` and `selection_x_test` come from code outside
# this fragment.
print(selection_x_train.shape)

selection_model = XGBRegressor(n_estimators=3, n_jobs=-1)
selection_model.fit(
    selection_x_train,
    y_train,
    verbose=False,
    eval_metric=["rmse", "mae"],
    eval_set=[(selection_x_train, y_train), (selection_x_test, y_test)],
    early_stopping_rounds=3,
)

y_pred = selection_model.predict(selection_x_test)

# Per-round rmse/mae for both entries of eval_set.
results = selection_model.evals_result()
print("evals_result : \n", results)

score = r2_score(y_test, y_pred)
n_selected = selection_x_train.shape[1]
print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, n_selected, score * 100.0))

# Sample output:
# (404, 3)
# evals_result :
# {'validation_0': {'rmse': [17.212723, 12.439525, 9.133449], 'mae': [15.650868, 11.090322, 7.872841]},
#  'validation_1': {'rmse': [16.532173, 11.86516, 8.631524], 'mae': [15.215144, 10.711357, 7.452145]}}
# Track RMSE on the training and validation sets; stop once the metric has
# not improved for 10 consecutive rounds.
eval_set = [(X_train, y_train), (X_val, y_val)]
xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=10,
    eval_metric=["rmse"],
    eval_set=eval_set,
    verbose=True,
)

# Make predictions for the test data.
# NOTE(review): the predictions are rounded to integers before scoring, which
# looks inherited from a classification example -- confirm it is intended.
y_pred = xgb.predict(X_test)
predictions = list(map(round, y_pred))

# Evaluate predictions.
r2_score = r2(y_test, predictions)
print("r2_score: %.2f%%" % (r2_score * 100.0))

# Retrieve the per-round metric history.
results = xgb.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(epochs)

# Plot the RMSE learning curves (validation_0 = X_train, validation_1 = X_val).
fig, ax = pyplot.subplots()
for history_key, curve_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[history_key]['rmse'], label=curve_label)
ax.legend()
pyplot.ylabel('Root Mean Squared Error')
pyplot.title('XGBoost Root Mean Squared Error')
pyplot.show()

# Feature importances rounded to three decimals, one row per training column.
rounded_importance = np.round(xgb.feature_importances_, 3)
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rounded_importance,
})
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=66
)

# 2. Model
model = XGBRegressor(n_estimators=100, learning_rate=0.01, n_jobs=-1)

# 3. Training
# NOTE(review): 'logloss' is tracked alongside rmse/mae although this is a
# regression fit -- confirm the metric is meaningful for this target.
tracked_metrics = ['rmse', 'logloss', 'mae']
model.fit(
    x_train,
    y_train,
    verbose=1,
    eval_metric=tracked_metrics,
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

# 4. Evaluation
aaa = model.score(x_test, y_test)
print("aaa : ", aaa)

y_pred = model.predict(x_test)
# When computing R2 the true values go first, then the predictions.
r2 = r2_score(y_test, y_pred)
print("r2 : ", r2)
# Sample output:
# aaa :  0.9329663244922279
# r2 :  0.9329663244922279

print("=======================")
# History of the tracked metrics per boosting round (the same values that
# verbose=1 prints to the terminal while they decrease).
results = model.evals_result()
print(results)
def xgboost_regress(X_train, y_train, X_test, y_test, early_stopping_rounds=None, plot=True):
    """Fit a fixed-configuration XGBoost regressor and report train/test quality.

    RMSE is tracked on both ``(X_train, y_train)`` and ``(X_test, y_test)``
    after every boosting round. Plot labels and printed units are written
    for an age target expressed in years.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : evaluation features and targets (also used as the
        second entry of ``eval_set``).
    early_stopping_rounds : forwarded unchanged to ``XGBRegressor.fit``.
    plot : when true, show true-vs-predicted scatter plots for both splits
        and the RMSE learning curves.

    Returns
    -------
    tuple
        ``(model, rms_train, rms_test, r2_train, r2_test,
        model.feature_importances_)``. The MAE values are printed but not
        returned.
    """
    regressor = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=200,
        min_child_weight=1,
        max_depth=3,
        subsample=0.7,
        colsample_bytree=0.5,
        learning_rate=0.1,
    )
    watchlist = [(X_train, y_train), (X_test, y_test)]
    regressor.fit(
        X_train,
        y_train,
        eval_metric="rmse",
        early_stopping_rounds=early_stopping_rounds,
        eval_set=watchlist,
        verbose=False,
    )

    # Predictions and summary metrics for both splits.
    preds_train = regressor.predict(X_train)
    preds_test = regressor.predict(X_test)

    rms_train = mean_squared_error(y_train, preds_train) ** 0.5
    rms_test = mean_squared_error(y_test, preds_test) ** 0.5
    r2_train = r2_score(y_train, preds_train)
    r2_test = r2_score(y_test, preds_test)
    mae_train = mean_absolute_error(y_train, preds_train)
    mae_test = mean_absolute_error(y_test, preds_test)

    history = regressor.evals_result()
    epochs = len(history['validation_0']['rmse'])

    if plot:
        fig, ax = plt.subplots(1, 3, figsize=(16, 3.5))

        # Panels 0 and 1: true vs. predicted, with a red identity line
        # drawn over the fixed 20..99 range.
        scatter_panels = (
            (ax[0], y_train, preds_train, 'XGboost on training data'),
            (ax[1], y_test, preds_test, 'XGboost on testing data'),
        )
        for panel, truth, predicted, title in scatter_panels:
            panel.scatter(truth, predicted, alpha=0.5)
            panel.plot(range(20, 100), range(20, 100), c='red')
            panel.set_xlabel('True Age')
            panel.set_ylabel('Predicted Age')
            panel.grid(True, lw=1.5, ls='--', alpha=0.75)
            panel.set_title(title)

        # Panel 2: RMSE per boosting round for both splits.
        x_axis = range(epochs)
        ax[2].plot(x_axis, history['validation_0']['rmse'], label='Train')
        ax[2].plot(x_axis, history['validation_1']['rmse'], label='Test')
        ax[2].legend()
        ax[2].set_ylabel('rms')
        ax[2].set_xlabel('epoch')
        ax[2].set_title('XGBoost rms')
        plt.show()

    # NOTE(review): the metrics are printed regardless of `plot`; the source
    # formatting did not preserve the original indentation -- confirm.
    print(f'The number of training epochs was {epochs}')
    print(f'The rms on the training data is {rms_train:.3f} years')
    print(f'The rms on the testing data is {rms_test:.3f} years')
    print(f'The r^2 on the training data is {r2_train:.3f}')
    print(f'The r^2 on the testing data is {r2_test:.3f}')
    print(f'The MAE on the training data is {mae_train:.3f} years')
    print(f'The MAE on the testing data is {mae_test:.3f} years')

    return regressor, rms_train, rms_test, r2_train, r2_test, regressor.feature_importances_
def runXGBRegressorTuning(X_train, X_test, y_train, y_test, scoring='neg_mean_squared_error', cv=5,
                          initial_max_depth=[3, 5, 7, 9], initial_min_child_weight=[1, 3, 5],
                          objective='reg:linear', learning_rate=0.1, n_estimators=140, max_depth=5,
                          min_child_weight=1, reg_alpha=0, reg_lambda=0, gamma=0, subsample=0.8,
                          colsample_bytree=0.8):
    """Tune an ``XGBRegressor`` stage by stage with grid searches.

    Stages, each starting from the best parameters found so far:

    1. coarse search over ``max_depth`` / ``min_child_weight``;
    2. a second search over either a shifted range (when a best value sat on
       the edge of its initial range) or the immediate neighbours of the best
       values;
    3. ``gamma``;
    4. ``subsample`` / ``colsample_bytree``;
    5. ``reg_alpha`` (coarse, then around the coarse optimum);
    6. number of boosting rounds via ``xgb.cv`` with early stopping.

    The printed "Best score" is ``sqrt(-best_score_)``, i.e. an RMSE only
    when ``scoring`` is a negated mean squared error.

    Parameters
    ----------
    X_train, y_train : data used for every grid search and the final fit.
    X_test, y_test : only used as the second ``eval_set`` entry of the final fit.
    scoring, cv : forwarded to ``GridSearchCV``; ``cv`` is also passed to
        ``xgb.cv`` as ``nfold``.
    initial_max_depth, initial_min_child_weight : candidate values for the
        coarse search (not modified).
    objective ... colsample_bytree : starting values of the model parameters.

    Returns
    -------
    tuple
        ``(fitted_model, fitted_model.get_params(),
        fitted_model.evals_result(), warnings)`` where ``warnings`` maps a
        parameter name to a message when its optimum sat on the edge of the
        searched range.
    """
    xgb_param_dict = dict(learning_rate=learning_rate,
                          n_estimators=n_estimators,
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          reg_alpha=reg_alpha,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          objective=objective,
                          reg_lambda=reg_lambda,
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)
    # Named range_warnings locally so the stdlib `warnings` module is not shadowed.
    range_warnings = {}

    # Stage 1: max depth and min child weight have the strongest bearing on
    # the model, so they are tuned first.
    coarse_grid = {
        'max_depth': initial_max_depth,
        'min_child_weight': initial_min_child_weight,
    }
    gsearch = _grid_search(XGBRegressor(**xgb_param_dict), coarse_grid,
                           X_train, y_train, scoring, cv)
    # Grid keys are real XGBoost parameter names, so the winners can be
    # merged directly (the original wrote a non-existent 'min_child_depth').
    xgb_param_dict.update(gsearch.best_params_)

    # Stage 2: shift a range whose optimum sat on its edge, otherwise probe
    # the immediate neighbours of the optimum.
    new_max_depth = _widen_search_range(
        gsearch.best_params_['max_depth'], initial_max_depth, 'max_depth', floor=1)
    new_min_child_weight = _widen_search_range(
        gsearch.best_params_['min_child_weight'], initial_min_child_weight,
        'min_child_weight', floor=0)

    range_changed = (new_max_depth is not initial_max_depth
                     or new_min_child_weight is not initial_min_child_weight)
    if range_changed:
        refine_grid = {
            'max_depth': new_max_depth,
            'min_child_weight': new_min_child_weight,
        }
        refine_header = None
    else:
        depth = xgb_param_dict['max_depth']
        child_weight = xgb_param_dict['min_child_weight']
        refine_grid = {
            'max_depth': [depth - 1, depth, depth + 1],
            'min_child_weight': [child_weight - 1, child_weight, child_weight + 1],
        }
        refine_header = 'Fine-tuned max_depth and min_child_weight parameters...\n'
    gsearch = _grid_search(XGBRegressor(**xgb_param_dict), refine_grid,
                           X_train, y_train, scoring, cv, refine_header)
    xgb_param_dict.update(gsearch.best_params_)

    # Stage 3: gamma. Only the upper edge is reported.
    gamma_grid = {'gamma': [i / 10.0 for i in range(0, 5)]}
    gsearch = _grid_search(XGBRegressor(**xgb_param_dict), gamma_grid,
                           X_train, y_train, scoring, cv,
                           'Fine-tuned gamma parameters...\n')
    xgb_param_dict.update(gsearch.best_params_)
    message = _boundary_warning('gamma', xgb_param_dict['gamma'],
                                gamma_grid['gamma'], check_min=False)
    if message is not None:
        range_warnings['gamma'] = message

    # Stage 4: row and column subsampling.
    sampling_grid = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)],
    }
    gsearch = _grid_search(XGBRegressor(**xgb_param_dict), sampling_grid,
                           X_train, y_train, scoring, cv,
                           'Tuned subsample and colsample_bytree parameters...\n')
    xgb_param_dict.update(gsearch.best_params_)
    for name in ('subsample', 'colsample_bytree'):
        message = _boundary_warning(name, xgb_param_dict[name], sampling_grid[name])
        if message is not None:
            range_warnings[name] = message

    # Stage 5a: coarse L1 regularisation.
    alpha_grid = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
    gsearch = _grid_search(XGBRegressor(**xgb_param_dict), alpha_grid,
                           X_train, y_train, scoring, cv,
                           'Tuned regularisation parameters...\n')
    xgb_param_dict.update(gsearch.best_params_)

    # Stage 5b: search around the coarse optimum (the original re-ran the
    # coarse grid here and never used this one).
    best_alpha = float(xgb_param_dict['reg_alpha'])
    alpha_fine_grid = {
        'reg_alpha': [best_alpha / 10, best_alpha / 2, best_alpha,
                      best_alpha * 5, best_alpha * 2]
    }
    gsearch = _grid_search(XGBRegressor(**xgb_param_dict), alpha_fine_grid,
                           X_train, y_train, scoring, cv,
                           'Tuned regularisation parameters...\n')
    xgb_param_dict.update(gsearch.best_params_)

    # Stage 6: pick the number of boosting rounds with cross-validated early
    # stopping. xgb.cv needs booster-level parameters and a labelled DMatrix,
    # not the sklearn parameter dict and a raw feature matrix.
    booster_params = XGBRegressor(**xgb_param_dict).get_xgb_params()
    dtrain = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(booster_params,
                      dtrain,
                      num_boost_round=xgb_param_dict['n_estimators'],
                      nfold=cv,
                      metrics='rmse',
                      early_stopping_rounds=50)
    # One row per retained boosting round.
    xgb_param_dict['n_estimators'] = cvresult.shape[0]

    # Learn the final model with the tuned parameters.
    xgb_model = XGBRegressor(**xgb_param_dict)
    xgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='rmse',
                  verbose=True)

    return xgb_model, xgb_model.get_params(), xgb_model.evals_result(), range_warnings


def _grid_search(model, param_grid, X, y, scoring, cv, header=None):
    """Fit a GridSearchCV over ``param_grid``, print its winner and return it."""
    # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
    # it is kept because the file targets the older API -- drop it on upgrade.
    search = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          scoring=scoring,
                          n_jobs=4,
                          iid=False,
                          cv=cv)
    search.fit(X, y)
    if header is not None:
        print(header)
    print('Best params: {}'.format(search.best_params_))
    print('Best score: {}'.format(np.sqrt(-search.best_score_)))
    return search


def _widen_search_range(best, candidates, name, floor):
    """Return shifted candidates when ``best`` is on an edge, else ``candidates`` itself."""
    upper, lower = max(candidates), min(candidates)
    if best == upper:
        print('Best {} at max limit of initial range...'.format(name))
        return list(range(upper, upper + 6, 2))
    if best == lower:
        print('Best {} at min limit of initial range...'.format(name))
        # Keep the current best in the grid and drop values below the
        # smallest valid setting.
        shifted = [value for value in range(lower - 6, lower + 1, 2) if value >= floor]
        return shifted or [lower]
    return candidates


def _boundary_warning(name, value, candidates, check_min=True):
    """Return a message when ``value`` is on the edge of ``candidates``, else None."""
    if value == max(candidates):
        return '{}: Optimal parameter {} at max of search range'.format(name, value)
    if check_min and value == min(candidates):
        return '{}: Optimal parameter {} at min of search range'.format(name, value)
    return None
xg_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.075,
    max_depth=7,
    min_child_weight=5,
    eval_metric='rmse',
    seed=1337,
    objective='reg:squarederror',
)
# Early stopping watches RMSE on (X_test, y_test), the only eval_set entry.
xg_model.fit(
    X_train,
    y_train,
    early_stopping_rounds=10,
    eval_set=[(X_test, y_test)],
    verbose=False,
)

predictions = xg_model.predict(X_test)

# Number of boosting rounds for which a metric value was recorded.
max_estimators = len(xg_model.evals_result()['validation_0']['rmse'])
print(max_estimators)

# RMSE per boosting round as a one-column frame, then plotted.
rmse_per_round = xg_model.evals_result()['validation_0']['rmse']
max_estim_rmse = pd.DataFrame(rmse_per_round, columns=['rmse'])
plt.plot(max_estim_rmse)
plt.ylabel("RMSE")
plt.xlabel("Max Estimators")

xgb.plot_importance(xg_model)
plt.show()

# In[21]:

squared_error = mean_squared_error(predictions, y_test)
rmse_rf = sqrt(squared_error)
print("RMSE:", round(rmse_rf, 2))

# In[22]:
class VanillaModelRegression(Model):
    """XGBoost regression model driven by a configuration mapping.

    Constructing an instance immediately trains the model (``__init__`` calls
    ``Initialize``, which calls ``build_best_prediction``).

    Configuration keys read by this class:

    * ``annotation`` (optional) -- overrides the default annotation text.
    * ``model`` -- mapping with ``target`` (sequence; the first entry is
      used), ``data_provider`` (name of a provider section),
      ``input_features``, ``n_estimators``, ``max_depth``, ``learning_rate``
      and ``output_filename``.
    * ``<provider name>`` -- mapping with ``input_file``.
    """

    def __init__(self, configuration):
        self._configuration = configuration
        # Cache of objects created through the factory methods, keyed by name.
        self._objects = {}
        self._annotation = 'Performance comparision of different MVA discriminants'
        if 'annotation' in self._configuration:
            self._annotation = self._configuration['annotation']
        self.my_model = None
        self.fit_results = None
        self.Initialize()

    @log_with()
    def Initialize(self):
        """Train the model; called once from ``__init__``."""
        self.build_best_prediction()

    @log_with()
    def get(self, name):
        """Return the cached object registered under ``name``, or ``None``."""
        return self._objects.get(name)

    @log_with()
    def get_data_provider(self, provider_name):
        """Return the data provider called ``provider_name``, creating it on first use.

        Raises ``NotImplementedError`` when the configured ``input_file``
        does not contain ``'.csv'``.
        """
        from dataprovider import PandasDataProviderFromCSV_original

        if provider_name in self._objects:
            return self._objects[provider_name]

        input_file = self._configuration[provider_name]['input_file']
        if '.csv' not in input_file:
            raise NotImplementedError
        self._objects[provider_name] = PandasDataProviderFromCSV_original(input_file)
        return self._objects[provider_name]

    @log_with()
    def build_best_prediction(self):
        """Train the regressor, print test metrics and pickle the fitted model.

        Side effects: sets ``self.my_model`` and ``self.fit_results`` and
        writes the model to ``configuration['model']['output_filename']``.
        """
        print("Dummy building vanilla model!")
        from xgboost import XGBRegressor
        from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

        model_config = self._configuration['model']
        target_variable_names = model_config['target'][0]
        data_provider = self.get_data_provider(model_config['data_provider'])
        input_features_names = model_config['input_features']

        X_train = data_provider.train[input_features_names]
        y_train = data_provider.train[target_variable_names]
        X_test = data_provider.test[input_features_names]
        y_test = data_provider.test[target_variable_names]

        eval_set = [(X_train, y_train), (X_test, y_test)]
        self.my_model = XGBRegressor(
            n_estimators=model_config['n_estimators'],
            max_depth=model_config['max_depth'],
            learning_rate=model_config['learning_rate'],
            verbosity=0)
        self.my_model.fit(X_train,
                          y_train,
                          eval_metric=["rmse", "mae"],
                          eval_set=eval_set,
                          verbose=False)

        # The original called the bare name `my_model`, which is undefined
        # in this scope and raised NameError.
        y_pred = self.my_model.predict(X_test)

        print("Explained variance score: ",
              explained_variance_score(y_test, y_pred))
        print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))
        print("Mean squared error: ", mean_squared_error(y_test, y_pred))

        # Per-round rmse/mae history for both entries of eval_set.
        self.fit_results = self.my_model.evals_result()

        # Context manager so the output file is flushed and closed even if
        # pickling fails.
        with open(model_config['output_filename'], 'wb') as model_file:
            pickle.dump(self.my_model, model_file)
def xgb_train_and_predict(column_to_predict, train_data, evaluation_data, data_path):
    """
    Train an XGBoost model on ``column_to_predict`` and predict ``evaluation_data``.

    Rows of ``train_data`` with a missing feature or target are dropped, 10%
    of the remainder is split off through ``ratio_split`` to drive early
    stopping, and the rounded-up predictions are stored in a new ``output``
    column of ``evaluation_data`` (which is modified in place).

    :param column_to_predict: name of the target column in ``train_data``
    :param train_data: frame holding the features, the target, and the
        ``cantine_nom`` / ``cantine_type`` / ``date_str`` columns used for logging
    :param evaluation_data: frame holding the same feature columns
    :param data_path: directory containing ``calculators/menus.json``, whose
        keys are appended to the feature list
    :return: ``(evaluation_data, feature_importance_list)``
    :raises EmptyTrainingSet: when no training row is left after dropping NaNs
    """
    logger.info("----------- check training data -------------")
    for resolution, dtf in train_data.groupby(['cantine_nom', 'cantine_type']):
        logger.info(
            "canteen %s has %s days of history to train on starting on %s and ending on %s",
            resolution,
            len(dtf),
            dtf["date_str"].min(),
            dtf['date_str'].max(),
        )

    features = [
        "site_id",
        # "date_str",
        # "cantine_nom",
        # "site_type_cat",
        "secteur_cat",
        # "year",
        # "month",
        # "day",
        "week",
        "wednesday",  # this feature is only used if the dedicated parameter include_wednesday is set to True
        # "weekday",  # weekday is not used here because redundant with meal composition
        "holidays_in",
        "non_working_in",
        "effectif",
        "frequentation_prevue",
        "Events.RAMADAN_ago",
        # "Events.AID_ago"
    ]
    with open(os.path.join(data_path, "calculators/menus.json")) as f_in:
        dict_special_dishes = json.load(f_in)
    features = features + list(dict_special_dishes.keys())

    # Prepare the training dataset. dropna() returns a new frame, so the
    # column selection is never modified in place.
    selected_columns = train_data[features + [column_to_predict]]
    before_dropping_na = len(selected_columns)
    train_data_reduced = selected_columns.dropna()
    after_dropping_na = len(train_data_reduced)
    # Guard the ratio: an empty input must end in EmptyTrainingSet below,
    # not in a ZeroDivisionError here.
    if before_dropping_na:
        percent_dropped = round(100 * (before_dropping_na - after_dropping_na) / before_dropping_na)
        logger.info("Dropping %s percent of training data due to NANs", percent_dropped)

    train_data_x = train_data_reduced[features]
    train_data_y = train_data_reduced[column_to_predict]
    if len(train_data_x) == 0:
        raise EmptyTrainingSet("")

    # Prepare a held-out set to control overfitting.
    train_data_x, train_data_y, test_data_x, test_data_y = ratio_split(
        train_data_x, train_data_y, 0.1)
    eval_set = [(train_data_x, train_data_y), (test_data_x, test_data_y)]

    # Prepare the prediction dataset.
    evaluation_data_x = evaluation_data[features]

    params = {
        'base_score': train_data_y.mean(),
        "objective": 'reg:squarederror',
        "n_estimators": 5000,
        "learning_rate": 0.09,
        "max_depth": 5,
        "booster": 'gbtree',
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 1,
        "gamma": 0,
        "importance_type": 'gain',
        "max_delta_step": 0,
        "min_child_weight": 1,
        "missing": None,
        "n_jobs": mp.cpu_count(),
        "nthread": None,
        "random_state": 0,
        "reg_alpha": 0,
        "reg_lambda": 1,
        "scale_pos_weight": 1,
        "seed": None,
        "subsample": 1,
        "verbosity": 0,
    }

    # Define and train the model; n_estimators is an upper bound because
    # training stops after 100 rounds without improvement.
    model = XGBRegressor(**params)
    model.fit(train_data_x,
              train_data_y,
              early_stopping_rounds=100,
              eval_set=eval_set,
              eval_metric=multi_custom_metrics,
              verbose=False)

    # Predict values, rounded up to whole units.
    evaluation_data['output'] = np.ceil(model.predict(evaluation_data_x))

    logger.info("----------- check predictions -------------")
    for resolution, dtf in evaluation_data.groupby(['cantine_nom', 'cantine_type']):
        logger.info(
            "canteen %s has predictions for %s days starting on %s and ending on %s",
            resolution,
            len(dtf),
            dtf["date_str"].min(),
            dtf['date_str'].max(),
        )

    logger.info("----------- evaluate model -------------")
    feature_importance_list = evaluate_feature_importance(evaluation_data_x, model)
    plot_curve(model.evals_result(), "nantes_metropole_xgb")
    return evaluation_data, feature_importance_list
from sklearn.metrics import r2_score

boston = load_boston()
x, y = boston.data, boston.target

x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66)

xgb = XGBRegressor(n_estimators=10, learning_rate=0.1)

# Metrics mentioned by the original author: rmse, mae, logloss, error, auc.
xgb.fit(
    x_train,
    y_train,
    verbose=True,
    eval_metric=["rmse", "logloss"],
    eval_set=[(x_train, y_train), (x_test, y_test)],
    early_stopping_rounds=20,
)

y_pre = xgb.predict(x_test)
r2 = r2_score(y_test, y_pre)
score = xgb.score(x_test, y_test)
# Per-round metric history for both entries of eval_set.
result = xgb.evals_result()

print(__file__)
print(result)
# Print each score under its own heading line.
for heading, value in (("r2", r2), ("score", score)):
    print(heading)
    print(value)
# Leftover from an earlier Keras version of this script:
# # train the model
# print("[INFO] training model...")
# model.fit(Xtrain, Ytrain, validation_data=(Xvalid, Yvalid),
#           epochs=10, batch_size=20)
#dtrain=xgb.DMatrix(Xtrain,label=Ytrain)
#dvalid=xgb.DMatrix(Xvalid,label=Yvalid)

# 10-fold cross-validation of `bst`, run on the validation split.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)
vali = cross_val_score(bst, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
#print(bst.get_params())

print("####################Xgboost")
# Fit on the training split while recording rmse and mae for both splits.
watchlist = [(Xtrain, Ytrain), (Xvalid, Yvalid)]
trainbst = bst.fit(Xtrain, Ytrain, eval_set=watchlist, eval_metric=['rmse', 'mae'], verbose=True)
evres = bst.evals_result()

# Mean cross-validation score.
print(vali.mean())

# RMSE curves: validation_0 is the training split, validation_1 the validation split.
for split_key in ('validation_0', 'validation_1'):
    plt.plot(list(evres[split_key]['rmse']))
plt.title('Model rmse')
plt.ylabel('rmse')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
#plt.savefig("Keras_NN_Accuracy.png")
plt.show()
plt.clf()

# MAE curves for the same two splits.
for split_key in ('validation_0', 'validation_1'):
    plt.plot(list(evres[split_key]['mae']))
plt.title('Model mae')
def xgb_interval_train_and_predict(column_to_predict, train_data, evaluation_data, confidence_interval, data_path):
    """
    Train two XGBoost models bounding ``column_to_predict`` and predict ``evaluation_data``.

    The models do not learn ``column_to_predict`` directly but the bounds of
    a confidence interval around it, using the ``log_cosh_quantile`` objective;
    see https://towardsdatascience.com/confidence-intervals-for-xgboost-cac2955a8fde

    ``evaluation_data`` is modified in place: ``pred_lower_bound`` and
    ``pred_upper_bound`` hold the rounded-up predictions of the two models
    and ``output`` holds their element-wise maximum.

    :param column_to_predict: name of the target column in ``train_data``
    :param train_data: frame holding the features, the target, and the
        ``cantine_nom`` / ``cantine_type`` / ``date_str`` columns used for logging
    :param evaluation_data: frame holding the same feature columns
    :param confidence_interval: width of the interval; each model uses the
        quantile ``(1 - confidence_interval) / 2`` away from one end
    :param data_path: directory containing ``calculators/menus.json``, whose
        keys are appended to the feature list
    :return: ``(evaluation_data, feature_importance_list)``
    :raises EmptyTrainingSet: when no training row is left after dropping NaNs
    """
    features = [
        "site_id",
        "secteur_cat",
        "week",
        "wednesday",  # this feature is only used if the dedicated parameter include_wednesday is set to True
        "non_working_in",
        "holidays_in",
        "effectif",
        "frequentation_prevue",
        "Events.RAMADAN_ago",
        # "Events.AID_ago"
    ]

    logger.info("----------- check training data -------------")
    for resolution, dtf in train_data.groupby(['cantine_nom', 'cantine_type']):
        logger.info("canteen %s has %s days of history to train on starting on %s and ending on %s",
                    resolution,
                    len(dtf),
                    dtf["date_str"].min(),
                    dtf['date_str'].max(),
                    )

    with open(os.path.join(data_path, "calculators/menus.json")) as f_in:
        dict_special_dishes = json.load(f_in)
    features = features + list(dict_special_dishes.keys())

    # Prepare the training dataset. dropna() returns a new frame, so the
    # column selection is never modified in place.
    selected_columns = train_data[features + [column_to_predict]]
    before_dropping_na = len(selected_columns)
    train_data_reduced = selected_columns.dropna()
    after_dropping_na = len(train_data_reduced)
    # Guard the ratio: an empty input must end in EmptyTrainingSet below,
    # not in a ZeroDivisionError here.
    if before_dropping_na:
        percent_dropped = round(100 * (before_dropping_na - after_dropping_na) / before_dropping_na)
        logger.info("Dropping %s percent of training data due to NANs", percent_dropped)

    train_data_y = train_data_reduced[column_to_predict]
    train_data_x = train_data_reduced[features]
    if len(train_data_x) == 0:
        raise EmptyTrainingSet("")

    # Prepare a held-out set to control overfitting.
    train_data_x, train_data_y, test_data_x, test_data_y = ratio_split(train_data_x, train_data_y, 0.1)
    eval_set = [(train_data_x, train_data_y), (test_data_x, test_data_y)]

    # Prepare the prediction dataset.
    evaluation_data_x = evaluation_data[features]

    params = {
        "n_jobs": mp.cpu_count(),
        'base_score': train_data_y.mean(),
        "objective": 'reg:squarederror',
        "n_estimators": 5000,
        "learning_rate": 0.09,
        "max_depth": 5,
        "booster": 'gbtree',
        "importance_type": 'gain',
        "max_delta_step": 0,
        "min_child_weight": 1,
        "random_state": 0,
        "reg_alpha": 0,
        "reg_lambda": 1,
        "scale_pos_weight": 1,
        "subsample": 1,
        "verbosity": 0,
    }
    # Probability mass left outside the interval on each side.
    confidence_step = (1 - confidence_interval) / 2

    # Upper-bound model: objective built for the (1 - confidence_step) quantile.
    params.update({"objective": log_cosh_quantile(1 - confidence_step)})
    confidence_upper_bound_model = XGBRegressor(**params)
    confidence_upper_bound_model.fit(
        train_data_x, train_data_y,
        early_stopping_rounds=100,
        eval_set=eval_set,
        eval_metric=multi_custom_metrics,
        verbose=False)
    y_upper_smooth = np.ceil(confidence_upper_bound_model.predict(evaluation_data_x))

    # Lower-bound model: objective built for the confidence_step quantile.
    # NOTE(review): unlike the upper-bound model this fit has no eval_set or
    # early stopping, so all 5000 rounds are trained -- confirm intended.
    params.update({"objective": log_cosh_quantile(confidence_step)})
    confidence_lower_bound_model = XGBRegressor(**params)
    confidence_lower_bound_model.fit(train_data_x, train_data_y, verbose=False)
    y_lower_smooth = np.ceil(confidence_lower_bound_model.predict(evaluation_data_x))

    evaluation_data['pred_lower_bound'] = y_lower_smooth
    evaluation_data['pred_upper_bound'] = y_upper_smooth
    # Element-wise maximum of the two bounds.
    evaluation_data['output'] = np.maximum.reduce([y_upper_smooth, y_lower_smooth])

    logger.info("----------- check predictions -------------")
    for resolution, dtf in evaluation_data.groupby(['cantine_nom', 'cantine_type']):
        logger.info("canteen %s has predictions for %s days starting on %s and ending on %s",
                    resolution,
                    len(dtf),
                    dtf["date_str"].min(),
                    dtf['date_str'].max(),
                    )

    logger.info("----------- evaluate model -------------")
    feature_importance_list = evaluate_feature_importance(evaluation_data_x, confidence_upper_bound_model)
    ## Generates errors on Windows with Reticulate
    plot_curve(confidence_upper_bound_model.evals_result(), "nantes_metropole_xgb")
    return evaluation_data, feature_importance_list
x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=66
)

# The number of trees (n_estimators) plays the role of epochs in deep learning.
model = XGBRegressor(n_estimators=100, learning_rate=0.1)

# eval_set order defines the keys of evals_result():
#   validation_0 -> (x_train, y_train), validation_1 -> (x_test, y_test)
# e.g. {'validation_0': {'rmse': [21.584942, 19.552324, 17.718475]},
#       'validation_1': {'rmse': [21.684599, 19.621567, 17.763321]}}
# Among train / test / validation, the validation metric is the important one.
# Available metrics: rmse, mae, logloss, error (the complement of accuracy),
# auc (a companion of accuracy).
model.fit(
    x_train,
    y_train,
    verbose=True,
    eval_metric=["logloss", "rmse"],
    eval_set=[(x_train, y_train), (x_test, y_test)],
    early_stopping_rounds=100,
)

results = model.evals_result()  # metric history, specific to XGBoost
print("eval's results : ", results)

y_pred = model.predict(x_test)
# r2_score takes (y_true, y_pred). R2 is not symmetric, so the original
# call with the arguments swapped reported a different number.
r2 = r2_score(y_test, y_pred)
# print("r2 Score : %.2f%%:" %(r2*100.0))
print("r2 : ", r2)

import matplotlib.pyplot as plt

epochs = len(results['validation_0']['logloss'])  # number of recorded rounds
x_axis = range(0, epochs)
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
return (X_train, y_train), (X_test, y_test) # Get training and test data target_colnames = ['Open_x', 'High_x', 'Low_x', 'Close_x'] Path('./Feature_Engineering').mkdir(parents=True, exist_ok=True) for colname in target_colnames: (X_train, y_train), (X_test, y_test) = get_feature_importance_data(data, column=colname, include_targets=False) regressor = XGBRegressor(gamma=0.0, n_estimators=200, learning_rate=0.05) xgbModel = regressor.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False) eval_result = regressor.evals_result() training_rounds = range(len(eval_result['validation_0']['rmse'])) ######################### # Train Validation Plot # ######################### # plt.scatter(x=training_rounds, y=eval_result['validation_0']['rmse'], label='Training Error') # plt.scatter(x=training_rounds, y=eval_result['validation_1']['rmse'], label='Validation Error') # plt.xlabel('Iterations') # plt.ylabel('RMSE') # plt.title('Training Vs. Validation Error') # plt.legend() # plt.savefig(f'./Feature_Engineering/{colname}_train_val_history.png')