def model_intrv3(Y_train, X_train, Y_test, X_test, Targ):
    global reslts
    global metrs
    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from xgboost.sklearn import XGBRegressor

    # XGBRegressor has no 'loss' parameter (that belongs to scikit-learn's
    # GradientBoostingRegressor); the squared-error objective is used instead.
    model = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4,
                         random_state=0, subsample=0.9, colsample_bytree=1.0,
                         objective='reg:squarederror').fit(X_train, Y_train)
    pred_Yxgb = model.predict(X_test)
    mse = mean_squared_error(Y_test, pred_Yxgb)
    # RMSE normalized by the target mean (an alternative is max(Targ))
    nRMSE = np.sqrt(mse) / Targ.mean()
    Yts_pd = pd.DataFrame({'Yts': Y_test, 'Ypd': pred_Yxgb})
    print(mse, nRMSE)
    metrs = {'mse': mse, 'nRMSE': nRMSE}
    reslts = {'Ypred': pred_Yxgb, 'Yts_pd': Yts_pd}
    return {'Yts_pd': Yts_pd, 'mse': mse, 'nRMSE': nRMSE}
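# Usage sketch for model_intrv3 on synthetic data (illustration only; the
# frame and column names below are hypothetical, not part of the pipeline):
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.rand(500, 4), columns=['f0', 'f1', 'f2', 'f3'])
y_demo = 3 * X_demo['f0'] - 2 * X_demo['f1'] + 0.1 * rng.rand(500)
Xtr, Xts, ytr, yts = train_test_split(X_demo, y_demo, test_size=0.3,
                                      random_state=0)
out = model_intrv3(ytr, Xtr, yts, Xts, Targ=y_demo)
print(out['nRMSE'])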
def run_xgb(output_train, df_X_train, df_Y_train, output_test, df_X_test, df_Y_test):
    from sklearn.model_selection import GridSearchCV  # needed for the search below

    xgb_estimator = XGBRegressor()
    param_grid = {
        'nthread': [4],  # with hyperthreading, xgboost may become slower
        'objective': ['reg:squarederror'],  # 'reg:linear' is the deprecated alias
        'learning_rate': [0.03, 0.05, 0.07],  # the so-called `eta` value
        'max_depth': [5, 6, 7],
        'min_child_weight': [4],
        'subsample': [0.7],
        'colsample_bytree': [0.7],
        'n_estimators': [30],
    }
    xgb_grid = GridSearchCV(xgb_estimator, param_grid)
    xgb_grid.fit(df_X_train, df_Y_train.cnt)
    opt_pars = xgb_grid.best_params_

    # Refit with the best parameters found by the grid search
    xgb_opt = XGBRegressor(random_state=1).set_params(**opt_pars)
    xgb_opt.fit(df_X_train, df_Y_train.cnt)
    r2_train = xgb_opt.score(df_X_train, df_Y_train.cnt)
    r2_test = xgb_opt.score(df_X_test, df_Y_test.cnt)
    result = df_proc.compare_results("XGBoost", xgb_opt, output_train,
                                     df_X_train, output_test, df_X_test)
    return {"r2": [r2_train, r2_test],
            "R2": [result[1], result[2]],
            "plot": result[0]}
def R2_estimator(_x_train, _x_test, _y_train, _y_test):
    train_score = []
    test_score = []
    for n in range(1, 100):
        # Note: min_samples_split is a scikit-learn tree parameter; the
        # XGBoost equivalent used here is min_child_weight.
        reg = XGBRegressor(n_estimators=n, min_child_weight=7, max_depth=5)
        reg.fit(_x_train, _y_train)
        train_score.append(reg.score(_x_train, _y_train))
        test_score.append(reg.score(_x_test, _y_test))
        print(n, 'estimators done!')
    plt.plot(range(1, 100), train_score, color='skyblue', label='training')
    plt.plot(range(1, 100), test_score, color='red', label='testing')
    plt.legend()
    plt.xlabel('number of estimators')
    plt.ylabel('$R^2$ score')
    plt.show()
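# A cheaper alternative sketch: rather than refitting one model per value of
# n_estimators, a single fit can record a per-round metric via eval_set and
# evals_result(). This reports RMSE per boosting round instead of R^2
# (assumes xgboost >= 1.6, where eval_metric is a constructor argument).
def rmse_per_round(_x_train, _x_test, _y_train, _y_test, n_rounds=99):
    reg = XGBRegressor(n_estimators=n_rounds, min_child_weight=7,
                       max_depth=5, eval_metric='rmse')
    reg.fit(_x_train, _y_train,
            eval_set=[(_x_train, _y_train), (_x_test, _y_test)],
            verbose=False)
    history = reg.evals_result()
    plt.plot(history['validation_0']['rmse'], label='training')
    plt.plot(history['validation_1']['rmse'], label='testing')
    plt.legend()
    plt.xlabel('number of estimators')
    plt.ylabel('RMSE')
    plt.show()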
def xg_boost(_features, _x_train, _x_test, _y_train, _y_test,
             store=True, load=False, silent=False):
    if load:
        reg_xgb = joblib.load('XGB')
    else:
        reg_xgb = XGBRegressor(n_estimators=200, min_child_weight=7,
                               max_depth=5, n_jobs=-1,
                               verbosity=0)  # 'silent' is deprecated
        reg_xgb.fit(_x_train, _y_train)
        if store:
            # Dump under the same name the load branch expects
            joblib.dump(reg_xgb, 'XGB')
    score = reg_xgb.score(_x_test, _y_test)
    print("\nXG Boosting:")
    print('Training Accuracy:\t', reg_xgb.score(_x_train, _y_train))
    print('Testing Accuracy:\t', score)
    if not silent:
        print('\nImportance for each:')
        importance = []
        for i in range(len(_features)):
            importance.append([_features[i], reg_xgb.feature_importances_[i]])
        importance.sort(key=lambda x: x[1], reverse=True)
        for each in importance:
            print(each[0] + ':\t', each[1])
    return reg_xgb, score
def XGBoostPredictor(X_train, y_train, X_test, y_test):
    from math import sqrt
    from sklearn.metrics import mean_squared_error, r2_score

    # Fit XGB regressor
    xboost = XGBRegressor(n_estimators=200)
    xboost.fit(X_train, y_train)
    xgb_score = xboost.score(X_test, y_test)

    # Predict
    xboost_pred = xboost.predict(X_test)
    xgboostRMSE = sqrt(mean_squared_error(y_test, xboost_pred))
    print("Root mean squared error: %.2f" % xgboostRMSE)
    print('R-squared for XGBoost: %.2f' % r2_score(y_test, xboost_pred))
    plt.scatter(y_test, xboost_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('XGBoost Predicted vs Actual')
    plt.show()
    chart_regression(xboost_pred, y_test, 'XGBoost Predictor')
    return xgb_score, xgboostRMSE
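# chart_regression is not defined in this snippet; a minimal sketch of a
# common implementation (expected vs. predicted, sorted by the expected
# value) is assumed here:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def chart_regression(pred, y, title):
    t = pd.DataFrame({'pred': pred, 'y': np.asarray(y).flatten()})
    t = t.sort_values(by='y')
    plt.plot(t['y'].to_list(), label='expected')
    plt.plot(t['pred'].to_list(), label='prediction')
    plt.ylabel('output')
    plt.title(title)
    plt.legend()
    plt.show()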
def xgb(X_train, X_test, y_train, y_test):
    mod = XGBRegressor(learning_rate=0.2, objective='reg:squarederror')
    estimators = np.arange(1, 200, 10)
    scores = []
    estim = []
    for n in estimators:
        mod.set_params(n_estimators=n)
        mod.fit(X_train, y_train)
        scores.append(mod.score(X_test, y_test))
        estim.append(n)
    xdf = pd.DataFrame({'Estimator': estim, 'Score': scores})
    # Pick the (first) estimator count with the highest test score
    best = int(xdf.loc[xdf['Score'].idxmax(), 'Estimator'])
    xgbr = XGBRegressor(n_estimators=best, learning_rate=0.2,
                        objective='reg:squarederror')
    xgbr.fit(X_train, y_train)
    return xgbr
                                                    datasets.target,
                                                    train_size=0.8,
                                                    random_state=104)  #2

# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)  #3
# Note: use_label_encoder and the 'mlogloss' metric apply to classification;
# for a regressor, 'rmse' is an appropriate eval metric.
model.fit(x_train, y_train, eval_metric='rmse')  #4

acc = model.score(x_test, y_test)
print(model.feature_importances_)
print('acc : ', acc)

'''
def plot_feature_importances_dataset(model):
    n_features = datasets.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), datasets.feature_names)
    plt.xlabel("Feature Importances")
    plt.ylim(-1, n_features)

plot_feature_importances_dataset(model)
'''
# > XGBoost (Extreme Gradient Boosting) belongs to a family of boosting
# > algorithms and uses the gradient boosting (GBM) framework at its core.
# > It is an optimized distributed gradient boosting library. But wait,
# > what is boosting? Well, keep on reading.

# In[ ]:

# Initialize model
from xgboost.sklearn import XGBRegressor

XGB_Regressor = XGBRegressor()

# Fit the model on our data
XGB_Regressor.fit(X_train, y_train)

# In[ ]:

# Score model
XGB_Regressor.score(X_train, y_train)

# <a id="76"></a> <br>
# ## 7-6 LassoCV
# Lasso linear model with iterative fitting along a regularization path.
# The best model is selected by cross-validation.

# In[ ]:

lasso = LassoCV()

# In[ ]:

# Fit the model on our data
lasso.fit(X_train, y_train)
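# In[ ]:

# Held-out evaluation sketch: scoring on the training data, as above, tends
# to overstate performance. Assuming X_test/y_test exist from the same
# train/test split, the test-set R^2 is the more informative number:
print('XGBRegressor R^2 (test):', XGB_Regressor.score(X_test, y_test))
print('LassoCV R^2 (test):', lasso.score(X_test, y_test))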
}
xg = XGBRegressor(random_state=96,
                  objective='reg:squarederror')  # 'reg:linear' is deprecated
gridsearch = GridSearchCV(xg, param_grid=grid, cv=5)
gridsearch.fit(X_train, y_train)
print(gridsearch.best_score_)
print(gridsearch.best_params_)

xgb = XGBRegressor(random_state=96, objective='reg:squarederror',
                   min_child_weight=6, n_estimators=1000, max_depth=7,
                   colsample_bytree=0.6)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
accuracy = xgb.score(X_test, y_test)
print('Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%')
print(mean_absolute_error(y_test, xgb_pred))
print(mean_squared_error(y_test, xgb_pred))
print(np.sqrt(mean_squared_error(y_test, xgb_pred)))

lgb = LGBMRegressor(objective='regression')
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
accuracy = lgb.score(X_test, y_test)
print('Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%')
print(mean_absolute_error(y_test, lgb_pred))
print(mean_squared_error(y_test, lgb_pred))
print(np.sqrt(mean_squared_error(y_test, lgb_pred)))

from yellowbrick.regressor import ResidualsPlot
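# A minimal usage sketch for the ResidualsPlot import above (standard
# yellowbrick API): wrap the estimator, fit on train, score on test, draw.
visualizer = ResidualsPlot(XGBRegressor(random_state=96,
                                        objective='reg:squarederror'))
visualizer.fit(X_train, y_train)   # fit the wrapped regressor
visualizer.score(X_test, y_test)   # compute residuals on the test split
visualizer.show()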
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request as urllib2
from bs4 import BeautifulSoup
import json

df = pd.read_csv('dataset_cleaned.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # drop leftover index columns
y = df['Value']
X = df.drop(['Value'], axis=1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=1234)

from xgboost.sklearn import XGBRegressor
xboost = XGBRegressor(n_estimators=200)
xboost.fit(X_train, y_train)
xgb_score = xboost.score(X_test, y_test)
print(xgb_score)
def getTunedXGBoostModel(X_train, Y_train, X_test, Y_test):
    """Greedy stage-wise tuning: each hyperparameter is grid-searched in turn,
    and the best value found is fixed before tuning the next one."""
    stages = [
        ('n_estimators', list(range(20, 101, 10))),
        ('min_child_weight', list(range(1, 12))),
        ('max_depth', list(range(3, 12))),
        ('gamma', [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]),
        ('subsample', [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]),
        ('colsample_bytree', [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]),
        ('reg_alpha', [1e-5, 1e-2, 0.1, 1, 100]),
        ('reg_lambda', [1e-5, 1e-2, 0.1, 1, 100]),
    ]
    best = {}
    for name, values in stages:
        # note: the old `iid` argument was removed in scikit-learn 0.24
        gsearch = GridSearchCV(
            estimator=XGBRegressor(learning_rate=0.1, random_state=10, **best),
            param_grid={name: values}, n_jobs=4, cv=5)
        gsearch.fit(X_train, Y_train.values.ravel())
        best[name] = gsearch.best_params_[name]

    print("\n----------------------------------")
    print("Tuned XGBoost params:")
    for name, value in best.items():
        print(name + ":", value)
    print("----------------------------------\n")

    # Proportionally decrease the learning rate while increasing the number of
    # estimators, and pick the best combination on the held-out test set.
    n_estimators = best.pop('n_estimators')
    candidates = [(0.01, n_estimators * 10),
                  (0.05, n_estimators * 2),
                  (0.1, n_estimators),
                  (0.2, n_estimators // 2)]
    best_model, best_score = None, float('-inf')
    for lr, n_est in candidates:
        model = XGBRegressor(learning_rate=lr, n_estimators=n_est,
                             random_state=10, **best)
        model.fit(X_train, Y_train.values.ravel())
        score = model.score(X_test, Y_test)
        if score > best_score:
            best_model, best_score = model, score
    return best_model
text = "Accuracy: " + str(accuracy_dt) + "\nMean Squared Error: " + str(mse_dt) + "\nMean Absolute Error: " + str(mae_dt) output_file.write(text) scatter_plot(target_test, predicted_sales) plt.savefig('images/predict/dt.png') plt.close() ### XGB REGRESSOR ### xgbr = XGBRegressor(objective='reg:linear', nthread= 4, n_estimators= 500, max_depth= 6, learning_rate= 0.5) xb = xgbr.fit(other_train,target_train) predicted_sales = xgbr.predict(other_test) mae_xgbr = round(mean_absolute_error(target_test, predicted_sales),3) mse_xgbr = round(mean_squared_error(target_test, predicted_sales),3) accuracy_xgbr = round(xgbr.score(other_test, target_test),3) #write results to output file output_file.write("\n------------------------------------\n") output_file.write("XGB REGRESSOR STATISTICS:\n") output_file.write("------------------------------------\n") text = "Accuracy: " + str(accuracy_xgbr) + "\nMean Squared Error: " + str(mse_xgbr) + "\nMean Absolute Error: " + str(mae_xgbr) output_file.write(text) scatter_plot(target_test, predicted_sales) plt.savefig('images/predict/xgbr.png') plt.close() output_file.close()
xgb_reg.fit(X_train, y_train)

# In[440]:

y_pred = xgb_reg.predict(X_test)
xgb_mse = mean_squared_error(y_test, y_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

# In[442]:

xgb_reg.score(X_test, y_test)

# In[444]:

# Offline I used cv=8
from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_reg, X_test, y_test,
                         scoring="neg_mean_squared_error", cv=2)
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)
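# display_scores is not defined in this snippet; a minimal sketch matching
# its usual role (summarizing cross-validation RMSE scores) is assumed:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())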
                        n_estimators=160,
                        max_depth=6,
                        min_child_weight=3,
                        gamma=0,
                        subsample=0.7,
                        colsample_bytree=0.7,
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)
grid = GridSearchCV(estimator=xgb_best, param_grid=param_test, cv=5)
grid.fit(source_X, source_y)
print(grid.cv_results_)  # grid_scores_ was removed from scikit-learn; cv_results_ replaces it
print(grid.best_estimator_)

xgb_best.fit(train_X, train_y)
print(xgb_best.score(test_X, test_y))

xgb_param = xgb_best.get_xgb_params()
# Note: 'auc' is a classification metric; for a 'count' regression target,
# 'rmse' would be the usual choice here.
xgb.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=15,
       metrics=['auc'], early_stopping_rounds=50, stratified=True, seed=1301)

full_xy = pd.concat([source_X, source_y], axis=1)
target = 'count'
# Refit Xtreme Gradient Boosting
xgb_tree = XGBRegressor(max_depth=2,
                        learning_rate=0.10777777777777778,
                        n_estimators=300,
                        min_child_weight=5,
                        colsample_bytree=1,
                        gamma=0,
                        reg_lambda=1,
                        reg_alpha=0.30000000000000004,
                        subsample=2/3,
                        random_state=0)
xgb_tree.fit(x_train, y_train)

# Training R sq: 95.65%
print('Training Score:', xgb_tree.score(x_train, y_train))
# Training RMSE: .08271
print('Training RMSE:', math.sqrt(mean_squared_error(y_train, xgb_tree.predict(x_train))))
# Test R sq: 91.03%
print('Test Score:', xgb_tree.score(x_test, y_test))
# Test RMSE: .12411
print('Test RMSE:', math.sqrt(mean_squared_error(y_test, xgb_tree.predict(x_test))))

##############################################################################
# Random Forest Test R sq: 89.25%, RMSE: 0.13589
rf_test_pred = rf_tree.predict(house_test_x)
rf_test_pred = np.exp(rf_test_pred)  # back-transform from the log scale
fit_params={"early_stopping_rounds":20, "eval_metric" : 'rmse', "eval_set" : [(X_val, y_val.reshape(-1))], 'verbose': 1, } model.fit(X_train, y_train.reshape(-1), **fit_params) """# Métrica $$ \textrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2} $$ """ score = model.score(X_val, y_val) if log_output: y_pred_train = np.exp(model.predict(X_train)*max_log_y) y_pred = np.exp(model.predict(X_val)*max_log_y) y_pred_test = np.exp(model.predict(X_test)*max_log_y) else: y_pred_train = model.predict(X_train)*y_std + y_mean y_pred = model.predict(X_val)*y_std + y_mean y_pred_test = model.predict(X_test)*y_std + y_mean # Train train_RMSE = np.sqrt((((df_train['Sales'].values - y_pred_train)/df_train['Sales'].values)**2).sum()/len(y_pred_train)) # Validación val_RMSE = np.sqrt((((df_val['Sales'].values - y_pred)/df_val['Sales'].values)**2).sum()/len(y_pred))
xbr.fit(x_train, y_train)
predict = xbr.predict(x_test)

# from xgboost import plot_importance
# plot_importance(xbr)
# plt.show()

print('************************** XBR Evaluation ********************')
x_test['predict'] = predict
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE", rmse)
print("MAE", mean_absolute_error(y_test, predict))
print("r2_score", r2_score(y_test, predict))
print(xbr.score(x_train, y_train))

# xbr.plot_importance(model)
# plt.rcParams['figure.figsize'] = [5, 5]
# plt.show()

# print("********************* XGB MODEL *******************")
# print("best_ntree_limit", model.best_ntree_limit)
# print("best_score", model.best_score)
# print("best_iteration", model.best_iteration)
# dtest = xgb.DMatrix(x_test, label=y_test, feature_names=list(x_test.columns))
# y_pred = model.predict(dtest)
# x_test['pred'] = y_pred
# # x_test['predict'] = predict
# print(x_test.to_csv(os.path.dirname(__file__)+'/xtest_results.csv'))