Example #1
def model_intrv3(Y_train, X_train, Y_test, X_test, Targ):
    global reslts
    global metrs
    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from xgboost.sklearn import XGBRegressor
    model = XGBRegressor(n_estimators=200,
                         learning_rate=0.05,
                         max_depth=4,
                         random_state=0,
                         subsample=0.9,
                         colsample_bytree=1.0,
                         objective='reg:squarederror').fit(X_train, Y_train)  # XGBRegressor has no 'loss' parameter; squared error is the equivalent objective
    model.score(X_test, Y_test)

    pred_Yxgb = model.predict(X_test)
    mse = mean_squared_error(Y_test, pred_Yxgb)
    nRMSE = np.sqrt(mse) / Targ.mean()
    # nRMSE=np.sqrt(mse)/max(Targ)
    Yts_pd = {'Yts': Y_test, 'Ypd': pred_Yxgb}
    Yts_pd = pd.DataFrame(Yts_pd)
    print(mse, nRMSE)
    metrs = {'mse': mse, 'nRMSE': nRMSE}
    reslts = {'Ypred': pred_Yxgb, 'Yts_pd': Yts_pd}
    return {'Yts_pd': Yts_pd, 'mse': mse, 'nRMSE': nRMSE}
Example #2
def run_xgb(output_train, df_X_train, df_Y_train, output_test, df_X_test,
            df_Y_test):
    xgb_estimator = XGBRegressor()
    param_grid = {
        'nthread': [4],  # with hyperthreading, more threads can make xgboost slower
        'objective': ['reg:squarederror'],  # 'reg:linear' is the deprecated alias
        'learning_rate': [.03, 0.05, .07],  # the so-called `eta` value
        'max_depth': [5, 6, 7],
        'min_child_weight': [4],
        'silent': [1],
        'subsample': [0.7],
        'colsample_bytree': [0.7],
        'n_estimators': [30]
    }

    xgb_grid = GridSearchCV(xgb_estimator, param_grid)
    xgb_grid.fit(df_X_train, df_Y_train.cnt)
    opt_pars = xgb_grid.best_params_
    # n_estimators = 30,max_features='log2',bootstrap=True,  max_depth=None
    xgb_opt = XGBRegressor(random_state=1).set_params(**opt_pars)
    xgb_opt.fit(df_X_train, df_Y_train.cnt)
    r2_train = xgb_opt.score(df_X_train, df_Y_train.cnt)
    r2_test = xgb_opt.score(df_X_test, df_Y_test.cnt)
    result = df_proc.compare_results("XGBoost", xgb_opt, output_train,
                                     df_X_train, output_test, df_X_test)
    return {
        "r2": [r2_train, r2_test],
        "R2": [result[1], result[2]],
        "plot": result[0]
    }
Example #3
def R2_estimator(_x_train, _x_test, _y_train, _y_test):
    train_score = []
    test_score = []
    for n in range(1, 100):
        reg = XGBRegressor(n_estimators=n, min_child_weight=7, max_depth=5)  # min_samples_split is a sklearn tree parameter; min_child_weight is the XGBoost analogue
        reg.fit(_x_train, _y_train)
        train_score.append(reg.score(_x_train, _y_train))
        test_score.append(reg.score(_x_test, _y_test))
        print(n, 'estimators done!')
    plt.plot(range(1, 100), train_score, color='skyblue', label='training')
    plt.plot(range(1, 100), test_score, color='red', label='testing')
    plt.legend()
    plt.xlabel('number of estimators')
    plt.ylabel('$R^2$ score')
    plt.show()
Example #4
def xg_boost(_features, _x_train, _x_test, _y_train, _y_test, store=True, load=False, silent=False):
    if load:
        reg_xgb = joblib.load('XGB')
    else:
        reg_xgb = XGBRegressor(n_estimators=200, min_child_weight=7, max_depth=5,
                               n_jobs=-1, silent=True)
        reg_xgb.fit(_x_train, _y_train)
        if store:
            joblib.dump(reg_xgb, 'XGB')  # same filename that joblib.load() expects above

    score = reg_xgb.score(_x_test, _y_test)
    print("\nXG Boosting:")
    print('Training R^2:\t', reg_xgb.score(_x_train, _y_train))
    print('Testing R^2:\t', score)
    if not silent:
        print('\nImportance for each:')
        importance = []
        for i in range(0, len(_features)):
            importance.append([_features[i], reg_xgb.feature_importances_[i]])
        importance.sort(key=lambda x: x[1], reverse=True)
        for each in importance:
            print(each[0] + ':\t', each[1])
    return reg_xgb, score
def XGBoostPredictor(X_train, y_train, X_test, y_test):
    #Fitting XGB regressor
    xboost = XGBRegressor(n_estimators=200)
    xboost.fit(X_train, y_train)
    xgb_score = xboost.score(X_test, y_test)
    #Predict
    xboost_pred = xboost.predict(X_test)
    xgboostRMSE = sqrt(mean_squared_error(y_test, xboost_pred))
    print("Root mean squared error: %.2f" % xgboostRMSE)
    print('R-squared for XGBoost : %.2f' % r2_score(y_test, xboost_pred))
    plt.scatter(y_test, xboost_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('XGBoost Predicted vs Actual')
    plt.show()
    chart_regression(xboost_pred, y_test, 'XGBoost Predictor')
    return xgb_score, xgboostRMSE
Example #6
    def xgb(X_train, X_test, y_train, y_test):
        
        mod = XGBRegressor(learning_rate=0.2, objective='reg:squarederror')
        estimators = np.arange(1, 200, 10)
        scores = []
        estim = []

        for n in estimators:
            mod.set_params(n_estimators=n)
            mod.fit(X_train, y_train)
            scores.append(mod.score(X_test, y_test))
            estim.append(n)

        xdf = pd.DataFrame({'Estimator':estim, 'Score':scores})
        best = int(xdf.loc[xdf['Score'].idxmax(), 'Estimator'])  # n_estimators with the highest test score

        xgbr = XGBRegressor(n_estimators=best, learning_rate=0.2, objective='reg:squarederror')
        xgbr.fit(X_train, y_train)
        
        return xgbr
x_train, x_test, y_train, y_test = train_test_split(datasets.data,
                                                    datasets.target,
                                                    train_size=0.8,
                                                    random_state=104)

#2

# model  = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)  # use_label_encoder applies to classifiers, not regressors

#3

model.fit(x_train, y_train, eval_metric='rmse')  # 'mlogloss' is a multiclass metric; rmse suits regression

#4

acc = model.score(x_test, y_test)

print(model.feature_importances_)
print('acc : ', acc)
'''
def plot_feature_importances_dataset(model):
    n_features = datasets.data.shape[1]
    plt.barh(np.arange(n_features),model.feature_importances_,
            align='center')
    plt.yticks(np.arange(n_features),datasets.feature_names)
    plt.xlabel("Feature Importances")
    plt.ylim(-1, n_features)

plot_feature_importances_dataset(model)
'''
#
# > XGBoost (Extreme Gradient Boosting) belongs to a family of boosting algorithms and uses the gradient boosting (GBM) framework at its core. It is an optimized distributed gradient boosting library. But wait, what is boosting? Well, keep on reading.

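# A toy sketch (not part of the original kernel; names are illustrative only) of the
# boosting idea the quote describes: each new shallow tree is fit to the residuals of
# the current ensemble, and the prediction is the running sum of the scaled tree outputs.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
toy_X = rng.uniform(-3, 3, size=(200, 1))
toy_y = np.sin(toy_X).ravel() + rng.normal(scale=0.1, size=200)

learning_rate = 0.1
ensemble_pred = np.zeros_like(toy_y)
trees = []
for _ in range(100):
    residuals = toy_y - ensemble_pred                      # what the ensemble still gets wrong
    tree = DecisionTreeRegressor(max_depth=2).fit(toy_X, residuals)
    ensemble_pred += learning_rate * tree.predict(toy_X)   # nudge predictions toward the target
    trees.append(tree)

# Predicting for new inputs = summing the scaled outputs of all fitted trees.
new_X = np.array([[0.5], [1.5]])
boosted_prediction = learning_rate * sum(t.predict(new_X) for t in trees)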
# In[ ]:

# Initialize model
from xgboost.sklearn import XGBRegressor
XGB_Regressor = XGBRegressor()

# Fit the model on our data
XGB_Regressor.fit(X_train, y_train)

# In[ ]:

# Score model
XGB_Regressor.score(X_train, y_train)

# <a id="76"></a> <br>
# ## 7-6 LassoCV
# Lasso linear model with iterative fitting along a regularization path.
# The best model is selected by cross-validation.
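# A minimal sketch (not from the original kernel) of what "selected by cross-validation"
# means here: LassoCV fits along a path of alpha values with k-fold CV and exposes the
# winning regularization strength as `alpha_`. X_train / y_train are assumed to be the
# same arrays used above.
from sklearn.linear_model import LassoCV

lasso_cv_demo = LassoCV(cv=5, random_state=0)   # 5-fold CV over an automatically chosen alpha path
lasso_cv_demo.fit(X_train, y_train)
print("alpha chosen by CV:", lasso_cv_demo.alpha_)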

# In[ ]:

lasso = LassoCV()

# In[ ]:

# Fit the model on our data
lasso.fit(X_train, y_train)
}
xg = XGBRegressor(random_state=96, objective='reg:squarederror')
gridsearch = GridSearchCV(xg, param_grid=grid, cv=5)
gridsearch.fit(X_train, y_train)
print(gridsearch.best_score_)
print(gridsearch.best_params_)

xgb = XGBRegressor(random_state=96,
                   objective='reg:squarederror',
                   min_child_weight=6,
                   n_estimators=1000,
                   max_depth=7,
                   colsample_bytree=0.6)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
accuracy = xgb.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, xgb_pred)
mean_squared_error(y_test, xgb_pred)
np.sqrt(mean_squared_error(y_test, xgb_pred))

lgb = LGBMRegressor(objective='regression')
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
accuracy = lgb.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, lgb_pred)
mean_squared_error(y_test, lgb_pred)
np.sqrt(mean_squared_error(y_test, lgb_pred))

from yellowbrick.regressor import ResidualsPlot
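# The ResidualsPlot import above is left unused in this fragment; a minimal usage sketch
# (assuming the X_train/X_test/y_train/y_test split defined earlier in this snippet):
residual_viz = ResidualsPlot(XGBRegressor(objective='reg:squarederror'))
residual_viz.fit(X_train, y_train)      # fit the wrapped estimator on the train split
residual_viz.score(X_test, y_test)      # compute and draw residuals on the test split
residual_viz.show()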
Example #10
import pandas as pd  # needed for pd.read_csv below
import numpy as np
import matplotlib.pyplot as plt
import urllib.request as urllib2
from bs4 import BeautifulSoup
import json

df = pd.read_csv('dataset_cleaned.csv')

df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

y = df['Value']
#df.shape

X = df.drop(['Value'], axis=1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.3,
                                                    random_state=1234)

from xgboost.sklearn import XGBRegressor

xboost = XGBRegressor(n_estimators=200)

xboost.fit(X_train, y_train)

xgb_score = xboost.score(X_test, y_test)

print(xgb_score)
Example #11
def getTunedXGBoostModel(X_train, Y_train, X_test, Y_test):
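    # Tuning strategy used below (summarizing the code, not changing it): each hyperparameter
    # is grid-searched one at a time with 5-fold CV, the winner is frozen before moving on,
    # and at the end learning_rate is traded off against n_estimators, keeping whichever
    # model scores best on the held-out test set.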
    #tune number of estimators (decision trees)
    param_test = {'n_estimators': range(20, 101, 10)}

    gsearch = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,
                                                  random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           # (the iid argument was removed from GridSearchCV in scikit-learn 0.24)
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    n_estimators = gsearch.best_params_['n_estimators']

    #tune min_child_weight
    param_test = {'min_child_weight': range(1, 12, 1)}

    gsearch = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,
                                                  n_estimators=n_estimators,
                                                  random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    min_child_weight = gsearch.best_params_['min_child_weight']

    #tune max_depth
    param_test = {'max_depth': range(3, 12, 1)}

    gsearch = GridSearchCV(estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    max_depth = gsearch.best_params_['max_depth']

    #tune gamma
    param_test = {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]}
    gsearch = GridSearchCV(estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
        random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    gamma = gsearch.best_params_['gamma']

    #tune subsample
    param_test = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]}
    gsearch = GridSearchCV(estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
        gamma=gamma,
        random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    subsample = gsearch.best_params_['subsample']

    #tune colsample_bytree
    param_test = {
        'colsample_bytree': [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    }
    gsearch = GridSearchCV(estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
        gamma=gamma,
        subsample=subsample,
        random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    colsample_bytree = gsearch.best_params_['colsample_bytree']

    param_test = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
    gsearch = GridSearchCV(estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    reg_alpha = gsearch.best_params_['reg_alpha']

    param_test = {'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]}
    gsearch = GridSearchCV(estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        random_state=10),
                           param_grid=param_test,
                           n_jobs=4,
                           cv=5)
    gsearch.fit(X_train, Y_train.values.ravel())
    reg_lambda = gsearch.best_params_['reg_lambda']

    print("\n----------------------------------")
    print("Tuned XGBoost params:")
    print("n_estimators:", n_estimators)
    print("min_child_weight:", min_child_weight)
    print("max_depth:", max_depth)
    print("gamma:", gamma)
    print("subsample:", subsample)
    print("colsample_bytree:", colsample_bytree)
    print("reg_alpha", reg_alpha)
    print("reg_lambda", reg_lambda)
    print("----------------------------------\n")

    #proportionally decrease learning rate and increase # of estimators; pick the best combo
    model = XGBRegressor(learning_rate=0.01,
                         n_estimators=n_estimators * 10,
                         min_child_weight=min_child_weight,
                         max_depth=max_depth,
                         gamma=gamma,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         reg_alpha=reg_alpha,
                         reg_lambda=reg_lambda,
                         random_state=10)
    model.fit(X_train, Y_train.values.ravel())
    score = model.score(X_test, Y_test)

    model2 = XGBRegressor(learning_rate=0.05,
                          n_estimators=n_estimators * 2,
                          min_child_weight=min_child_weight,
                          max_depth=max_depth,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          reg_alpha=reg_alpha,
                          reg_lambda=reg_lambda,
                          random_state=10)
    model2.fit(X_train, Y_train.values.ravel())
    score2 = model2.score(X_test, Y_test)
    if (score2 > score):
        score = score2
        model = model2

    model3 = XGBRegressor(learning_rate=0.1,
                          n_estimators=n_estimators,
                          min_child_weight=min_child_weight,
                          max_depth=max_depth,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          reg_alpha=reg_alpha,
                          reg_lambda=reg_lambda,
                          random_state=10)
    model3.fit(X_train, Y_train.values.ravel())
    score3 = model3.score(X_test, Y_test)
    if (score3 > score):
        score = score3
        model = model3

    model4 = XGBRegressor(learning_rate=0.2,
                          n_estimators=int(n_estimators / 2),
                          min_child_weight=min_child_weight,
                          max_depth=max_depth,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          reg_alpha=reg_alpha,
                          reg_lambda=reg_lambda,
                          random_state=10)
    model4.fit(X_train, Y_train.values.ravel())
    score4 = model4.score(X_test, Y_test)
    if (score4 > score):
        model = model4

    return model
text = "Accuracy: " + str(accuracy_dt) + "\nMean Squared Error: " + str(mse_dt) + "\nMean Absolute Error: " + str(mae_dt)
output_file.write(text)

scatter_plot(target_test, predicted_sales)
plt.savefig('images/predict/dt.png')
plt.close()

###                              XGB REGRESSOR                                     ###
xgbr = XGBRegressor(objective='reg:squarederror', nthread=4, n_estimators=500, max_depth=6, learning_rate=0.5)
xb = xgbr.fit(other_train,target_train)
predicted_sales = xgbr.predict(other_test)

mae_xgbr = round(mean_absolute_error(target_test, predicted_sales),3)
mse_xgbr = round(mean_squared_error(target_test, predicted_sales),3)
accuracy_xgbr = round(xgbr.score(other_test, target_test),3)

#write results to output file
output_file.write("\n------------------------------------\n")
output_file.write("XGB REGRESSOR STATISTICS:\n")
output_file.write("------------------------------------\n")

text = "Accuracy: " + str(accuracy_xgbr) + "\nMean Squared Error: " + str(mse_xgbr) + "\nMean Absolute Error: " + str(mae_xgbr)
output_file.write(text)

scatter_plot(target_test, predicted_sales)
plt.savefig('images/predict/xgbr.png')
plt.close()

output_file.close()
xgb_reg.fit(X_train,y_train)


# In[440]:


y_pred = xgb_reg.predict(X_test)
xgb_mse = mean_squared_error(y_test, y_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse


# In[442]:


xgb_reg.score(X_test,y_test)


# In[444]:


# Offline I used cv=8

from sklearn.model_selection import cross_val_score

scores = cross_val_score(xgb_reg, X_test, y_test,
                         scoring="neg_mean_squared_error", cv=2)
rmse_scores = np.sqrt(-scores)

display_scores(rmse_scores)
Example #14
xgb_best = XGBRegressor(n_estimators=160,
                        max_depth=6,
                        min_child_weight=3,
                        gamma=0,
                        subsample=0.7,
                        colsample_bytree=0.7,
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)
grid = GridSearchCV(estimator=xgb_best, param_grid=param_test, cv=5)
grid.fit(source_X, source_y)
grid.cv_results_  # grid_scores_ was removed from scikit-learn; cv_results_ is its replacement
grid.best_estimator_

xgb_best.fit(train_X, train_y)
xgb_best.score(test_X, test_y)
print(xgb_best.score(test_X, test_y))

xgb_param = xgb_best.get_xgb_params()
xgb.cv(xgb_param,
       xgtrain,
       num_boost_round=5000,
       nfold=15,
       metrics=['rmse'],  # the original's 'auc' and stratified folds only apply to classification
       early_stopping_rounds=50,
       seed=1301)

full_xy = pd.concat([source_X, source_y], axis=1)
target = 'count'
# Refit Xtreme Gradient Boosting
xgb_tree = XGBRegressor(max_depth = 2, 
                        learning_rate = 0.10777777777777778, 
                        n_estimators = 300,
                        min_child_weight = 5,
                        colsample_bytree = 1,
                        gamma = 0,
                        reg_lambda = 1,
                        reg_alpha = 0.30000000000000004,
                        subsample= 2/3, 
                        random_state=0)

xgb_tree.fit(x_train, y_train)

# Training R sq: 95.65%
print('Training Score:', xgb_tree.score(x_train, y_train))
# Training RMSE: .08271
print('Training RMSE:', math.sqrt(mean_squared_error(y_train, xgb_tree.predict(x_train))))

# Test R sq: 91.03%
print('Test Score:', xgb_tree.score(x_test, y_test))
# Test RMSE: .12411
print('Test RMSE:', math.sqrt(mean_squared_error(y_test, xgb_tree.predict(x_test))))



##############################################################################

# Random Forest Test R sq: 89.25%, RMSE: 0.13589
rf_test_pred = rf_tree.predict(house_test_x)
rf_test_pred = np.exp(rf_test_pred)
Example #16
fit_params={"early_stopping_rounds":20, 
            "eval_metric" : 'rmse', 
            "eval_set" : [(X_val, y_val.reshape(-1))],
            'verbose': 1,
           }
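# fit_params above configure early stopping: training halts once 'rmse' on the
# (X_val, y_val) eval_set stops improving for 20 consecutive rounds. This is the
# older xgboost fit() signature; recent versions take early stopping settings in
# the estimator constructor instead.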

model.fit(X_train, y_train.reshape(-1), **fit_params)

"""# Metric

$$
\textrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2}
$$
"""

score = model.score(X_val, y_val)

if log_output:
    y_pred_train = np.exp(model.predict(X_train)*max_log_y)
    y_pred = np.exp(model.predict(X_val)*max_log_y)
    y_pred_test = np.exp(model.predict(X_test)*max_log_y)
else:
    y_pred_train = model.predict(X_train)*y_std + y_mean
    y_pred = model.predict(X_val)*y_std + y_mean
    y_pred_test = model.predict(X_test)*y_std + y_mean

# Train
train_RMSE = np.sqrt((((df_train['Sales'].values - y_pred_train)/df_train['Sales'].values)**2).sum()/len(y_pred_train))

# Validation
val_RMSE = np.sqrt((((df_val['Sales'].values - y_pred)/df_val['Sales'].values)**2).sum()/len(y_pred))
xbr.fit(x_train, y_train)

predict = xbr.predict(x_test)

# from xgboost import plot_importance
# plot_importance(xbr)
# plt.show()

print('************************** XBR Evaluation ********************')
x_test['predict'] = predict
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE", rmse)
print("MAE", mean_absolute_error(y_test, predict))
print("r2_score", r2_score(y_test, predict))
print(xbr.score(x_train, y_train))

# xbr.plot_importance(model)
# plt.rcParams['figure.figsize'] = [5, 5]
# plt.show()

# print("********************* XGB MODEL *******************")
# print("best_ntree_limit",model.best_ntree_limit)
# print("best_score",model.best_score)
# print("best_iteration",model.best_iteration)

# dtest = xgb.DMatrix(x_test, label=y_test, feature_names=list(x_test.columns))
# y_pred = model.predict(dtest)
# x_test['pred'] = y_pred
# # x_test['predict'] = predict
# print(x_test.to_csv(os.path.dirname(__file__)+'/xtest_results.csv'))