def __fit_regressors(self):
        print('Fitting regressors...')
        for method in self.cluster_methods:
            clusters = self.__get_cluster_labels(method)
            model = self.models[method]
            for label in clusters:
                for regressor in self.regressors:
                    # Training separate knn for each cluster
                    if regressor == 'knn':
                        model[label]['knn'] = {}
                        model[label]['knn']['model'] = KNeighborsRegressor(
                            n_neighbors=5, weights='distance')
                        model[label]['knn']['model'].fit(
                            model[label]['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'lr':
                        model[label]['lr'] = {}
                        model[label]['lr']['model'] = LinearRegression(
                            normalize=True)
                        model[label]['lr']['model'].fit(
                            model[label]['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'adaboost':
                        model[label]['adaboost'] = {}
                        model[label]['adaboost']['model'] = AdaBoostRegressor(
                            n_estimators=100,
                            learning_rate=0.2,
                            loss='exponential')
                        model[label]['adaboost']['model'].fit(
                            model[label]['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'gradientboosting':
                        model[label]['gradientboosting'] = {}
                        model[label]['gradientboosting'][
                            'model'] = GradientBoostingRegressor(
                                n_estimators=400,
                                learning_rate=0.1,
                                loss='ls',
                                max_depth=5,
                                min_samples_split=2)
                        model[label]['gradientboosting']['model'].fit(
                            model[label]['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'randomforest':
                        model[label]['randomforest'] = {}
                        model[label]['randomforest'][
                            'model'] = RandomForestRegressor(n_estimators=400)
                        model[label]['randomforest']['model'].fit(
                            model[label]['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'decisiontree':
                        model[label]['decisiontree'] = {}
                        model[label]['decisiontree'][
                            'model'] = DecisionTreeRegressor()
                        model[label]['decisiontree']['model'].fit(
                            model[label]['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'xgboost':
                        model[label]['xgboost'] = {}
                        model[label]['xgboost'][
                            'model'] = xgboost.XGBRegressor(n_estimators=900,
                                                            learning_rate=0.05,
                                                            max_depth=5)
                        model[label]['xgboost']['model'].fit(
                            model[label]['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'pr2':
                        # Need to create and fit poly features in same function
                        model[label]['pr2']['model'] = LinearRegression(
                            normalize=True)
                        model[label]['pr2']['model'].fit(
                            model[label]['pr2']['X_train'],
                            model[label]['Y_train']['price'])

                    elif regressor == 'pr3':
                        # Need to create and fit poly features in same function
                        model[label]['pr3']['model'] = LinearRegression(
                            normalize=True)
                        model[label]['pr3']['model'].fit(
                            model[label]['pr3']['X_train'],
                            model[label]['Y_train']['price'])
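The branches above differ only in the estimator they construct; below is a hedged sketch of the same per-cluster fitting driven by a dict of estimator factories. It assumes the surrounding class provides self.cluster_methods, self.regressors, self.models and the per-cluster 'X_train'/'Y_train' entries; the 'pr2'/'pr3' cases still need their own polynomial features, so they are skipped here.

def __fit_regressors_compact(self):
    # Hypothetical refactor sketch, not the original method.
    factories = {
        'knn': lambda: KNeighborsRegressor(n_neighbors=5, weights='distance'),
        'lr': lambda: LinearRegression(),
        'adaboost': lambda: AdaBoostRegressor(n_estimators=100, learning_rate=0.2,
                                              loss='exponential'),
        'gradientboosting': lambda: GradientBoostingRegressor(n_estimators=400,
                                                              learning_rate=0.1,
                                                              max_depth=5,
                                                              min_samples_split=2),
        'randomforest': lambda: RandomForestRegressor(n_estimators=400),
        'decisiontree': lambda: DecisionTreeRegressor(),
        'xgboost': lambda: xgboost.XGBRegressor(n_estimators=900, learning_rate=0.05,
                                                max_depth=5),
    }
    for method in self.cluster_methods:
        model = self.models[method]
        for label in self.__get_cluster_labels(method):
            for name in self.regressors:
                if name not in factories:  # 'pr2'/'pr3' handled separately
                    continue
                estimator = factories[name]()
                estimator.fit(model[label]['X_train'],
                              model[label]['Y_train']['price'])
                model[label][name] = {'model': estimator}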
Example #2
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler


def xgboost(booster='gblinear', use_log=True, scale=True, use_dum=False):
    '''
    Runs an XGBoost grid search.
    --------------------------------
    parameters:
    - booster: ['gblinear', 'gbtree', 'dart'] is the booster used by XGBoost.
    - use_log: if True, the target will be log(SalePrice) of the houses.
    - scale: if True, features will be standardized.
    - use_dum: if True, categorical features will be dummified; otherwise each level is replaced by the mean target for that level (target encoding).
    '''
    # preparing the data
    if use_dum:
        data = pd.read_csv('../derivedData/train_cleaned.csv', index_col='Id')
    else:
        data = pd.read_csv('../derivedData/train_NotDum.csv', index_col='Id')

    data['logSalePrice'] = np.log(data['SalePrice'])

    if not use_dum:
        cols_to_enc = data.columns[data.dtypes == 'object']
        for col in cols_to_enc:
            if use_log:
                gp = data.groupby(col)['logSalePrice'].mean()
            else:
                gp = data.groupby(col)['SalePrice'].mean()
            data[col] = data[col].apply(lambda x: gp[x])

    X = data.drop(['SalePrice', 'logSalePrice'], axis=1)
    if not use_log:
        y = data['SalePrice']
    else:
        y = data['logSalePrice']

    # the split must run regardless of use_log
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if scale:
        ss = StandardScaler()
        ss.fit(X_train)
        X_train = pd.DataFrame(ss.transform(X_train))
        X_test = pd.DataFrame(ss.transform(X_test))

    if booster == 'gblinear':
        xgb_param = {
            'alpha': [0],
            'lambda': np.linspace(0, .2, 200)
        }
    elif booster == 'gbtree':
        xgb_param = {
            'max_depth': [2, 3],
            'min_child_weight': np.linspace(5, 15, 20),
            'lambda': np.linspace(1, 10, 20),
            'alpha': [0]
        }
    elif booster == 'dart':
        xgb_param = {
            'max_depth': [2],
            'min_child_weight': np.linspace(10, 15, 6),
            'lambda': np.linspace(0, 2, 4),
            'alpha': [0],
            'sample_type': ['uniform'],
            'normalize_type': ['tree'],
            'rate_drop': np.linspace(.5, 1, 8),
            'skip_drop': np.linspace(.5, 1, 10)
        }

    xgboost = xgb.XGBRegressor(booster=booster)

    grid_search_xgb = GridSearchCV(xgboost, xgb_param, cv=4)

    grid_search_xgb.fit(X_train, y_train)

    return grid_search_xgb
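A minimal usage sketch, assuming the CSV paths read inside the function exist; since the helper returns the fitted GridSearchCV, the best booster and its parameters can be read back directly.

# Hypothetical usage of the helper above.
search = xgboost(booster='gbtree', use_log=True, scale=True, use_dum=False)
print(search.best_params_)           # best hyperparameters found on the grid
best_model = search.best_estimator_  # refit XGBRegressor, ready for predictions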
Example #3
# In[42]:

y_pred = log_model.predict(val_features)
print('Accuracy of logistic regression classifier on the validation set: {:.2f}'.format(
    log_model.score(val_features, val_labels)))
print(classification_report(val_labels, y_pred))
#Achieved a score of 0.756 on Kaggle, let's try an XGBoost model now!

# In[43]:

#Building XGBoost model

xg_reg = xgb.XGBRegressor(objective='binary:logistic',
                          colsample_bytree=0.2,
                          learning_rate=0.1,
                          max_depth=7,
                          alpha=10,
                          n_estimators=20,
                          scale_pos_weight=20)
xg_reg.fit(train_features, train_labels)

# In[44]:

preds = xg_reg.predict(val_features)

# In[45]:

preds = np.where(preds > 0.6, 1, 0)
print(classification_report(val_labels, preds))

# In[46]:
X1_train, X1_test, y_train, y_test = train_test_split(X1, y,
                                                      test_size=0.3,
                                                      random_state=0)
svc_model_poly = svm.SVC(kernel='poly', degree=5)
svc_model_poly.fit(X1_train, y_train)
predictions_poly = svc_model_poly.predict(X1_test)
print(
    "nbr of features: ", 2200, " PCA accuracy with POLY SVM " +
    str(100 * accuracy_score(y_test, predictions_poly)) + '%')

#7.2 Random Forest

rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(X1_train, y_train)
y_pred = rf.predict(X1_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred.round()) * 100)
print(classification_report(y_test, y_pred.round()))
#7.3 XGBOOST

data_dmatrix = xg.DMatrix(data=X1, label=y)
xg_reg = xg.XGBRegressor(objective='binary:logistic',
                         colsample_bytree=0.3,
                         learning_rate=0.1,
                         max_depth=20,
                         alpha=150,
                         n_estimators=1000)
xg_reg.fit(X1_train, y_train)
preds = xg_reg.predict(X1_test)
print("Accuracy:", metrics.accuracy_score(y_test, preds.round()) * 100)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
                                                    y_wind,
                                                    test_size=0.3,
                                                    random_state=42)

#data transformation (scaling)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#creation of regressor model

xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                             colsample_bytree=0.3,
                             learning_rate=0.1,
                             max_depth=5,
                             alpha=10,
                             n_estimators=10)

#fitting model
xgb_model.fit(X_train, y_train)  # fit model

#predicting
y_predicted_w = xgb_model.predict(X_test)

#accuracy determination of the xgboost regression
rmse = np.sqrt(mean_squared_error(y_test, y_predicted_w))

pickle.dump(xgb_model, open('model_w.pkl', 'wb'))
x_train = df_churn[var_select].apply(pd.to_numeric, errors='coerce')
y_train = df_churn['churn_flag']
x_train, x_eval_xgb, y_train, y_eval_xgb = train_test_split(x_train,
                                                            y_train,
                                                            test_size=0.2,
                                                            random_state=42)
# Split training data again into training and test set
x_train, x_test, y_train, y_test = train_test_split(x_train,
                                                    y_train,
                                                    test_size=0.2,
                                                    random_state=42)

# Specify the kind of model to develop
xgb_reg = xgb.XGBRegressor(objective='binary:logistic',
                           colsample_bytree=0.3,
                           learning_rate=0.1,
                           max_depth=5,
                           n_estimators=350)

xgb_reg.fit(x_train, y_train)
preds = xgb_reg.predict(x_test)

# Check accuracy of predictions
accuracy_xgb = confusion_matrix(y_test.values, np.round(preds))
print(sum(np.diagonal(accuracy_xgb)) / sum(sum(accuracy_xgb)))
print(accuracy_xgb)

# Obtain the most important variables
scores_xgb = xgb_reg.get_booster().get_score(importance_type='gain')
most_imp_xgb = pd.DataFrame({
    'feature': list(scores_xgb.keys()),
Example #7
 def xgb_reg(self, para):
     reg = xgb.XGBRegressor(**para['reg_params'])
     return self.train_reg(reg, para)
Example #8
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

# Import XGBoost

import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y)
dtest = xgb.DMatrix(X_test)

params = {"max_depth": 2, "eta": 0.1}
model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100)

model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()

model_xgb = xgb.XGBRegressor(
    n_estimators=360, max_depth=2,
    learning_rate=0.1)  #the params were tuned using xgb.cv
model_xgb.fit(X_train, y)

xgb_preds = np.expm1(model_xgb.predict(X_test))
lasso_preds = np.expm1(model_lasso.predict(X_test))

predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")

preds = 0.7 * lasso_preds + 0.3 * xgb_preds

solution = pd.DataFrame({"id": test.Id, "SalePrice": preds})
solution.to_csv("ridge_sol.csv", index=False)
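The xgb.cv call above already applies early stopping, so the length of the returned DataFrame gives the number of boosting rounds actually kept; a hedged sketch of reading that back instead of hard-coding n_estimators=360:

# Assumption: 'model' is the cross-validation DataFrame returned by xgb.cv above.
best_rounds = model.shape[0]
print(best_rounds, model["test-rmse-mean"].min())
model_xgb = xgb.XGBRegressor(n_estimators=best_rounds, max_depth=2, learning_rate=0.1)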
Example #9
 def fit(self, train_data):
     # Define the base models
     self.base_models_ = [list() for x in self.base_models]
     # Define the meta-model
     self.meta_model_ = clone(self.meta_model)
     
     
     shape_ = [train_data[train_data[self.fe_] == d].shape[0] for d in self._slip]
     y_true = np.array([])
     for d in self._slip:
         y_true = np.hstack((y_true, train_data.loc[train_data.day == d, self.target_].values))
     
     index = []
     for k, sh in enumerate(shape_):
         if k == 0:
             index.append(list(range(sh)))
         else:
             index.append(list(range(index[-1][-1], index[-1][-1]+shape_[k])))
     
     # Set the size of the feature matrix used by the meta-model
     oof_pred = np.zeros((sum(shape_), 
                          len(self.base_models)))
     # Train the base models
     for i, model_name in enumerate(self.base_models):
         for j, date in enumerate(self._slip):
             # Build the training and validation sets
             train = train_data[train_data[self.fe_]<date]
             valid = train_data[train_data[self.fe_]==date]
             
             X_train = train[self.features].values
             X_eval = valid[self.features].values
             y_train = train[self.target_].values
             y_eval = valid[self.target_].values
             
             if model_name =='lgb':
                 lgb_train = lgb.Dataset(X_train, y_train)
                 lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)
                 print("开始训练{}_{}".format(i, j))
                 model = lgb.train(lgb_params, 
                                      lgb_train,
                                      num_boost_round=10000,
                                      valid_sets=[lgb_train, lgb_eval],
                                      valid_names=['train', 'valid'],
                                      early_stopping_rounds=200,
                                      verbose_eval=1000,)
                 y_pred = model.predict(X_eval)
                 print("结束本次训练!")
             if model_name == 'cat':
                 cat_train = Pool(X_train, y_train)
                 cat_eval = Pool(X_eval, y_eval)
                 print("开始训练{}_{}".format(i, j))
                 model = catboost.train(
                             pool = cat_train, params=cat_params,
                             eval_set=cat_eval, num_boost_round=50000,
                             verbose_eval=5000, early_stopping_rounds=200,)
                 y_pred = model.predict(X_eval)
                 print("结束本次训练!")
                 
             if model_name == 'xgb':
                 
                 print("开始训练{}_{}".format(i, j))
                 model = xgb.XGBRegressor(**xgb_params)
                 #print(X_train.shape)
                 model.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], early_stopping_rounds=400, verbose=1000)
                 y_pred = model.predict(X_eval)
                 print("结束本次训练!")
                 
             self.base_models_[i].append(model)
             oof_pred[index[j], i] = y_pred
     
     self.meta_model_.fit(oof_pred, y_true)
     return self
Example #10
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

import xgboost as xgb
regressor = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=7200,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                 silent=1)


regressor.fit(x_train, y_train)
y_pred_xgb = regressor.predict(x_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_xgb = confusion_matrix(y_test, y_pred_xgb.round())  # round regression output to class labels
#final_train_matrix = train.drop(['user_id', 'day_of_week', 'record_date'], axis=1).as_matrix()
final_train_matrix = np.row_stack(
    (train1_matrix, train2_matrix, train3_matrix, train4_matrix, train5_matrix,
     train6_matrix, train7_matrix, train8_matrix, train9_matrix,
     train10_matrix))
train_X = final_train_matrix[:, :-1]
train_Y = final_train_matrix[:, -1]

print "make test datset"
final_test_matrix = final_test.drop(['user_id', 'day_of_week', 'record_date'],
                                    axis=1).as_matrix()
test_matrix_X = final_test_matrix[:, :-1]
test_matrix_Y = final_test_matrix[:, -1]

print("hyper-parameter optimization...................")
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [50, 100, 150, 200],
    'max_delta_step': [1],
    'objective': [
        'reg:linear',
        'reg:gamma',
        'reg:tweedie',
    ]
}
# , 'colsample_bytree':[1], 'colsample_bylevel':[1], 'reg_alpha':[0], 'reg_lambda':[1], 'scale_pos_weight':[1], 'base_score':[0.5], 'seed':[0], 'missing':[None],'nthread':[-1], 'gamma':[0], 'min_child_weight':[1], , 'subsample':[0.5,0.8,1]
gridsearchcvRegression = GridSearchCV(xgb_model,
                                      params,
                                      iid=True,
y2 = data.iloc[:, 9:10]

X1_train, X1_test, y1_train, y1_test = train_test_split(X,
                                                        y1,
                                                        test_size=0.3,
                                                        train_size=0.7,
                                                        random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X,
                                                        y2,
                                                        test_size=0.3,
                                                        train_size=0.7,
                                                        random_state=0)

#y1 output

regr_1 = xgboost.XGBRegressor(n_estimators=100, max_depth=5)
regr_1.fit(X1_train, y1_train.values.ravel())

#predict

y1_test_pred = regr_1.predict(X1_test)
y1_train_pred = regr_1.predict(X1_train)

error_y1_test = (
    (abs(y1_test['Y1'].values - y1_test_pred)) * 100) / y1_test['Y1'].values
error_y1_train = (
    (abs(y1_train['Y1'].values - y1_train_pred)) * 100) / y1_train['Y1'].values

error_y1_test_mean = np.mean(error_y1_test)
error_y1_train_mean = np.mean(error_y1_train)
Example #13
def build_xgb_regr(features, labels):
    return xgb.XGBRegressor().fit(features, labels)
Example #14
                print("Minimum validation MSE:", min_val_error)  # 0.002712853325235463 is the same as the model above
                break  # early stopping

    # XGBoost
    # not shown in the book

    if False:  # cannot run this code because of a DLL file problem.
        try:
            import xgboost
            print('importing XGBoost')
        except ImportError as ex:
            print("Error: the xgboost library is not installed.")
            xgboost = None

        if xgboost is not None:
            xgb_reg = xgboost.XGBRegressor(random_state=42)
            xgb_reg.fit(X_train, y_train)
            y_pred = xgb_reg.predict(X_val)
            val_error = mean_squared_error(y_val, y_pred)
            print("Validation MSE:", val_error)

        if xgboost is not None:  # not shown in the book
            xgb_reg.fit(X_train, y_train,
                        eval_set=[(X_val, y_val)], early_stopping_rounds=2)
            y_pred = xgb_reg.predict(X_val)
            val_error = mean_squared_error(y_val, y_pred)
            print("Validation MSE:", val_error)

    # Stacking

    # data set
X_train,X_eval,Y_train,Y_eval=train_test_split(X,Y,test_size=.1,
                                               random_state=5)
X_test=df_test.loc[:,feature_names].values

X_train=pd.DataFrame(X_train,columns=feature_names)
X_eval=pd.DataFrame(X_eval,columns=feature_names)
X_test=pd.DataFrame(X_test,columns=feature_names)
#===feature extraction===

date=str(pd.to_datetime(ctime()).date())

    
bst=xgb.XGBRegressor(max_depth=10,booster='gbtree', 
                      learning_rate=.1,n_estimators=5000,
                      subsample=.9,  
                      colsample_bytree=.9, reg_lambda=10, 
                      silent=False)    

if modelFitFlg==1:
    print(ctime()+'...training model...')
    bst.fit(X=X_train,y=Y_train,
            eval_set=[(X_eval,Y_eval)],
            eval_metric=['rmse'],early_stopping_rounds=10)
    joblib.dump(bst,join(fittedModelDir,
                         'model6_nonCV_{}{}'.format(date,'.pkl')))



#===make prediction for test set==
fittedMdlPath='/home/arash/MEGA/MEGAsync/Machine Learning/'+\
    'n_estimators': [1000],
    'early_stopping_rounds': [10],
    'booster': ['gbtree'],
    'verbosity': [1],
    'subsample': list(np.linspace(0.25, 1, 4)),
    #'learning_rate':[0.0001,0.001,0.01,0.1],
    #'eval_set': [[(X_test,y_test)]],
    'gamma': [0, 0.0001, 0.001, 0.01, 0.1],
    'eval_metric': ['rmse'],
    'verbose': [True],
    'silent': [False],
    'min_child_weight': list(np.arange(1, X_test.shape[1], 5)),
    'n_estimators': [10, 100, 200, 300, 1000]
}]

xg_reg = xgb.XGBRegressor()

# Applying Grid Search to find the best model and the best parameters
from hypopt import GridSearch
from sklearn.model_selection import GridSearchCV
grid_search = GridSearch(model=xg_reg, param_grid=parameters)

grid_search = grid_search.fit(X_train,
                              y_train,
                              X_val=X_test,
                              y_val=y_test,
                              scoring='neg_mean_squared_error')
best_parameters = grid_search.get_params()

#best_mse = (-grid_search.best_score_)**(1/2)
#best_parameters = grid_search.best_params_
Example #17
def xgb_reg(n_estimators=100,
            max_depth=6,
            learning_rate=0.05,
            k=5,
            train_data_path='../data/training_data.csv',
            save_model=False,
            tracking_uri="http://0.0.0.0:5000"):

    # Log the parameters with mlflow
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train,
     y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                               ('regression',
                                xgb.XGBRegressor(objective="reg:squarederror",
                                                 seed=RANDOM_SEED))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__n_estimators'] = np.arange(
        n_estimators[0], n_estimators[1], n_estimators[2])
    hyperparams['regression__max_depth'] = np.arange(max_depth[0],
                                                     max_depth[1],
                                                     max_depth[2])
    hyperparams['regression__learning_rate'] = learning_rate

    print("Training started...\n")

    # Create a grid search over the XGBoost pipeline and fit the grid parameters using all processors
    modelCV = GridSearchCV(estimator=pipeline,
                           param_grid=hyperparams,
                           cv=k,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # NEGATIVE MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    best_params = modelCV.best_params_
    print(f"\nBest parameter set found for the training set:\n{best_params}")

    # Store the index of the best combination
    best_index = param_list.index(best_params)

    # Get the best values for hyperparams
    best_n_estimators = best_params['regression__n_estimators']
    best_max_depth = best_params['regression__max_depth']
    best_learning_rate = best_params['regression__learning_rate']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # The criterion under study is n_estimators
    criteria = 'n_estimators'
    mlflow.set_tag("criteria", criteria)
    param_values = hyperparams['regression__n_estimators']

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    xgb.XGBRegressor(objective="reg:squarederror",
                                     n_estimators=param_value,
                                     max_depth=best_max_depth,
                                     learning_rate=best_learning_rate))])

        param = {
            'regression__n_estimators': param_value,
            'regression__max_depth': best_max_depth,
            'regression__learning_rate': best_learning_rate
        }

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae,
         r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print(
            "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..."
        )
        final_model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    xgb.XGBRegressor(objective="reg:squarederror",
                                     n_estimators=best_n_estimators,
                                     max_depth=best_max_depth,
                                     learning_rate=best_learning_rate))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Plot importances and final tree
        ax = xgb.plot_importance(final_model.named_steps['regression'])
        fig = ax.figure
        fig.savefig('./img/importances.png', bbox_inches='tight')
        plt.close(fig)

        ax = xgb.plot_tree(final_model.named_steps['regression'], rankdir='LR')
        fig = ax.figure
        fig.set_size_inches(30, 15)
        fig.savefig('./img/tree.png', dpi=400, bbox_inches='tight')
        plt.close(fig)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
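Inside the function above, n_estimators and max_depth are indexed like (start, stop, step) ranges and learning_rate is iterated as a list, so the scalar defaults in the signature cannot be used as-is; a hedged usage sketch with assumed ranges:

# Hypothetical call; the ranges and the tracking server are assumptions.
xgb_reg(n_estimators=(100, 501, 100),
        max_depth=(3, 8, 2),
        learning_rate=[0.01, 0.05, 0.1],
        k=5,
        train_data_path='../data/training_data.csv',
        save_model=False)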
def return_weights_from_xgboost(
    geodataframe,
    raster_path,
    pop_string,
    codes=[21, 22, 23, 24],
    n_pixels_option_values=256,
    tuned_xgb=False,
    gbm_hyperparam_grid={
        "learning_rate": [0.001, 0.01, 0.1],
        "n_estimators": [200],
        "subsample": [0.3, 0.5],
        "max_depth": [4, 5, 6],
        "num_boosting_rounds": [10, 20],
    },
    force_crs_match=True,
    na_value=255,
    ReLU=True,
):
    """Function that returns the weights of each land type according to NLCD
    types/codes given by Extreme Gradient Boost model (XGBoost)

    Parameters
    ----------

    geodataframe           : a geopandas geoDataFrame used to build regression

    raster_path            : the path to the associated raster image.

    pop_string             : the name of the variable in the geodataframe on which the regression will be conducted

    codes                  : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
                             The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
                             The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).

    n_pixels_option_values : number of possible pixel values in the raster. Default is 256.

    tuned_xgb              : bool. Default is False.
                             If True, the XGBoost model will be tuned via a grid search over the gbm_hyperparam_grid dictionary, picking the best model in terms of mean squared error with a pre-defined number of cross-validation folds.
                             Otherwise, the XGBoost model is fitted with default values of xgboost.train function from xgboost Python library.

    gbm_hyperparam_grid    : a dictionary that represents the grid for the XGBoost grid search.

    force_crs_match        : bool. Default is True.
                             Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                             It is recommended to leave this argument set to True.

    na_value               : int. Default is 255.
                             The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.

    ReLU                   : bool. Default is True.
                             Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.

    Notes
    -----
    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
    3) The returned weights represent the average of the Shapley values for each feature.
    """
    try:
        import xgboost as xgb
        import shap
    except ImportError as e:
        raise ImportError("xgboost and shap are required to perform this.")

    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError("codes should not assume the na_value value.")

    profiled_df = fast_append_profile_in_gdf(
        geodataframe[["geometry", pop_string]], raster_path, force_crs_match
    )  # Use only two columns to build the weights (this avoids error, if the original dataset has already types appended on it).

    # If the list is unsorted, the codes will be sorted to guarantee that the position of the weights will match
    codes.sort()

    str_codes = [str(i) for i in codes]
    feature_names = ["Type_" + s for s in str_codes]

    y = profiled_df[pop_string]
    X = profiled_df[feature_names]

    if tuned_xgb == False:

        # Create the DMatrix
        xgb_dmatrix = xgb.DMatrix(X, y)

        # Create the parameter dictionary
        params = {"objective": "reg:linear"}

        # Train the model
        xg_reg = xgb.train(params=params, dtrain=xgb_dmatrix)

    if tuned_xgb == True:

        try:
            from sklearn.model_selection import GridSearchCV
        except ImportError as e:
            raise ImportError("sklearn is required to perform this.")

        gbm = xgb.XGBRegressor()
        grid_mse = GridSearchCV(
            estimator=gbm,
            param_grid=gbm_hyperparam_grid,
            scoring="neg_mean_squared_error",
            cv=4,  # 4-fold crossvalidation
            verbose=3,  # Prints the grid search profile
            n_jobs=-1,
        )  # Process the GridSearch in parallel all cores availables

        # Fit the grid to the data
        grid_mse.fit(X, y)

        best_params = grid_mse.best_params_
        best_params["objective"] = "reg:linear"

        # Create the DMatrix
        xgb_dmatrix = xgb.DMatrix(X, y)

        # Train the model from the best parameters of the grid search
        xg_reg = xgb.train(params=best_params, dtrain=xgb_dmatrix)

    # Build explainer and fit Shapley's values (https://github.com/slundberg/shap)
    explainer = shap.TreeExplainer(xg_reg, feature_dependence="independent")
    shap_values = explainer.shap_values(X)
    weights_from_xgb = shap_values.mean(
        axis=0)  # This is already sorted by pixel Type

    weights = np.zeros(n_pixels_option_values)
    weights[codes] = list(weights_from_xgb)  # Convert to list a dict_values

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    return weights
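A short usage sketch, assuming a GeoDataFrame of tracts with a population column and a matching NLCD raster on disk (both names below are hypothetical; only the signature above comes from the source):

# Hypothetical inputs for the function above.
weights = return_weights_from_xgboost(
    tracts_gdf,                  # geopandas GeoDataFrame with geometry + population
    "nlcd_2016.tif",             # path to the NLCD raster
    pop_string="population",
    tuned_xgb=True,              # use the GridSearchCV branch shown above
)
print(weights[[21, 22, 23, 24]])  # one averaged SHAP weight per developed land-cover code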
Example #19
from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data, train_test_split_by_size

if __name__ == "__main__":
    warnings.simplefilter(action='ignore', category=FutureWarning)
    X_data, y_data = load_data(ROOT_PATH + CRIME,
                               skip_first_column=5,
                               y_column_index=-1,
                               needImpute=True)
    X_train, X_test, y_train, y_test = train_test_split_by_size(
        X_data, y_data, train_size=1495, random_state=2333)
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_test = xgb.DMatrix(X_test, label=y_test)

    xgb_clf = xgb.XGBRegressor(n_estimators=100,
                               max_depth=6,
                               objective="reg:squarederror",
                               silent=False)

    parameters = {'reg_alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.1]}
    grid_search = GridSearchCV(estimator=xgb_clf,
                               param_grid=parameters,
                               cv=10,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    print("Best parameters alpha :", grid_search.best_params_)

    xgb.plot_tree(grid_search.best_estimator_, num_trees=1)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(24, 6)
    matplotlib.pyplot.show()
Example #20
def regression_fit_and_predict( X_train, y_train, X_val, y_val, X_test, num_prev_frames = CONFIG.NUM_PREV_FRAMES ):
  """
  fit regression models
  """
  
  if CONFIG.REGRESSION_MODEL == 'LR':
    
    model = linear_model.LinearRegression()
    model.fit(X_train,y_train)
    
  elif CONFIG.REGRESSION_MODEL == 'LR_L1':
    
    alphas = get_alphas( 50, min_pow = -4.2, max_pow = 0.8 )
    
    min_mse = 1000
    best_model = []
  
    for i in range(len(alphas)):
      model = linear_model.Lasso(alpha=alphas[i], max_iter=1e5, tol=1e-3)
      model.fit(X_train,y_train)
      
      y_val_pred = np.clip( model.predict(X_val), 0, 1 )
      tmp = 1.0/len(y_val) * np.sum( (y_val-y_val_pred)**2 )
      
      if tmp < min_mse:
        min_mse = tmp
        best_model = model
    model = best_model
    
  elif CONFIG.REGRESSION_MODEL == 'LR_L2':
    
    alphas = get_alphas( 50, min_pow = -4.2, max_pow = 0.8 )
    
    min_mse = 1000
    best_model = []
  
    for i in range(len(alphas)):
      model = linear_model.Ridge(alpha=alphas[i], max_iter=1000, tol=1e-3)
      model.fit(X_train,y_train)
      
      y_val_pred = np.clip( model.predict(X_val), 0, 1 )
      tmp = 1.0/len(y_val) * np.sum( (y_val-y_val_pred)**2 )
      
      if tmp < min_mse:
        min_mse = tmp
        best_model = model
    model = best_model
    
  elif CONFIG.REGRESSION_MODEL == 'GB':
    
    if False:
      
      choosen_metrics = 25
      print('number of metrics for gradient boosting:', choosen_metrics)
      coefs_model = np.zeros((X_train.shape[1]))
      
      num_metrics = int(X_train.shape[1]/(num_prev_frames+1))
      coef_metrics = np.zeros((num_metrics))
      
      for k in range(5):
        
        model = xgb.XGBRegressor(max_depth=5, colsample_bytree=0.5, n_estimators=100, reg_alpha=0.4, reg_lambda=0.4)
        model.fit( X_train, y_train )
        
        importance_metrics = abs(np.array(model.feature_importances_))
        for l in range(num_prev_frames+1):
          coef_metrics += importance_metrics[num_metrics*l:num_metrics*(l+1)]
          
      index_coefs = np.argsort(coef_metrics)[0:choosen_metrics]
      
      X_train_new = np.zeros((X_train.shape[0], choosen_metrics*(num_prev_frames+1)))
      X_val_new = np.zeros((X_val.shape[0], choosen_metrics*(num_prev_frames+1)))
      X_test_new = np.zeros((X_test.shape[0], choosen_metrics*(num_prev_frames+1)))
      
      counter = 0
      for k in range(num_metrics):
        if k in index_coefs:
          for l in range(num_prev_frames+1):
            X_train_new[:,counter] = X_train[:,num_metrics*l+k]
            X_val_new[:,counter] = X_val[:,num_metrics*l+k]
            X_test_new[:,counter] = X_test[:,num_metrics*l+k]
            counter += 1
            
      model = xgb.XGBRegressor(max_depth=5, colsample_bytree=0.5, n_estimators=100, reg_alpha=0.4, reg_lambda=0.4)
      model.fit( X_train_new, y_train )
      
      importance_metrics = np.array(model.feature_importances_)
      counter = 0
      for k in range(num_metrics):
        if k in index_coefs:
          for l in range(num_prev_frames+1):
            coefs_model[num_metrics*l+k] = importance_metrics[counter]
            counter += 1
          
      X_train = X_train_new.copy()
      X_val = X_val_new.copy()
      X_test = X_test_new.copy()
      
    else:
      model = xgb.XGBRegressor(max_depth=5, colsample_bytree=0.5, n_estimators=100, reg_alpha=0.4, reg_lambda=0.4)
      model.fit( X_train, y_train )
  
  elif CONFIG.REGRESSION_MODEL == 'NN_L1':
    
    num_metrics = int(X_train.shape[1]/(num_prev_frames+1))
    # (components, num_prev_frames+1, number of metrics)
    X_train = X_train.reshape(X_train.shape[0], num_prev_frames+1, num_metrics )
    X_val = X_val.reshape(X_val.shape[0], num_prev_frames+1, num_metrics )
    X_test = X_test.reshape(X_test.shape[0], num_prev_frames+1, num_metrics )
    
    print('X_train and X_val shape', X_train.shape, X_val.shape)
    
    input_shape  = (X_train.shape[1], X_train.shape[2])
    inp = Input(input_shape)
    weight=1e-4
    dropout=0.25
    
    y = inp
    y = Conv1D(filters=16, kernel_size=(5,), padding='same', strides=1,
              kernel_regularizer=regularizers.l1(weight), activation='relu')(inp)
    y = Flatten()(y)
    y = Dense( 50, kernel_regularizer=regularizers.l1(weight), activation='relu' )(y)
    y = Dense( 1 )(y)
    
    model = Model(inputs=inp,outputs=y)
    model.summary()

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[])
    model.fit(X_train, y_train, epochs=200, validation_data=(X_val,y_val), batch_size=128) 
    
  elif CONFIG.REGRESSION_MODEL == 'NN_L2':
    
    num_metrics = int(X_train.shape[1]/(num_prev_frames+1))
    # (components, num_prev_frames+1, number of metrics)
    X_train = X_train.reshape(X_train.shape[0], num_prev_frames+1, num_metrics )
    X_val = X_val.reshape(X_val.shape[0], num_prev_frames+1, num_metrics )
    X_test = X_test.reshape(X_test.shape[0], num_prev_frames+1, num_metrics )
    
    print('X_train and X_val shape', X_train.shape, X_val.shape)
    
    input_shape  = (X_train.shape[1], X_train.shape[2])
    inp = Input(input_shape)
    wdecay=1e-3
    dropout=0.25
    
    y = inp
    y = Conv1D(filters=16, kernel_size=(5,), padding='same', strides=1,
              kernel_regularizer=regularizers.l2(wdecay), activation='relu')(inp)
    y = Flatten()(y)
    y = Dense( 50, kernel_regularizer=regularizers.l2(wdecay), activation='relu' )(y)
    y = Dense( 1 )(y)
    
    model = Model(inputs=inp,outputs=y)
    model.summary()

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[])
    model.fit(X_train, y_train, epochs=200, validation_data=(X_val,y_val), batch_size=128) 
    print(model.summary())
     
  y_train_pred = np.clip( model.predict(X_train), 0, 1 )
  y_val_pred = np.clip( model.predict(X_val), 0, 1 )
  y_test_R_pred = np.clip( model.predict(X_test), 0, 1 )

  return y_train_pred, y_val_pred, y_test_R_pred, model
     tree.DecisionTreeClassifier(**TREE_PARAMS),
     utils.train_model_classification,
 ),
 (
     "regression", "random_forest",
     ensemble.RandomForestRegressor(**FOREST_PARAMS),
     utils.train_model_regression,
 ),
 (
     "classification", "random_forest",
     ensemble.RandomForestClassifier(**FOREST_PARAMS),
     utils.train_model_classification,
 ),
 (
     "regression", "xgboost",
     xgboost.XGBRegressor(**XGBOOST_PARAMS),
     utils.train_model_regression,
 ),
 (
     "classification", "xgboost",
     xgboost.XGBClassifier(**XGBOOST_PARAMS),
     utils.train_model_classification,
 ),
 (
     "regression", "lightgbm",
     lightgbm.LGBMRegressor(**LIGHT_GBM_PARAMS),
     utils.train_model_regression,
 ),
 (
     "classification", "lightgbm",
     lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS),
Example #22
    def search(self,
               feature,
               label,
               vaild_data=None,
               sample_weight=None,
               metrics=mean_squared_error,
               loss='reg:squarederror',
               scoring=0.5,
               cv=5,
               cv_num=3,
               metrics_min=True,
               speedy=True,
               speedy_param=(20000, 0.3),
               gpu_id=-1,
               save_model_dir=None,
               save_model_name='xgb'):
        """XGBRegressor model params search use GridSearch method.

        Args:
            feature: pandas dataframe, model's feature.
            label: pandas series, model's label.
            vaild_data: A list of (X, y, sample_weight) tuple pairs to use as validation sets, for which metrics will be computed. 
            sample_weight: pd.Series or np.array, sample weight, shape is (n,).
            metrics: model metrics function, default is `la.metrics.mean_squared_error`.
            loss: XGBRegressor param 'objective'.
            scoring: metrics error opt base line value.
            cv: cross validation fold.
            cv_num: if use speedy method, minimum cross validation fold.
            metrics_min: metrics value whether the smaller the better.
            speedy: whether use speedy method.
            speedy_param: if use speedy method, test_size will be set, 
                          test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
            gpu_id: int, use gpu device ordinal, -1 is not use gpu.
            save_model_dir: str, save model folder.
            save_model_name: str, save model name prefix, "`xgb`_model.json" and "`xgb`_params.json".
        Returns:
            a best XGBRegressor model params dict.
        Raises:
            params error.
        """
        import warnings
        warnings.filterwarnings("ignore")
        import xgboost as xgb
        assert xgb.__version__ >= __xgboost_version__, f'xgboost version should be >={__xgboost_version__}.'
        logger = Logger(name='xgb')
        logger.info(f"api is deprecated and will be removed in 1.5.0")
        logger.info(f"please use la.param_search.GridSearch")
        if speedy:
            test_size = 1 - round(
                min(speedy_param[0], feature.shape[0] * speedy_param[1]) /
                feature.shape[0], 2)
        tree_method = ['gpu_hist'] if gpu_id > -1 else [
            'auto', 'exact', 'approx', 'hist'
        ]
        n_job = int(np.ceil(cpu_count() * 0.8))

        self.HyperParameter.Choice('n_jobs', [n_job])
        self.HyperParameter.Choice('objective', [loss])
        self.HyperParameter.Choice('tree_method', tree_method)
        self.HyperParameter.Choice('gpu_id', [gpu_id])

        if vaild_data is not None:
            cv_score_list = []

        logger.info(f"Start XGBRegressor hyperparameter grid search.")
        nums = self.HyperParameter.cardinality()
        for i in range(1, nums + 1):
            self.HyperParameter.update(self.best_params)
            model = xgb.XGBRegressor(**self.HyperParameter.params)
            score = []
            if speedy:
                for _ in range(cv_num):
                    index_list = train_test_split(feature,
                                                  test_size=test_size,
                                                  shuffle=True,
                                                  seed=np.random.choice(
                                                      range(100), 1)[0])
                    weight = None if sample_weight is None else sample_weight[
                        index_list[0]]
                    model.fit(feature.loc[index_list[0]],
                              label[index_list[0]],
                              sample_weight=weight)
                    cv_pred = pd.Series(model.predict(
                        feature.loc[index_list[1]]),
                                        index=label[index_list[1]].index)
                    if sample_weight is None:
                        score.append(metrics(label[index_list[1]], cv_pred))
                    else:
                        score.append(
                            metrics(label[index_list[1]],
                                    cv_pred,
                                    sample_weight=sample_weight))
            else:
                index_list = kfold(feature,
                                   n_splits=cv,
                                   shuffle=True,
                                   seed=np.random.choice(range(100), 1)[0])
                for n, index in enumerate(index_list):
                    weight = None if sample_weight is None else sample_weight[
                        index[0]]
                    model.fit(feature.loc[index[0]],
                              label[index[0]],
                              sample_weight=weight)
                    cv_pred = pd.Series(model.predict(feature.loc[index[1]]),
                                        index=label[index[1]].index)
                    if sample_weight is None:
                        score.append(metrics(label[index[1]], cv_pred))
                    else:
                        score.append(
                            metrics(label[index[1]],
                                    cv_pred,
                                    sample_weight=sample_weight))
            cv_score = np.mean(score)
            if vaild_data is not None:
                cv_score_list.append(cv_score)
                if metrics_min:
                    cv_score_list.sort()
                    if cv_score_list[int(
                            len(cv_score_list) * 0.2)] >= cv_score:
                        cv_pred = pd.Series(model.predict(vaild_data[0]),
                                            index=vaild_data[1].index)
                        if len(vaild_data) == 2:
                            cv_score = metrics(vaild_data[1], cv_pred)
                        else:
                            cv_score = metrics(vaild_data[1],
                                               cv_pred,
                                               sample_weight=vaild_data[2])
                    else:
                        logger.info(
                            f"Grid search progress: {i/nums*100:.1f}%, best score: {scoring:.4f}",
                            enter=False if i < nums else True)
                        continue
                else:
                    cv_score_list.sort(reverse=1)
                    if cv_score_list[int(
                            len(cv_score_list) * 0.2)] <= cv_score:
                        cv_pred = pd.Series(model.predict(vaild_data[0]),
                                            index=vaild_data[1].index)
                        cv_score = metrics(vaild_data[1], cv_pred)
                    else:
                        logger.info(
                            f"Grid search progress: {i/nums*100:.1f}%, best score: {scoring:.4f}",
                            enter=False if i < nums else True)
                        continue
            if metrics_min:
                if cv_score < scoring:
                    scoring = cv_score
                    self.best_params = self.HyperParameter.params.copy()
                    self.best_params_history[i] = {
                        'score': scoring,
                        'best_params': self.best_params.copy()
                    }
                    if save_model_dir is not None:
                        model.save_model(
                            os.path.join(save_model_dir,
                                         f"{save_model_name}_model.json"))
                        with open(
                                os.path.join(save_model_dir,
                                             f"{save_model_name}_params.json"),
                                'w') as f:
                            json.dump(self.best_params, f)
            else:
                if cv_score > scoring:
                    scoring = cv_score
                    self.best_params = self.HyperParameter.params.copy()
                    self.best_params_history[i] = {
                        'score': scoring,
                        'best_params': self.best_params.copy()
                    }
                    if save_model_dir is not None:
                        model.save_model(
                            os.path.join(save_model_dir,
                                         f"{save_model_name}_model.json"))
                        with open(
                                os.path.join(save_model_dir,
                                             f"{save_model_name}_params.json"),
                                'w') as f:
                            json.dump(self.best_params, f)
            logger.info(
                f"Grid search progress: {i/nums*100:.1f}%, best score: {scoring:.4f}",
                enter=False if i < nums else True)
        logger.info(f"XGBRegressor grid search best score: {scoring:.4f}",
                    close=True,
                    time_mode=1)
        return self.best_params
#### ALL INPUTS FEATURES

# # # - - - XGB - - - # # #
model = 'xgb3'
best_parameters3 = {
    'colsample_bytree': 0.85,
    'learning_rate': 0.02,
    'max_depth': 10,
    'min_child_weight': 3,
    'n_estimators': 700,
    'nthread': 4,
    'objective': 'reg:linear',
    'silent': 1,
    'subsample': 0.85
}
regression = xgb.XGBRegressor(**best_parameters3)

X_train = df.drop(['calories_per_ha'], axis=1)
y_train = df['calories_per_ha']
regression.fit(X_train, y_train)
save_model('../ipbes_invest_crop_yield_project/output/Models/' + model +
           '.sav')
y_predicted = regression.predict(X_validation)

R2_validation = sklearn.metrics.r2_score(y_validation, y_predicted)
validation_R2 = validation_R2.append(
    {
        'Model': model,
        'Validation_R2': R2_validation
    }, ignore_index=True)
Example #24
from sklearn.feature_selection import SelectFromModel
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
dataset = load_boston()
x = dataset.data
y = dataset.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)
model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.01)
model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

result = model.evals_result()
print(result)
y_pred = model.predict(x_test)
score = r2_score(y_test, y_pred)
print(score)
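Because eval_set above contains both the train and test sets, evals_result() stores one RMSE curve per set; a small hedged sketch of plotting them (matplotlib is an added import):

# Hypothetical follow-up: visualise the curves recorded during early stopping above.
import matplotlib.pyplot as plt

plt.plot(result['validation_0']['rmse'], label='train rmse')  # first eval_set entry
plt.plot(result['validation_1']['rmse'], label='test rmse')   # second eval_set entry
plt.legend()
plt.show()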
Example #25
    lrs = [
        linear_model.BayesianRidge(),
        linear_model.ARDRegression(),
        linear_model.ElasticNet(),
        linear_model.HuberRegressor(),
        linear_model.Lars(),
        linear_model.LinearRegression(),
        linear_model.LogisticRegression(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.RandomizedLogisticRegression(),
        linear_model.RANSACRegressor(),
        linear_model.Ridge(),
        linear_model.SGDRegressor(),
        linear_model.TheilSenRegressor(),
        xgb.XGBRegressor(learning_rate=0.1, reg_alpha=1),
        xgb.XGBRegressor(learning_rate=0.2, reg_alpha=1),
        xgb.XGBRegressor(learning_rate=0.2),
        ensemble.AdaBoostRegressor(),
        ensemble.BaggingRegressor(),
        ensemble.ExtraTreesRegressor(n_estimators=100),
        ensemble.GradientBoostingRegressor(),
        ensemble.RandomForestRegressor(n_estimators=100)
    ]
    best_lr = None
    lr = xgb.XGBRegressor(learning_rate=0.1, max_depth=2,
                          reg_alpha=1)  #linear_model.BayesianRidge()
    cv = model_selection.cross_val_score(lr,
                                         v1,
                                         v.y,
                                         cv=10,
gbm_gridsearch.fit(x_train_values, y_train_values)
gbm_best_model_predictions = gbm_gridsearch.best_estimator_.predict(
    x_test_values)
generate_submission_file(
    gbm_best_model_predictions, test_data["Id"],
    "../results/" + user + "_Gradient_Boosted_Machines_GridSearchCV.csv")

#####################################################################
## XGBoost                                                         ##
#####################################################################

#####################################################################
### Weak Learner is a Tree                                        ###
#####################################################################

xgb_model = xgb.XGBRegressor()
xgb_model.fit(x_train_values, y_train_values)
xgb_model_predictions = xgb_model.predict(x_test_values)
generate_submission_file(xgb_model_predictions, test_data["Id"],
                         "../results/" + user + "_XGBoost_Basic.csv")

param_grid = {
    "max_depth": [2, 4, 6],
    "n_estimators": np.linspace(100, 500, 5, dtype=np.int64)
}
xgb_grid_search = GridSearchCV(xgb.XGBRegressor(objective="reg:linear"),
                               param_grid)
xgb_grid_search.fit(x_train_values, y_train_values)
xgb_model_predictions = xgb_grid_search.predict(x_test_values)
generate_submission_file(xgb_model_predictions, test_data["Id"],
                         "../results/" + user + "_XGBoost_GridSearchCV.csv")
class HousePrices(object):
    seq2 = pd.Series(np.arange(2))

    #Static class models.
    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    ENet = make_pipeline(RobustScaler(),
                         ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,
                                 gamma=0.0468,
                                 learning_rate=0.05,
                                 max_depth=3,
                                 min_child_weight=1.7817,
                                 n_estimators=2200,
                                 reg_alpha=0.4640,
                                 reg_lambda=0.8571,
                                 subsample=0.5213,
                                 silent=1,  # deprecated; newer XGBoost uses verbosity=0
                                 random_state=7,
                                 nthread=-1)
    GBoost = GradientBoostingRegressor(n_estimators=3000,
                                       learning_rate=0.05,
                                       max_depth=4,
                                       max_features='sqrt',
                                       min_samples_leaf=15,
                                       min_samples_split=10,
                                       loss='huber',
                                       random_state=5)
    model_lgb = lgb.LGBMRegressor(objective='regression',
                                  num_leaves=5,
                                  learning_rate=0.05,
                                  n_estimators=720,
                                  max_bin=55,
                                  bagging_fraction=0.8,
                                  bagging_freq=5,
                                  feature_fraction=0.2319,
                                  feature_fraction_seed=9,
                                  bagging_seed=9,
                                  min_data_in_leaf=6,
                                  min_sum_hessian_in_leaf=11)
    KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

    #Constructor
    def __init__(self, trainData, testData):
        self.trainData = trainData
        self.testData = testData

    def dataImport(self):
        self.train = pd.read_csv(self.trainData)
        self.test = pd.read_csv(self.testData)
        self.train_Id = self.train['Id']
        self.test_Id = self.test['Id']
        self.train.drop("Id", axis=1, inplace=True)
        self.test.drop("Id", axis=1, inplace=True)

    def display(self):
        print(len(self.train.columns))
        fig, ax = plt.subplots()
        ax.scatter(x=self.train['GrLivArea'], y=self.train['SalePrice'])
        plt.ylabel('SalePrice', fontsize=13)
        plt.xlabel('GrLivArea', fontsize=13)
        #plt.show()

        # corrmat = self.train.corr()
        # f, ax = plt.subplots(figsize=(12, 9))
        # sns.heatmap(self.corrmat, vmax=.8, square=True);
        plt.show()

        # sns.distplot(self.train['SalePrice'] , fit=norm);

        # # Get the fitted parameters used by the function
        # (mu, sigma) = norm.fit(self.train['SalePrice'])
        # print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

        # #Now plot the distribution
        # plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best')
        # plt.ylabel('Frequency')
        # plt.title('SalePrice distribution')

        # #Get also the QQ-plot
        # fig = plt.figure()
        # res = stats.probplot(self.train['SalePrice'], plot=plt)
        # plt.show()

        # f, ax = plt.subplots(figsize=(15, 12))
        # plt.xticks(rotation='90')
        # sns.barplot(x=self.all_data_na.index, y=self.all_data_na)
        # plt.xlabel('Features', fontsize=15)
        # plt.ylabel('Percent of missing values', fontsize=15)
        # plt.title('Percent missing data by feature', fontsize=15)

        #plt.show()

    def removeOutliers(self):
        self.train = self.train.drop(
            self.train[(self.train['GrLivArea'] > 4000)
                       & (self.train['SalePrice'] < 300000)].index)

    def preProcess(self):
        self.removeOutliers()

        self.train['SalePrice'] = np.log1p(self.train['SalePrice'])
        self.ntrain = self.train.shape[0]
        self.ntest = self.test.shape[0]
        self.y_train = self.train.SalePrice.values
        self.all_data = pd.concat(
            (self.train, self.test)).reset_index(drop=True)
        self.all_data.drop(['SalePrice'], axis=1, inplace=True)
        print("all_data size is : {}".format(self.all_data.shape))

        self.all_data_na = (self.all_data.isnull().sum() /
                            len(self.all_data)) * 100
        self.all_data_na = self.all_data_na.drop(
            self.all_data_na[self.all_data_na == 0].index).sort_values(
                ascending=False)[:30]
        self.missing_data = pd.DataFrame({'Missing Ratio': self.all_data_na})

        self.preprocessCategoricalColumns()
        self.preProcessNumericalColumns()

    def preprocessCategoricalColumns(self):
        #Converting PoolQC column to categorical and then using a probability distribution to fill the None values.

        print("Total Number of values ", self.all_data['PoolQC'].shape[0])
        print("Number of Null Values", self.all_data['PoolQC'].isna().sum())

        #
        #				PoolQC
        #
        #

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["PoolQC"] = self.all_data.PoolQC.fillna("None")
        self.all_data['PoolQC'] = pd.Categorical(self.all_data.PoolQC)

        # (2) Finding probabilities of each occurrence

        print("Before filling :")
        print(self.all_data['PoolQC'].value_counts())

        self.poolQC_probabilities = [
            0.98, 0.006666667, 0.006666667, 0.006666667
        ]
        self.poolQC_Values = ['None', 'Gd', 'Fa', 'Ex']
        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['PoolQC'] == 'None'].index

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices,
                           65] = np.random.choice(self.poolQC_Values,
                                                  len(self.indices),
                                                  p=self.poolQC_probabilities)

        print("After filling :")
        print(self.all_data.PoolQC.value_counts())

        ############################################################################################

        #
        #				MiscFeature
        #
        #
        #Number of Missing values in MiscFeature
        self.all_data.MiscFeature.isna().sum(
        )  #  1404 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["MiscFeature"] = self.all_data['MiscFeature'].fillna(
            "None")
        self.all_data['MiscFeature'] = pd.Categorical(
            self.all_data['MiscFeature'])
        self.all_data.MiscFeature = self.all_data.MiscFeature.astype(
            'category')

        # print("Before Filling :")
        # print(self.all_data['MiscFeature'].value_counts())

        # (2) Finding probabilities of each occurrence
        print(self.all_data['MiscFeature'].value_counts())
        self.MiscFeature_probabilities = [
            0.962962963, 0.033607682, 0.001371742, 0.001371742, 0.000685871
        ]
        self.MiscFeature_Values = ['None', 'Shed', 'Othr', 'Gar2', 'TenC']

        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['MiscFeature'] ==
                                     'None'].index
        #Find the column index so as to use 'iloc'   . 56 is the col
        np.argwhere(self.all_data.columns == 'MiscFeature')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices, 56] = np.random.choice(
            self.MiscFeature_Values,
            len(self.indices),
            p=self.MiscFeature_probabilities)

        # print("After filling")
        # print(self.all_data["MiscFeature"].value_counts())

        ############################################################################################

        #
        #				Alley
        #
        #

        #Number of Missing values in Alley
        self.all_data['Alley'].isna().sum()  #  1367 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["Alley"] = self.all_data['Alley'].fillna("None")
        self.all_data['Alley'] = pd.Categorical(self.all_data['Alley'])

        # (2) Finding probabilities of each occurrence

        print("Before filling :")
        print(self.all_data['Alley'].value_counts())

        # Count of 'None' : 1367
        # Count of 'Grvl' : 50
        # Count of 'Pave' : 41

        self.Alley_probabilities = [0.937585734, 0.034293553, 0.028120713]
        self.Alley_Values = ['None', 'Grvl', 'Pave']

        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['Alley'] == 'None'].index
        #Find the column index so as to use 'iloc'   . 3 is the col
        np.argwhere(self.all_data.columns == 'Alley')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices,
                           3] = np.random.choice(self.Alley_Values,
                                                 len(self.indices),
                                                 p=self.Alley_probabilities)
        print("gg")
        self.all_data['Alley'].value_counts()

        print("After filling :")
        print(self.all_data['Alley'].value_counts())

        ###########################################################################################

        #
        #				Fence
        #
        #

        #Number of Missing values in Fence
        self.all_data['Fence'].isna().sum()  #  1177 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["Fence"] = self.all_data['Fence'].fillna("None")
        self.all_data['Fence'] = pd.Categorical(self.all_data['Fence'])

        # (2) Finding probabilities of each occurrence

        print("Before filling :")
        print(self.all_data['Fence'].value_counts())

        # Count of 'None' : 1177
        # Count of 'MnPrv' : 157
        # Count of 'GdPrv' : 59
        # Count of 'GdWo' : 54
        # Count of 'MnWw' : 11

        self.Fence_probabilities = [
            0.807270233, 0.107681756, 0.040466392, 0.037037037, 0.007544582
        ]
        self.Fence_Values = ['None', 'MnPrv', 'GdPrv', 'GdWo', 'MnWw']
        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['Fence'] == 'None'].index
        #Find the column index so as to use 'iloc'   . 25 is the col
        np.argwhere(self.all_data.columns == 'Fence')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices,
                           25] = np.random.choice(self.Fence_Values,
                                                  len(self.indices),
                                                  p=self.Fence_probabilities)

        print("After filling :")
        print(self.all_data['Fence'].value_counts())

        #########################################################################################

        #
        #				FireplaceQu
        #
        #

        #Number of Missing values in FireplaceQu
        self.all_data['FireplaceQu'].isna().sum(
        )  #  690 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["FireplaceQu"] = self.all_data['FireplaceQu'].fillna(
            "None")
        self.all_data['FireplaceQu'] = pd.Categorical(
            self.all_data['FireplaceQu'])

        # (2) Finding probabilities of each occurrence
        print("Before filling :")
        print(self.all_data['FireplaceQu'].value_counts())

        # Count of 'None' : 690
        # Count of 'Gd' : 378
        # Count of 'TA' : 313
        # Count of 'Fa' : 33
        # Count of 'Ex' : 24
        # Count of 'Po' : 20

        self.FireplaceQu_probabilities = [
            0.473251029, 0.259259259, 0.214677641, 0.022633745, 0.016460905,
            0.013717421
        ]
        self.FireplaceQu_Values = ['None', 'Gd', 'TA', 'Fa', 'Ex', 'Po']

        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['FireplaceQu'] ==
                                     'None'].index

        #Find the column index so as to use 'iloc'   . 26 is the col
        np.argwhere(self.all_data.columns == 'FireplaceQu')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices, 26] = np.random.choice(
            self.FireplaceQu_Values,
            len(self.indices),
            p=self.FireplaceQu_probabilities)

        print("After filling :")
        print(self.all_data['FireplaceQu'].value_counts())

        ###########################################################################################

        #
        #				LotFrontage
        #
        #
        '''
		Assuming houses belonging to the same Neighborhood will have similar LotFrontage, we groupby Neighborhood
		and then take mean for each locality. Then we substitute the missing values of a particular Neighborhood with
		the mean of that Neighborhood
		'''
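        # A more concise equivalent of the loop below (a sketch, assuming the
        # same 'Neighborhood' / 'LotFrontage' columns) would be a grouped
        # transform:
        #   self.all_data['LotFrontage'] = self.all_data.groupby(
        #       'Neighborhood')['LotFrontage'].transform(
        #           lambda s: s.fillna(s.mean()))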

        self.lotFrontage_df = self.all_data[['Neighborhood',
                                             'LotFrontage']].copy()
        self.groupby_Neighborhood = self.lotFrontage_df.groupby('Neighborhood')

        self.indices = self.all_data[self.all_data['LotFrontage'].isna()].index

        self.mean_Neighborhood = self.groupby_Neighborhood.mean()
        self.mean_Neighborhood.head()

        for i in self.indices:
            # DataFrame.get_value was removed in pandas 1.0; use label-based .loc
            # with column names rather than hard-coded positional indices.
            self.locality = self.all_data.loc[i, 'Neighborhood']
            self.value = self.mean_Neighborhood.loc[self.locality, 'LotFrontage']
            self.all_data.loc[i, 'LotFrontage'] = self.value

        ###########################################################################################

        #
        #
        #	 (6)GarageYrBlt (7) GarageArea (8) GarageCar
        #
        #   (9)GarageType (10) GarageFinish (11) GarageQual (12)GarageCond

        for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
            self.all_data[col] = self.all_data[col].fillna(0)

        self.all_data['GarageType'] = self.all_data['GarageType'].fillna(
            'None')
        self.all_data['GarageFinish'] = self.all_data['GarageFinish'].fillna(
            'None')
        self.all_data['GarageQual'] = self.all_data['GarageQual'].fillna(
            'None')
        self.all_data['GarageCond'] = self.all_data['GarageCond'].fillna(
            'None')

        for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                    'BsmtFullBath', 'BsmtHalfBath'):
            self.all_data[col] = self.all_data[col].fillna(0)

        for col in ('BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                    'BsmtQual'):
            self.all_data[col] = self.all_data[col].fillna('None')

        #############################################################################################

        #
        #
        #	 Electrical , Exterior1st,Exterior2nd,SaleType,KitchenQual
        #
        #

        #Electrical has only 1 null value, hence replacing it with the most frequently occurring value, i.e. the mode of the column

        self.all_data['Electrical'] = self.all_data['Electrical'].fillna(
            self.all_data['Electrical'].mode()[0])

        #Similarly for Exterior1st, Exterior2nd,SaleType and KitchenQual
        self.all_data['Exterior1st'] = self.all_data['Exterior1st'].fillna(
            self.all_data['Exterior1st'].mode()[0])
        self.all_data['Exterior2nd'] = self.all_data['Exterior2nd'].fillna(
            self.all_data['Exterior2nd'].mode()[0])
        self.all_data['KitchenQual'] = self.all_data['KitchenQual'].fillna(
            self.all_data['KitchenQual'].mode()[0])
        self.all_data['SaleType'] = self.all_data['SaleType'].fillna(
            self.all_data['SaleType'].mode()[0])

        ##############################################################################################

        #
        #
        #
        #    'MasVnrArea','MasVnrType' and other columns
        #
        #

        self.indices = self.all_data[self.all_data['MasVnrArea'] == 0].index

        self.all_data['MasVnrArea'] = self.all_data['MasVnrArea'].fillna(0)
        self.all_data['MasVnrType'] = self.all_data['MasVnrType'].fillna(
            'None')
        self.all_data = self.all_data.drop(['Utilities'], axis=1)

        self.all_data["Functional"] = self.all_data["Functional"].fillna("Typ")
        self.all_data['MSSubClass'] = self.all_data['MSSubClass'].fillna(
            "None")

        ##############################################################################################

        # No columns with missing values remain.

        # MSSubClass is categorical: only a fixed set of numbers appears, so convert it to a string column.

        # OverallCond is categorical: only a fixed set of numbers appears, so convert it to a string column.

        self.all_data['MSSubClass'].unique()
        #array([ 20, 180,  60,  80,  50,  75,  30,  70,  90, 120,  45, 190,  85,  160,  40])

        self.all_data['MSSubClass'] = self.all_data['MSSubClass'].apply(str)

        self.all_data['OverallCond'].unique()
        #array([6, 5, 7, 8, 3, 4, 9, 2, 1])

        self.all_data['OverallCond'] = self.all_data['OverallCond'].apply(str)

        #Unlike YearBuilt, YrSold takes only a small set of values, so convert it to categorical.
        self.all_data['YrSold'].unique()
        #array([2008, 2006, 2010, 2007, 2009])

        self.all_data['YrSold'] = self.all_data['YrSold'].astype(str)

        #Similarly for the month sold (MoSold)
        self.all_data['MoSold'].unique()
        #array([ 5,  6,  3,  4, 12,  7,  8, 11,  1, 10,  2,  9])

        self.all_data['MoSold'] = self.all_data['MoSold'].astype(str)

        # Linear regression works only on numeric columns, so use LabelEncoder to convert
        # the categorical columns listed below to numeric codes.

        #Set of columns which have categorical values:

        self.columns = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual',
                        'GarageCond', 'ExterQual', 'ExterCond', 'HeatingQC',
                        'PoolQC', 'KitchenQual', 'BsmtFinType1',
                        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
                        'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive',
                        'Street', 'Alley', 'CentralAir', 'MSSubClass',
                        'OverallCond', 'YrSold', 'MoSold')

        for column in self.columns:
            self.lbl = LabelEncoder()
            self.lbl.fit(list(self.all_data[column].values))
            self.all_data[column] = self.lbl.transform(
                list(self.all_data[column].values))

        # skewness = skewness[abs(skewness) > 0.75]
        # print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

        # from scipy.special import boxcox1p
        # self.skewed_features = skewness.index
        # lam = 0.15
        # for feat in self.skewed_features:
        #     #all_data[feat] += 1
        #     self.all_data[feat] = boxcox1p(self.all_data[feat], self.lam)

        # One-hot encode the remaining categorical columns (pd.get_dummies creates one indicator column per category rather than a 0,1,2,... label mapping).
        self.all_data = pd.get_dummies(self.all_data)

    def preProcessNumericalColumns(self):
        #These features are positively correlated with SalePrice, so create new features by
        #taking three transforms of each: square, cube and square root.

        # The most strongly correlated features (correlation with SalePrice):

        # OverallQual    0.817315
        # GrLivArea      0.715624
        # GarageCars     0.687771
        # GarageArea     0.662332
        # TotalBsmtSF    0.637558
        # 1stFlrSF       0.608198
        # FullBath       0.582020
        # YearBuilt      0.572574

        # Total square footage matters, so add a combined TotalSF feature
        self.all_data['TotalSF'] = (self.all_data['TotalBsmtSF'] +
                                    self.all_data['1stFlrSF'] +
                                    self.all_data['2ndFlrSF'])

        self.all_data["OverallQual-s2"] = self.all_data["OverallQual"]**2
        self.all_data["OverallQual-s3"] = self.all_data["OverallQual"]**3
        self.all_data["OverallQual-Sq"] = np.sqrt(self.all_data["OverallQual"])

        self.all_data["GrLivArea-s2"] = self.all_data["GrLivArea"]**2
        self.all_data["GrLivArea-s3"] = self.all_data["GrLivArea"]**3
        self.all_data["GrLivArea-Sq"] = np.sqrt(self.all_data["GrLivArea"])

        self.all_data["GarageCars-s2"] = self.all_data["GarageCars"]**2
        self.all_data["GarageCars-s3"] = self.all_data["GarageCars"]**3
        self.all_data["GarageCars-Sq"] = np.sqrt(self.all_data["GarageCars"])

        self.all_data["GarageArea-s2"] = self.all_data["GarageArea"]**2
        self.all_data["GarageArea-s3"] = self.all_data["GarageArea"]**3
        self.all_data["GarageArea-Sq"] = np.sqrt(self.all_data["GarageArea"])

        self.all_data["TotalBsmtSF-s2"] = self.all_data["TotalBsmtSF"]**2
        self.all_data["TotalBsmtSF-s3"] = self.all_data["TotalBsmtSF"]**3
        self.all_data["TotalBsmtSF-Sq"] = np.sqrt(self.all_data["TotalBsmtSF"])

        self.all_data["1stFlrSF-s2"] = self.all_data["1stFlrSF"]**2
        self.all_data["1stFlrSF-s3"] = self.all_data["1stFlrSF"]**3
        self.all_data["1stFlrSF-Sq"] = np.sqrt(self.all_data["1stFlrSF"])

        self.all_data["FullBath-s2"] = self.all_data["FullBath"]**2
        self.all_data["FullBath-s3"] = self.all_data["FullBath"]**3
        self.all_data["FullBath-Sq"] = np.sqrt(self.all_data["FullBath"])

        self.all_data["YearBuilt-s2"] = self.all_data["YearBuilt"]**2
        self.all_data["YearBuilt-s3"] = self.all_data["YearBuilt"]**3
        self.all_data["YearBuilt-Sq"] = np.sqrt(self.all_data["YearBuilt"])

        self.all_data["TotalSF-s2"] = self.all_data["TotalSF"]**2
        self.all_data["TotalSF-s3"] = self.all_data["TotalSF"]**3
        self.all_data["TotalSF-Sq"] = np.sqrt(self.all_data["TotalSF"])

        # Split the combined frame back into train and test using the row
        # counts recorded in preProcess() (a hard-coded 1020 would mis-split).
        self.train = self.all_data[:self.ntrain]
        self.test = self.all_data[self.ntrain:]

        self.all_data.to_csv('./all.csv')

    #Validation function

    def rmsle_cv(self, model):
        # Use the KFold object itself as cv; calling get_n_splits() here would
        # return the plain integer 5 and silently discard shuffle/random_state.
        self.kf = KFold(n_splits=5, shuffle=True, random_state=42)
        self.rmse = np.sqrt(-cross_val_score(model,
                                             self.train.values,
                                             self.y_train,
                                             scoring="neg_mean_squared_error",
                                             cv=self.kf))
        return (self.rmse)

    #Lasso. Best alpha : 0.0005 / 91% accuracy
    def lasso_model(self):
        self.lasso_m = Lasso()
        self.alpha = [0.0005, 0.0003, 0.0007]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.lasso_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.lasso = self.grid_search.best_estimator_
        # #self.lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
        # #self.score = self.rmsle_cv(self.lasso)
        # self.score = self.rmsle_cv(HousePrices.lasso)
        # print("\nLasso score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    # ElasticNet. Best Alpha : 0.001  / 91% accuracy.
    def elasticNet(self):
        self.enet_m = ElasticNet()
        self.alpha = [0.0005, 0.0007, 0.001]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.enet_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.enet_m = self.grid_search.best_estimator_

        # #self.ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        # self.score = self.rmsle_cv(HousePrices.ENet)
        # print("ElasticNet score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    #Kernel Ridge regression. Best alpha : .0005 / 79% accuracy
    def kernelRegression(self):
        self.krr_m = KernelRidge()
        self.alpha = [0.0005, 0.0007, 0.001, 0.0006, 0.0001]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.krr_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.krr_m = self.grid_search.best_estimator_

        # #self.KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        # self.score = self.rmsle_cv(HousePrices.KRR)
        # print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    #GradientBoosting. Best alpha : .00065 / 89% accuracy
    def gradientBoosting(self):
        self.gboost_m = GradientBoostingRegressor()
        self.alpha = [0.00068, 0.00065, 0.00066]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.gboost_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.gboost_m = self.grid_search.best_estimator_

        # #self.GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10,loss='huber', random_state =5)
        # self.score = self.rmsle_cv(HousePrices.GBoost)
        # print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    # XgbRegressor.Best alpha : .0005 / 79% accuracy
    def xgbRegressor(self):
        #self.model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,learning_rate=0.05, max_depth=3,min_child_weight=1.7817, n_estimators=2200,reg_alpha=0.4640, reg_lambda=0.8571,subsample=0.5213, silent=1,random_state =7, nthread = -1)
        self.score = self.rmsle_cv(HousePrices.model_xgb)
        print("Xgboost score: {:.4f} ({:.4f})\n".format(
            self.score.mean(), self.score.std()))

    # LgbRegressor. Best alpha : .0005 / 79% accuracy
    def lgbRegressor(self):
        #model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,learning_rate=0.05, n_estimators=720,max_bin = 55, bagging_fraction = 0.8,bagging_freq = 5, feature_fraction = 0.2319,feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
        self.score = self.rmsle_cv(HousePrices.model_lgb)
        print("LgbRegressor score: {:.4f} ({:.4f})\n".format(
            self.score.mean(), self.score.std()))

    def rmsle(self, y, y_pred):
        return np.sqrt(mean_squared_error(y, y_pred))

    def stackingModels(self):
        #Lasso
        self.lasso_stacking = make_pipeline(
            RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        #ElasticNet
        self.ENet_stacking = make_pipeline(
            RobustScaler(),
            ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        #Kernel Ridge regression
        self.KRR_stacking = KernelRidge(alpha=0.6,
                                        kernel='polynomial',
                                        degree=2,
                                        coef0=2.5)
        #GBoost
        self.GBoost_stacking = GradientBoostingRegressor(n_estimators=3000,
                                                         learning_rate=0.05,
                                                         max_depth=4,
                                                         max_features='sqrt',
                                                         min_samples_leaf=15,
                                                         min_samples_split=10,
                                                         loss='huber',
                                                         random_state=5)

        #Lgb
        self.lgb_stacking = lgb.LGBMRegressor(objective='regression',
                                              num_leaves=5,
                                              learning_rate=0.05,
                                              n_estimators=720,
                                              max_bin=55,
                                              bagging_fraction=0.8,
                                              bagging_freq=5,
                                              feature_fraction=0.2319,
                                              feature_fraction_seed=9,
                                              bagging_seed=9,
                                              min_data_in_leaf=6,
                                              min_sum_hessian_in_leaf=11)

        #Stacking
        self.stacked_averaged_models = StackingAveragedModels(
            base_models=(self.ENet_stacking, self.GBoost_stacking,
                         self.KRR_stacking),
            meta_model=self.lasso_stacking)

        self.score = self.rmsle_cv(self.stacked_averaged_models)
        print("Stacking Averaged models score: {:.4f} ({:.4f})".format(
            self.score.mean(), self.score.std()))

        self.stacked_averaged_models.fit(self.train.values, self.y_train)
        self.stacked_train_pred = self.stacked_averaged_models.predict(
            self.train.values)
        self.stacked_pred = np.expm1(
            self.stacked_averaged_models.predict(self.test.values))
        print("RMSE of stacked ")
        print(self.rmsle(self.y_train, self.stacked_train_pred))
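
# A minimal driver sketch for the HousePrices class above (not part of the
# original source). The CSV paths are hypothetical placeholders, and
# StackingAveragedModels is assumed to be defined elsewhere in the project.
if __name__ == '__main__':
    hp = HousePrices('train.csv', 'test.csv')
    hp.dataImport()
    hp.preProcess()
    hp.lasso_model()
    hp.xgbRegressor()
    hp.stackingModels()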
Example #28
from sklearn.svm import LinearSVC


df = load()
df = df.loc[:70, :]
# print(df)
train_X, test_X, train_Y, test_Y = train_test_split(
    df[['ptt', 'vally_ptt', 'rr1', 'rr2', 'sum1', 'up1', 'down1', 'sum2', 'up2', 'down2']], df['high_pluse'])

# XGBoost
predictor = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=1,
    # gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=27)
# Linear regression
# predictor = linear_model.LinearRegression()
# Support vector machine kernels: 'linear' (linear), 'poly' (polynomial), 'rbf' (radial basis function)
# predictor = SVR(kernel='rbf')

predictor.fit(train_X, train_Y)
y = predictor.predict(test_X)
print('predict Y: ', list(y))
print('real Y: ', list(test_Y))
loss = mean_absolute_error(y, test_Y)
Example #29
model = xgb.cv(params,
               dtrain,
               num_boost_round=1000,
               early_stopping_rounds=100,
               nfold=5,
               metrics='rmse')
model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()
print(model['test-rmse-mean'].min())
print(model['train-rmse-mean'].min())

model_xgb = xgb.XGBRegressor(n_estimators=410,
                             learning_rate=0.08,
                             max_depth=2,
                             min_child_weight=3,
                             gamma=0,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             objective='reg:linear',  # renamed to 'reg:squarederror' in newer XGBoost
                             nthread=4,
                             scale_pos_weight=1,
                             seed=27)  #the params were tuned using xgb.cv
model_xgb.fit(train_x, train_y)

xgb_preds = np.expm1(model_xgb.predict(test_x))
lasso_preds = np.expm1(model_lasso.predict(test_x))

# Compare the two models' predictions against each other before blending
predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")

# Weighted blend: the lasso predictions carry the larger weight
preds = 0.7 * lasso_preds + 0.3 * xgb_preds
Example #30
    X = np.array([param_dict[params] for params in param_dict]).reshape(2, -1)
    return -(X[1] + 47) * np.sin(np.sqrt(
        np.abs(X[0] / 2.0 + (X[1] + 47)))) - X[0] * np.sin(
            np.sqrt(np.abs(X[0] - (X[1] + 47)))) + np.random.normal(
                0, 0.2, len(X[0]))


if __name__ == '__main__':
    file_id = time.time()
    step = 30
    n = 200

    eval_func = eggholder_function
    input_domain = [-212, 212]

    xgb = xgboost.XGBRegressor(verbosity=0)
    init_df = None

    explorer = Explorer(
        {
            'param1': RandomFloat(input_domain[0], input_domain[1]),
            'param2': RandomFloat(input_domain[0], input_domain[1]),
        },
        path="data/out_%d.csv" % file_id)

    for i in range(0, n, step):
        init_df = explorer.explore(step, eval_func, init_n=5)

        X, y = init_df.iloc[:, :-1].values, init_df.iloc[:, -1].values

        print("Number of data points : %d" % X.shape[0])