Example #1
def test_boston_housing_rf_regression():
    import numpy as np
    import xgboost as xgb
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston  # removed in scikit-learn >= 1.2
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    rng = np.random.RandomState(1994)  # module-level in the original test; seed value assumed here so the snippet runs
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRFRegressor(random_state=42).fit(
            X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 35
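Note: load_boston was deprecated and then removed in scikit-learn 1.2, so the test above no longer runs on current scikit-learn. A hedged equivalent using fetch_california_housing (the function name and the 1.0 error threshold are assumptions, since the target is in units of $100,000) might look like:

def test_california_housing_rf_regression():
    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import fetch_california_housing
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    X, y = fetch_california_housing(return_X_y=True)
    kf = KFold(n_splits=2, shuffle=True, random_state=1994)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRFRegressor(random_state=42).fit(
            X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        # the target scale differs from Boston, so the threshold below is an assumption
        assert mean_squared_error(labels, preds) < 1.0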
Example #2
def optimal_model(df):
    import numpy as np
    import xgboost as xgb
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import make_scorer, mean_squared_error
    from sklearn.model_selection import GridSearchCV, KFold

    clf_list = [RandomForestRegressor(), xgb.XGBRFRegressor()] #GradientBoostingRegressor(), LinearRegression(),
    #######################################################
    n_folds = 4
    X = df.drop('count', axis=1)
    y = df['count']
    # keep the KFold object itself (get_n_splits would return only an int and drop shuffle/random_state)
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    n_estimatorslist = [100, 300,500]
    maxdepthList = [3, 5, 10]
    learning_ratelist = [0.01,0.1,1]
    colsample_bytreelist = [0.4,0.6]
    gammalist = [0]
    gridBool = [True, False]
    bestscorelist = []
    best_searchlist = []
    ###################################################
    param_GridList = [
    # [{'fit_intercept': gridBool}],  # Linear regressor
        [{'n_estimators':n_estimatorslist,
        'max_depth': maxdepthList}], # for Random Forest
        [{'n_estimators':n_estimatorslist,
        'max_depth': maxdepthList,
        'learning_rate':learning_ratelist,
        'colsample_bytree':colsample_bytreelist,
        'gamma': gammalist}] #xgb
        ]
    #####################################################
    def my_scoring(y_true, y_pred):
        # RMSLE: the +1 belongs inside the log (log1p), not outside it
        error = np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))
        return error
    my_scoring = make_scorer(my_scoring, greater_is_better=False)
    #####################################################
    for clf, params in zip(clf_list, param_GridList):
        best_search = GridSearchCV(estimator= clf,
                                param_grid=params,
                                cv=kf,
                                scoring = my_scoring)
        best_search.fit(X,y)
        bestParams = best_search.best_params_
        bestscore = round(-best_search.best_score_, 5)  # the scorer already returns RMSLE, so no extra sqrt
        bestscorelist.append(bestscore)
        best_searchlist.append(best_search)
        print('{}: the best params are {}, the best score is {}'
            .format(type(clf).__name__, bestParams, bestscore))

    return bestscorelist, best_searchlist
Example #3
def test_num_parallel_tree():
    from sklearn.datasets import load_boston
    reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4,
                           tree_method='hist')
    boston = load_boston()
    bst = reg.fit(X=boston['data'], y=boston['target'])
    dump = bst.get_booster().get_dump(dump_format='json')
    assert len(dump) == 16

    reg = xgb.XGBRFRegressor(n_estimators=4)
    bst = reg.fit(X=boston['data'], y=boston['target'])
    dump = bst.get_booster().get_dump(dump_format='json')
    assert len(dump) == 4

    config = json.loads(bst.get_booster().save_config())
    assert int(config['learner']['gradient_booster']['gbtree_train_param'][
        'num_parallel_tree']) == 4
Example #4
    def test_xgboost_regressor_unwrapped(self):
        """
        Validate xgboost regressor without wrapper
        """
        X, y = make_regression(n_samples=500,
                               n_features=22,
                               n_informative=8,
                               random_state=8311982)
        X_train, X_test, y_train, y_test = tts(X, y)

        model = xgb.XGBRFRegressor()
        oz = residuals_plot(model,
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            show=False)
        assert is_fitted(oz)
Example #5
def test_num_parallel_tree():
    from sklearn.datasets import fetch_california_housing
    reg = xgb.XGBRegressor(n_estimators=4,
                           num_parallel_tree=4,
                           tree_method='hist')
    X, y = fetch_california_housing(return_X_y=True)
    bst = reg.fit(X=X, y=y)
    dump = bst.get_booster().get_dump(dump_format='json')
    assert len(dump) == 16

    reg = xgb.XGBRFRegressor(n_estimators=4)
    bst = reg.fit(X=X, y=y)
    dump = bst.get_booster().get_dump(dump_format='json')
    assert len(dump) == 4

    config = json.loads(bst.get_booster().save_config())
    assert int(config['learner']['gradient_booster']['gbtree_train_param']
               ['num_parallel_tree']) == 4
Example #6
    def average_score(self, params):
        forrest = self.forrest
        x_train = self.x_train
        x_test = self.x_test
        y_train = self.y_train
        y_test = self.y_test
        features = self.features 
        targets = self.targets
        split = self.split

        '''
        this method tries the first 50 random seeds for the train/test split,
        then returns the average of the R^2 scores across those seeds
        params : this parameter will usually be the value of the best-param
        method, but you can enter any parameters you want, as long as they
        are included in the xgboost regressor documentation
        '''
        
        nums = []
        for num in range(0,50):
            try:
                x_train, x_test, y_train, y_test = train_test_split (features, targets, test_size=split, random_state = num)
                if forrest == False:
                    xgb_r = xg.XGBRegressor(**params)
                else:
                    xgb_r = xg.XGBRFRegressor(**params)

                # Fitting the model 
                xgb_r.fit(x_train, y_train) 

                # Predict the model 
                pred = xgb_r.predict(x_test) 

                r2  = metrics.r2_score(y_test.values, pred)
                nums.append(r2)


            except Exception as e:
                print(e)
                continue 
            
        # note: nums holds R^2 scores, so this stores the best score achieved, not the seed itself
        self.best_random_seed = max(nums)
        return sum(nums) / len(nums)
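A self-contained sketch of the same seed-averaging idea, using synthetic data and a reduced seed range (both assumptions) so it can run on its own:

import numpy as np
import xgboost as xg
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)
scores = []
for seed in range(10):
    # re-split with a different seed each iteration and collect the resulting R^2
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed)
    model = xg.XGBRFRegressor(n_estimators=50).fit(X_tr, y_tr)
    scores.append(r2_score(y_te, model.predict(X_te)))
print(np.mean(scores), max(scores))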
Example #7
    def r2_graph(self):
        '''
        plots the predicted value against the actual value for both x_train and x_test
        '''
        x_train = self.x_train
        x_test = self.x_test
        y_train = self.y_train
        y_test = self.y_test
        forrest = self.forrest
        params = self.params
        best_param = self.best_grid
        features = self.features 
        targets = self.targets
        split = self.split

        x_train, x_test, y_train, y_test = train_test_split (features, targets, test_size=split,
                                                             random_state = 24)
        if forrest == False:
            xgb_r = xg.XGBRegressor(**best_param)
        else:
            xgb_r = xg.XGBRFRegressor(**best_param)

        # Fitting the model 
        xgb_r.fit(x_train, y_train) 

        # Predict the model 
        pred_test = xgb_r.predict(x_test)
        pred_train =  xgb_r.predict(x_train)

        r2_test  = metrics.r2_score(y_test.values, pred_test)
        r2_train  = metrics.r2_score(y_train.values, pred_train)

        #print(num)

                
        plt.scatter(pred_test,y_test, marker = "D", color = 'blue', label = 'Test Data')
        plt.scatter(pred_train,y_train, marker = "D", color = 'red', label = 'Train Data')
        
        plt.xlabel('Predicted Value')
        plt.ylabel('Actual Value')
        plt.legend()
        plt.show()
Example #8
def test_num_parallel_tree():
    from sklearn.datasets import fetch_california_housing

    reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method="hist")
    X, y = fetch_california_housing(return_X_y=True)
    bst = reg.fit(X=X, y=y)
    dump = bst.get_booster().get_dump(dump_format="json")
    assert len(dump) == 16

    reg = xgb.XGBRFRegressor(n_estimators=4)
    bst = reg.fit(X=X, y=y)
    dump = bst.get_booster().get_dump(dump_format="json")
    assert len(dump) == 4

    config = json.loads(bst.get_booster().save_config())
    assert (
        int(
            config["learner"]["gradient_booster"]["gbtree_model_param"][
                "num_parallel_tree"
            ]
        )
        == 4
    )
Example #9
def test_regression_random_forest():
    base_score = 0.6
    estimator = xgboost.XGBRFRegressor(n_estimators=2,
                                       random_state=1,
                                       max_depth=1,
                                       base_score=base_score)
    utils.get_regression_model_trainer()(estimator)

    assembler = assemblers.XGBoostModelAssemblerSelector(estimator)
    actual = assembler.assemble()

    expected = ast.BinNumExpr(
        ast.BinNumExpr(
            ast.NumVal(0.6),
            ast.IfExpr(
                ast.CompExpr(ast.FeatureRef(5), ast.NumVal(6.94099998),
                             ast.CompOpType.GTE), ast.NumVal(18.1008453),
                ast.NumVal(9.60167599)), ast.BinNumOpType.ADD),
        ast.IfExpr(
            ast.CompExpr(ast.FeatureRef(5), ast.NumVal(6.79699993),
                         ast.CompOpType.GTE), ast.NumVal(17.780262),
            ast.NumVal(9.51712894)), ast.BinNumOpType.ADD)

    assert utils.cmp_exprs(actual, expected)
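The test above exercises m2cgen's internal assemblers; at the user level the equivalent conversion usually goes through the public export functions. A minimal sketch, assuming m2cgen is installed and substituting synthetic data:

import m2cgen as m2c
import xgboost
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=6, random_state=1)
estimator = xgboost.XGBRFRegressor(n_estimators=2, max_depth=1, random_state=1).fit(X, y)
code = m2c.export_to_python(estimator)  # pure-Python scoring function as a string
print(code[:200])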
Example #10
        # XGBoost (tree method "hist")
        regression(xgboost.XGBRegressor(**XGBOOST_HIST_PARAMS),
                   test_fraction=0.2),
        classification(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS),
                       test_fraction=0.2),
        classification_binary(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS),
                              test_fraction=0.2),

        # XGBoost (LINEAR)
        regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_LINEAR)),
        classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),
        classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),

        # XGBoost (RF)
        regression(xgboost.XGBRFRegressor(**XGBOOST_PARAMS_RF)),
        classification(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),
        classification_binary(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),

        # XGBoost (Boosted Random Forests)
        regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_BOOSTED_RF)),
        classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),
        classification_binary(
            xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),

        # XGBoost (Large Trees)
        regression_random(xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
        classification_random(xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
        classification_binary_random(
            xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
column_name_list.remove('time_to_failure')
print(len(column_name_list))

feature_scaler = StandardScaler()
feature_df[column_name_list] = feature_scaler.fit_transform(feature_df[column_name_list])


# Initialize models

clf_ridg = Ridge(max_iter=5000)
clf_laso = Lasso(max_iter=5000)
clf_lala = LassoLars(max_iter=5000)
clf_enet = ElasticNet(max_iter=5000)

clf_xgbr = xgb.XGBRegressor()
clf_xgrf = xgb.XGBRFRegressor()

clf_rf = RandomForestRegressor(criterion='mae', max_features='sqrt')
clf_tree = ExtraTreesRegressor(criterion='mae', max_features='sqrt')
clf_ada = AdaBoostRegressor()
clf_grad = GradientBoostingRegressor()
clf_svr = SVR()


# Model parameters

# mae 2.160
param_ridg = {
    'alpha': [1, 10, 30, 100, 300, 1000], # 300
    'tol': [0.00001, 0.0000001, 0.000000001, 0.00000000001], # 1e-5
    'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], # sparse_cg
# In[46]:

spark.conf.set("spark.synapse.ml.predict.enabled", "true")

# ## Train and Save Model

# ### Training

# In[47]:

data = np.random.rand(5, 10)  # 5 entities, each contains 10 features
label = np.random.randint(2, size=5)  # binary target (randint(1, ...) would always return 0)
dtrain = xgb.DMatrix(data, label=label)

xgr = xgb.XGBRFRegressor(objective='reg:linear', n_estimators=10, seed=123)  # 'reg:linear' is a deprecated alias of 'reg:squarederror'
xgr.fit(data, label)

# In[48]:

xgr.save_model('./model.json')

# In[49]:

mlflow.pyfunc.save_model(data_path='./model.json',
                         path='./xgboost_pyfunc_model_path',
                         loader_module='mlflow.xgboost')

# In[50]:

MODEL_URI = './xgboost_pyfunc_model_path'
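Loading the saved pyfunc model back for scoring might then look like the sketch below; the exact input type accepted by the mlflow.xgboost loader can vary by version, so treat this as an assumption rather than the notebook's own code:

import mlflow.pyfunc
import pandas as pd

loaded_model = mlflow.pyfunc.load_model(MODEL_URI)
preds = loaded_model.predict(pd.DataFrame(data))  # 'data' comes from the training cell above
print(preds)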
print(pca2.explained_variance_)
X_pca = pca2.transform(X_scaled)

pca_train = pd.DataFrame(data=X_pca,
                         columns=[
                             'principal component 1', 'principal component 2',
                             'principal component 3'
                         ])
print('Explained variation per principal component: {}'.format(
    pca2.explained_variance_ratio_))
#%%
X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                    y,
                                                    test_size=.2,
                                                    random_state=42)
#%%
xg_without_tuning = xg.XGBRFRegressor(objective='reg:linear',  # deprecated alias of 'reg:squarederror'
                                      n_estimators=10,
                                      seed=123)
xg_without_tuning.fit(X_train, y_train)
pred = xg_without_tuning.predict(X_test)
MAE = (mean_absolute_error(y_test, pred))
print("MAE: %f" % (MAE))

DM_train = xg.DMatrix(X_train, y_train)
DM_test = xg.DMatrix(X_test, y_test)
params = {'booster': 'gblinear', "objective": "reg:linear"}
xg_reg = xg.train(params=params, dtrain=DM_train, num_boost_round=5)
pred1 = xg_reg.predict(DM_test)
MAE1 = (mean_absolute_error(y_test, pred1))
print("RMSE1: %f" % MAE1)
Example #14
ames_X_test.iloc[[0]]

ames_df.loc[[2661]]

shap.summary_plot(vals, ames_X_test)

shap.dependence_plot('Overall Qual', shap_values=vals, features=ames_X_test)

# ## XGBoost
# Powerful algorithm using "boosting" (like golfing) to predict target

dt = tree.DecisionTreeRegressor(max_depth=10)
dt.fit(auto_X_train, auto_y_train)
dt.score(auto_X_test, auto_y_test)

xg = xgb.XGBRFRegressor()
xg.fit(auto_X_train, auto_y_train)
xg.score(auto_X_test, auto_y_test)

xg

xgb.plot_importance(xg)

booster = xg.get_booster()
print(booster.get_dump()[0])

booster = xg.get_booster()
print(booster.get_dump()[1])

booster = xg.get_booster()
print(booster.get_dump()[-1])
Example #15
from sklearn import ensemble
import xgboost as xgb
from sklearn import linear_model

MODELS = {
    "randomforest_classifier":
    ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=2),
    "randomforest_regressor":
    ensemble.RandomForestRegressor(n_estimators=200, n_jobs=-1, verbose=2),
    "xgb_classifier":
    xgb.XGBRFClassifier(
        learning_rate=1,
        subsample=0.9,
    ),
    "xgb_regressor":
    xgb.XGBRFRegressor(learning_rate=1, subsample=0.9),
    "logistic_regressor":
    linear_model.LogisticRegression(
        penalty='elasticnet',
        l1_ratio=0.5,  # l1_ratio is required when penalty='elasticnet' with the saga solver (value is an assumption)
        fit_intercept=True,
        class_weight='balanced',
        random_state=42,
        solver='saga',
        verbose=2,
        n_jobs=-1,
    )

    #TODO: add more models here
}
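A hedged usage sketch of this registry, with synthetic data standing in for the project's real training set:

from sklearn.datasets import make_regression

X, y = make_regression(n_samples=300, n_features=8, random_state=0)
model = MODELS["xgb_regressor"]   # look the estimator up by key
model.fit(X, y)
print(model.predict(X[:5]))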
Example #16
y_data = pca.transform(y_data)
print(y_data.shape)

x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                    y_data,
                                                    train_size=0.8,
                                                    random_state=33)

print(y_test.shape)
print(y_train.shape)

# model = DecisionTreeRegressor(max_depth =4)                     # above a certain max_depth it stops separating the data well
# model = RandomForestRegressor(n_estimators = 200, max_depth=3)
# model = GradientBoostingRegressor()

model = xgb.XGBRFRegressor(eta=0.1, max_depth=5, colsample_bytree=0.5)
model.fit(x_train, y_train)
y_testpred = model.predict(x_test)
# y_test = pca.inverse_transform(y_test)
# y_testpred = pca.inverse_transform(y_testpred)
score = model.score(x_test, y_test)
print(score)
y4 = model.predict(x_prdeict)
print(y4.shape)
y4 = y4.reshape(y4.shape[0], 1)
y4 = pca.inverse_transform(y4)
y4 = scaler.inverse_transform(y4)
print(y_testpred.shape)
print(y4.shape)

# def tree_fit(y_train, y_test):


# ### Predictions 

# In[10]:

file_1 = "../data/Test.csv"
file_2 = "../data/additional_data/testRoot_edited.csv"

processor = DataProcessor(file_1, file_2, test = True, minimal = True)
x_test = processor.get_numpy_data(fillna = True, additional = True,
                                  encode = True, np_split = False, enocde_user = False,
                                  normalize = True, drop_ones = False)

#print(x_test.head())
# In[6]:

param_dist = {'objective':'reg:squarederror', 'n_estimators':1300, 'max_depth':9, 'min_child_weight': 49}
bst = xgb.XGBRFRegressor(**param_dist)
bst.fit(x_train, y_train.ravel(), eval_set=[(x_valid, y_valid)], verbose = True)

pr = bst.predict(x_test)
            
print(pr)

# In[ ]:

#test

Example #18
df_test.head()

x = x.drop(['scaled_amount','scaled_time'],axis=1)
df_test = df_test.drop(['Time','Amount'],axis=1)

x

x.shape

y.shape

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x, y)
clf.feature_importances_

xb = xgb.XGBRFRegressor()
xb.fit(x, y)
xb.feature_importances_

def roc_curve_plots(y_test,y_predict_wrf,X_test,model):
    print(classification_report(y_test,y_predict_wrf),"\n")
    neigh_prob_linear=model.predict_proba(X_test)
    neigh_prob_linear1=neigh_prob_linear[:,1]
    fpr,tpr,thresh=roc_curve(y_test,neigh_prob_linear1)
    roc_auc_neigh=auc(fpr,tpr)

    plt.figure(dpi=80)
    plt.title("ROC Curve")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.plot(fpr,tpr,'b',label='AUC Score = %0.2f'%roc_auc_neigh)
test_id_idx = test_df.index

print('X_train : ', len(X_train))
print('X_val : ', len(X_val))
print('X_test : ', len(X_test))

# Train with the XGBoost model that is popular on Kaggle
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

param = {
    'n_estimators': range(550, 700, 50),
    'colsample_bytree': [0.5, 0.7, 1],
    'colsample_bylevel': [0.5, 0.7, 1],
}
model = xgb.XGBRFRegressor()
grid_search = GridSearchCV(estimator=model,
                           param_grid=param,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

# Use the MSE metric for validation
from sklearn.metrics import mean_squared_error, mean_absolute_error
pred_train = grid_search.predict(X_train)
pred_val = grid_search.predict(X_val)
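The excerpt stops before the metric is actually computed; a minimal continuation of the comment above, assuming y_val holds the validation targets, could be:

mse_train = mean_squared_error(y_train, pred_train)
mse_val = mean_squared_error(y_val, pred_val)
mae_val = mean_absolute_error(y_val, pred_val)
print('train MSE: {:.4f}, val MSE: {:.4f}, val MAE: {:.4f}'.format(mse_train, mse_val, mae_val))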
Example #20
x, y = run_it('Final_Data.csv', 7000000, 'CV')

#%%
# xgb_reg_model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=1,
#                                 learning_rate=0.4, max_depth=20,
#                                 alpha=10, n_estimators=20)

n_est = 20
max_depth = 20
alpha = 11
learning_rate = 0.4

xgb_reg_model = xgb.XGBRFRegressor(objective='reg:squarederror',
                                   colsample_bytree=1,
                                   min_child_weight=2,
                                   max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   tree_method='hist',
                                   n_estimators=n_est,
                                   alpha=alpha)

kfold = KFold(n_splits=5, shuffle=True, random_state=4)
kfold_scores = cross_val_score(xgb_reg_model,
                               x,
                               y,
                               scoring='neg_mean_squared_log_error',
                               cv=kfold)
kfold_scores = np.absolute(kfold_scores)
print(np.sqrt(kfold_scores.mean()))
# print("Beginning to Train the Model")
# start = time.time()
# xgb_reg_model.fit(x_train_pp, y_train)
Example #21
    def optuna_tuner(self, lower_bound = .80 , upper_bound = 1.20 ):
        warnings.filterwarnings('ignore')
        epoch = self.epoch
        n_trials = self.n_trials
        params = self.params
        x_train = self.x_train
        x_test = self.x_test
        y_train = self.y_train
        y_test = self.y_test
        forrest = self.forrest

        '''
        Method summary: this method uses Optuna's parameter tuning recursively to tune an XGBoost regression model's
        hyperparameters. Optuna is more efficient than sklearn's random grid search because it prunes trials that are
        not promising and spends more of its processing power on promising param combinations. This means that Optuna
        improves faster than sklearn's random or grid search.
        epoch: this parameter controls how many times the tuner will cycle through a random search. Keep in mind that every epoch
                the parameters used are narrowed down by the random search run during the last epoch
        n_trials: this parameter dictates how many trials will be run each time a random search is called. Cross-validation is set to 5,
                    so if n_trials is set to 20 each random search will actually run through 100 fits
        params: this parameter is the hyperparameter grid you want to initially feed into the tuner.
                An example parameter grid for the xgboost regression looks like this.
                params = {
                    'objective' : ['reg:gamma'], 
                    'n_estimators' : range(50, 130,10), #500
                    'max_depth' : range(2,25),
                    'tree_method' :  ['auto', 'exact','approx', 'hist'],
                    'booster' :  ['gbtree', 'gblinear', 'dart'],
                    'sampling_method' : ['gradient_based'],
                    'reg_alpha' : [.05,.1,.15,.20,.25,.30],
                    'reg_lambda' : [0,.2,.4,.6,.8,1],
                    'learning_rate' : [.05,.08,.1,.15,.20],
                    'gamma' : [ 0.0, 0.1, 0.2],
                    'min_child_weight' : [ 1, 3, 5, 7],
                    'colsample_bytree': list(float_range(decimal.Decimal(0), decimal.Decimal(1), '0.01')),
                    'colsample_bylevel':list(float_range(decimal.Decimal(0), decimal.Decimal(1), '0.01')),
                    'colsample_bynode': list(float_range(decimal.Decimal(0), decimal.Decimal(1), '0.01')),
                    'importance_type' : ['gain', 'weight', 'cover', 'total_gain','total_cover']}

        x_train: The train features
        x_test: the test features
        y_train: the train targets
        y_test: the test targets



        Use this syntax to access the pandas data frame after tuning the model with either sklearn or optuna:
                XGParis(**kwargs).best_params
        '''

        top_params = []
        all_params = pd.DataFrame(columns = ('scores', 'params'))
        def objective (trial: Trial, param_dic = params):
            new_params = {}
            for item in (param_dic):
                new_params[str(item)] = trial.suggest_categorical(str(item),list(param_dic[str(item)]))

            if forrest == False:
                xgb_r = xg.XGBRegressor(**new_params)
            else:
                xgb_r = xg.XGBRFRegressor(**new_params)
                
            xgb_r.fit(x_train,y_train)
            score = model_selection.cross_val_score(xgb_r, x_train, y_train, n_jobs=-1, cv=5)
            accuracy = score.mean()
            return accuracy

        study = optuna.create_study(direction='maximize',sampler=TPESampler())
        study.optimize(lambda trial : objective(trial),n_trials= n_trials)

        for item in study.trials:
            all_params.loc[len(all_params)] = (item.value, item.params)

            
        param_dic_random = study.best_trial.params
        top_params.append([study.best_trial.value,study.best_trial.params])
        
        counter = 2
        def repeater(epoch,counter, param_dic):
            def objective_2 (trial: Trial, param_dic = param_dic):
                new_params = {}
                for item in (param_dic):
                    if type(param_dic[str(item)]) == int:
                        new_params[str(item)] = trial.suggest_int(str(item), param_dic[str(item)]*lower_bound, 
                                                                             param_dic[str(item)]*upper_bound)
                    elif type(param_dic[str(item)]) == float:
                        if item == 'colsample_bytree' or item == 'colsample_bylevel' or item == 'colsample_bynode' :
                            if param_dic[str(item)]*1.25 >= 1:
                                new_params[str(item)] = trial.suggest_float(str(item), param_dic[str(item)]*lower_bound, 1)
                            else:
                                new_params[str(item)] = trial.suggest_float(str(item), param_dic[str(item)]*lower_bound, 
                                                                                     param_dic[str(item)]*upper_bound)
                        else:
                            new_params[str(item)] = trial.suggest_float(str(item), param_dic[str(item)]*lower_bound, 
                                                                                     param_dic[str(item)]*upper_bound)
                    elif type(param_dic[str(item)]) == str:
                        new_params[str(item)] = trial.suggest_categorical(str(item), [(param_dic[str(item)])])

                    else:
                        print('error, skipped ' + str(item))
                        continue 
                #print(new_params)  
                if forrest == False:
                    xgb_r = xg.XGBRegressor(**new_params)
                else:
                    xgb_r = xg.XGBRFRegressor(**new_params)
                    
                xgb_r.fit(x_train,y_train)
                score = model_selection.cross_val_score(xgb_r, x_train, y_train, n_jobs=-1, cv=5)
                accuracy = score.mean()
                return accuracy

            study = optuna.create_study(direction='maximize',sampler=TPESampler())
            study.optimize(lambda trial : objective_2(trial),n_trials= n_trials)

            for item in study.trials:
                all_params.loc[len(all_params)] = (item.value, item.params)
            
            
            #print(study)
            if counter >= epoch:
                counter += 1
                #print('epoch - ' + str(counter) + ' Done')
                return study.best_trial.params
            else:
                counter += 1
                #print('epoch - ' + str(counter))
                top_params.append([study.best_trial.value,study.best_trial.params])
                return repeater(epoch,counter,study.best_trial.params)

        final_param = repeater(epoch,counter,param_dic_random)
        scores = []
        params_list = []
        for item in top_params:
            scores.append(item[0])
            params_list.append(item[1])
        top = pd.DataFrame(columns= ['scores','params'])
        top['scores'] = scores
        top['params'] = params_list

        if forrest == False:
            xgb_r = xg.XGBRegressor(seed = 123)
        else:
            xgb_r = xg.XGBRFRegressor(seed = 123)

        param_dic = top['params'][list(top['scores']).index(max(list(top['scores'])))]
        new_params = {}
        for item in param_dic:
            if type(param_dic[str(item)]) == str:
                new_params[str(item)] = params[str(item)]
            elif type(param_dic[str(item)]) == int:            
                new_params[str(item)] = [(param_dic[str(item)])]               
            elif type(param_dic[str(item)]) == float:
                new_params[str(item)] = [(param_dic[str(item)])]

        print(new_params)
            
        xgb_grid = GridSearchCV(estimator =xgb_r, param_grid = new_params, cv = 5, verbose=2, n_jobs = -1)
        xgb_grid.fit(x_train, y_train)

        top_params.append([(xgb_grid.best_score_),(xgb_grid.best_params_)])

        self.best_grid = top['params'][list(top['scores']).index(max(list(top['scores'])))]
        self.best_params = top
        self.all_params = all_params 
        self.optuna = True
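A self-contained sketch of the core Optuna loop used above, stripped of the recursion and class state; the synthetic data and the small search space are assumptions:

import optuna
import xgboost as xg
from optuna.samplers import TPESampler
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=400, n_features=10, random_state=0)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 150]),
        'max_depth': trial.suggest_categorical('max_depth', [3, 5, 8]),
        'colsample_bynode': trial.suggest_categorical('colsample_bynode', [0.5, 0.8, 1.0]),
    }
    model = xg.XGBRFRegressor(**params)
    # the mean cross-validated R^2 is the value Optuna maximizes
    return cross_val_score(model, X, y, cv=5, n_jobs=-1).mean()

study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=20)
print(study.best_trial.value, study.best_trial.params)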
Example #22
#### restore Height prediction model (input : "Sex","age", model : XGBRegressor)
#### if Height is null, use a model to estimate the missing value
height_model = xgboost.XGBRegressor(max_depth=10, learning_rate=1, n_estimators=100)
height_model.load_model(os.path.join(RESULT,"XGBRegressor_Height.model"))

with open(os.path.join(RESULT,"scaler_Height.pkl"),'rb') as f:
    scaler = pickle.load(f)

#### fill null
data = scaler.transform(df.loc[df.Height.isnull(),["Sex","age"]])
pred = height_model.predict(data)
df.loc[df.Height.isnull(),"Height"] = (pred*patient_info['Height']['std'])+patient_info['Height']['mean']

#### restore Weight prediction model (input : "PatientHeight","PatientSex","age", model : XGBRFRegressor)
#### if Weight is null, use a model to estimate the missing value
weight_model = xgboost.XGBRFRegressor(max_depth=10, learning_rate=1, n_estimators=300)
weight_model.load_model(os.path.join(RESULT,"XGBRFRegressor_Weight.model"))

with open(os.path.join(RESULT,"scaler_Weight.pkl"),'rb') as f:
    scaler = pickle.load(f)

#### fill null
data = scaler.transform(df.loc[df.Weight.isnull(),["Height","PatientSex","age"]])
pred = weight_model.predict(data)
df.loc[df.Weight.isnull(),"Weight"] = (pred*patient_info['weight']['std'])+patient_info['weight']['mean']

df = df[df.age <= 192]
org_test = org_test[org_test.age <= 192]


df = labeling(df)
Example #23
File: test.py Project: vic7894/vic
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
#%%
scores = make_scorer(mean_squared_error)
models = [
    RandomForestRegressor(n_estimators=200,
                          max_depth=3,
                          verbose=2,
                          random_state=42),
    GradientBoostingRegressor(random_state=42),
    lgb.LGBMRegressor(random_state=42),
    xgb.XGBRFRegressor(random_state=42)
]
model_mean = []
model_std = []
#%%
for i in models:
    cross_score = cross_val_score(i,
                                  X_clean,
                                  y,
                                  scoring=scores,
                                  n_jobs=-1,
                                  cv=4)
    cross_score = np.sqrt(cross_score)
    model_mean.append(np.mean(cross_score))
    model_std.append(np.std(cross_score))
model_results = pd.DataFrame({
from sklearn import tree
from sklearn import ensemble
from sklearn import linear_model
import xgboost as xgb

models = {
    # 'gini'/'entropy' are classification criteria; DecisionTreeRegressor expects a regression criterion
    "decision_tree_gini": tree.DecisionTreeRegressor(criterion='squared_error'),
    "decision_tree_entropy": tree.DecisionTreeRegressor(criterion="absolute_error"),
    "rf": ensemble.RandomForestRegressor(),
    "Linres": linear_model.LinearRegression(),
    "xgb_rf_reg": xgb.XGBRFRegressor(),
    "xgb_reg": xgb.XGBRegressor()
}
Example #25
    def make_parameter_graph(self, parameter, test_range):
        '''
        This method makes a parameter graph, which keeps all other parameters constant as the
        chosen parameter varies. The param grid used is the best_param grid from the tuning process.

        parameter : this param is the parameter you would like a graph of;
                    the graph is created using matplotlib, with the
                    chosen parameter on the x axis and the R^2 on the y axis

        test_range: this param is a list of all of the values you would like tested for your graph,
        e.g. list(range(0,10)) or [1,2,3,4,5,6,7,8,9] or [3,6,9]
        '''
        x_train = self.x_train
        x_test = self.x_test
        y_train = self.y_train
        y_test = self.y_test
        forrest = self.forrest
        params = self.params
        best_param = self.best_grid
        features = self.features 
        targets = self.targets
        split = self.split

        nums = []
        x = []
        for num in test_range:
            best_param[parameter] = num
            
            try:
                x_train, x_test, y_train, y_test = train_test_split (features, targets, test_size=split,
                                                                     random_state = 24)
                if forrest == False:
                    xgb_r = xg.XGBRegressor(**best_param)
                else:
                    xgb_r = xg.XGBRFRegressor(**best_param)

                # Fitting the model 
                xgb_r.fit(x_train, y_train) 

                # Predict the model 
                pred = xgb_r.predict(x_test) 

                r2  = metrics.r2_score(y_test.values, pred)
                x.append(num)
                nums.append(r2)
                #print(num)

            except Exception as e:
                print(e)
                continue

            

                
        plt.plot(x,nums)
        plt.xlabel(parameter)
        plt.ylabel('R^2')
        
        plt.show()
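The same parameter-versus-score curve can also be sketched with scikit-learn's validation_curve helper; the synthetic data and the max_depth range below are assumptions:

import matplotlib.pyplot as plt
import numpy as np
import xgboost as xg
from sklearn.datasets import make_regression
from sklearn.model_selection import validation_curve

X, y = make_regression(n_samples=400, n_features=10, random_state=0)
param_range = [2, 4, 6, 8, 10]
# cross-validated train/test R^2 for each candidate max_depth
train_scores, test_scores = validation_curve(
    xg.XGBRFRegressor(n_estimators=50), X, y,
    param_name='max_depth', param_range=param_range, cv=5)

plt.plot(param_range, np.mean(test_scores, axis=1))
plt.xlabel('max_depth')
plt.ylabel('R^2')
plt.show()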
Example #26
model.fit(train_x_pca, train_y)
light_pre = model.predict(test_x_pca)
mae(light_pre, test_y)   # 1.5838890402706525  # worse than the previous MAE

model.feature_importances_



#####
import xgboost as xgb


lgb.LGBMRegressor(params)  # check the parameter values

# note: most of these are LightGBM-style parameters (typos such as 'learning-rate',
# 'nem_leaves', 'feature_fracton' and 'bagginf_*' fixed below); XGBRFRegressor will
# warn about and ignore the ones it does not recognize
params1 = {'learning_rate' : 0.01,
           'max_depth' : 20,
           'boosting_type' : 'gbdt',
           'objective' : 'reg:linear',   # deprecated alias of 'reg:squarederror'
           'metric' : 'mae',
           'is_training_metric' : True,
           'num_leaves' : 144,
           'feature_fraction' : 0.9,
           'bagging_fraction' : 0.7,
           'bagging_freq' : 5,
           'seed' : 2020}

model = MultiOutputRegressor(xgb.XGBRFRegressor(**params1, random_state=0), n_jobs = -1)
model.fit(train_x, train_y)
preds = model.predict(test_x)
mae(preds, test_y)  # 1.211640226204764

Example #27
def test_xgb_base_module(root_client: sy.VirtualMachineClient) -> None:

    sy.load("xgboost")
    sy.load("numpy")

    # third party
    import numpy as np
    import xgboost as xgb

    xgb_remote = root_client.xgboost

    # import xgboost as xgb

    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([0, 0, 1, 1])

    param = {"eta": 0.3, "max_depth": 3, "num_class": 3}

    steps = 20

    D_train = xgb.DMatrix(X, label=y)
    model = xgb.train(param, D_train, steps)
    preds = model.predict(D_train)

    D_train = xgb_remote.DMatrix(X, label=y)
    model = xgb_remote.train(param, D_train, steps)
    preds_remote = model.predict(D_train).get()

    classifier = xgb_remote.XGBClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier_remote = classifier.predict(X).get()

    classifier = xgb.XGBClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier = classifier.predict(X)

    classifier = xgb_remote.XGBRFClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier_rf_remote = classifier.predict(X).get()

    classifier = xgb.XGBRFClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier_rf = classifier.predict(X)

    regressor = xgb.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
    regressor.fit(X, y)
    y_pred_regressor = regressor.predict(X)

    regressor = xgb_remote.XGBRegressor(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3
    )
    regressor.fit(X, y)
    y_pred_regressor_remote = regressor.predict(X).get()

    regressor = xgb.XGBRFRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
    regressor.fit(X, y)
    y_pred_regressor_rf = regressor.predict(X)

    regressor = xgb_remote.XGBRFRegressor(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3
    )
    regressor.fit(X, y)
    y_pred_regressor_rf_remote = regressor.predict(X).get()

    assert np.array_equal(y_pred_classifier_rf, y_pred_classifier_rf_remote)
    assert np.array_equal(y_pred_regressor_rf, y_pred_regressor_rf_remote)
    assert np.array_equal(y_pred_regressor, y_pred_regressor_remote)
    assert np.array_equal(y_pred_classifier, y_pred_classifier_remote)
    assert np.array_equal(preds_remote, preds)
Example #28
    def sklearn_tuner(self,lower_bound = .90, upper_bound = 1.10):
        warnings.filterwarnings('ignore')
        epoch = self.epoch
        n_trials = self.n_trials
        params = self.params
        x_train = self.x_train
        x_test = self.x_test
        y_train = self.y_train
        y_test = self.y_test
        forrest = self.forrest

        '''
        Method summary: this method uses sklearn's random search grid recursively to tune an XGBoost regression model's
        hyperparameters. Sklearn's random search tends to crash if epoch and n_trials are too high.
        epoch: this parameter controls how many times the tuner will cycle through a random search. Keep in mind that every epoch
                the parameters used are narrowed down by the random search run during the last epoch
        n_trials: this parameter dictates how many trials will be run each time a random search is called. Cross-validation is set to 5,
                    so if n_trials is set to 20 each random search will actually run through 100 fits
        params: this parameter is the hyperparameter grid you want to initially feed into the tuner.
                An example parameter grid for the xgboost regression looks like this.
                params = {
                        'objective' : ['reg:gamma'], 
                        'n_estimators' : range(50, 130,10), #500
                        'max_depth' : range(2,25),
                        'tree_method' :  ['auto', 'exact','approx', 'hist'],
                        'booster' :  ['gbtree', 'gblinear', 'dart'],
                        'sampling_method' : ['gradient_based'],
                        'reg_alpha' : [.05,.1,.15,.20,.25,.30],
                        'reg_lambda' : [0,.2,.4,.6,.8,1],
                        'learning_rate' : [.05,.08,.1,.15,.20],
                        'gamma' : [ 0.0, 0.1, 0.2],
                        'min_child_weight' : [ 1, 3, 5, 7],
                        'colsample_bytree' : [0,.2,0.3, 0.4,.6,.8,1],
                        #'colsample_bylevel':[0,.2,0.3, 0.4,.6,.8,1],
                        #'colsample_bynode': [0,.2,0.3, 0.4,.6,.8,1],
                        'importance_type' : ['gain', 'weight', 'cover', 'total_gain','total_cover']}

        features : model features (x)
        targets : model target (y)

        forrest: Boolean. If you would like the model you are training to be an XGBoost regression tree, keep forrest = False (default).
        If you want to train and tune a random forest, change forrest to True. Default False

        upper_bound: every epoch the tuner takes the last epoch's best params and creates a new param grid with the chosen param*lower_bound as the lowest
        number in the new grid, and chosen param*upper_bound as the highest number in the new param grid. Default is 1.10

        lower_bound: every epoch the tuner takes the last epoch's best params and creates a new param grid with the chosen param*lower_bound as the lowest
        number in the new grid, and chosen param*upper_bound as the highest number in the new param grid. Default is .90

        split: train/test split. .20 is the same as an 80/20 split. Default

        Use this syntax to access the pandas data frame after tuning the model with either sklearn or optuna:
        XGParis(**kwargs).best_params
        '''

        def float_range(start, stop, step):
            while start < stop:
                yield float(start)
                start += decimal.Decimal(step)
            
        top_params = []
        if forrest == False:
            xgb_r = xg.XGBRegressor(seed = 123)
        else:
            xgb_r = xg.XGBRFRegressor(seed = 123) 
            
        xgb_random = RandomizedSearchCV(estimator =xgb_r, param_distributions = params, n_iter = n_trials, cv = 5, verbose=2, random_state=42, n_jobs = -1)
        xgb_random.fit(x_train, y_train)

        param_random = (xgb_random.best_params_)
        top_params.append([(xgb_random.best_score_),(xgb_random.best_params_)])
        #print(param_random )
        counter = 2
        def repeater(epoch, counter, param_dic):
            new_params = {}
            for item in (param_dic):
                #print(item)
                if type(param_dic[str(item)]) == int:            
                    new_params[str(item)] = list(range(round(param_dic[str(item)]*lower_bound), round(param_dic[str(item)]*upper_bound)))
                    if new_params[str(item)] == []:
                        new_params[str(item)] =  [param_dic[str(item)]]                
                elif type(param_dic[str(item)]) == float:
                    if item == 'colsample_bytree' or item == 'colsample_bylevel' or item == 'colsample_bynode' :
                        if param_dic[str(item)]*1.10 >= 1:
                            new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*(upper_bound-lower_bound)), 1, '0.01')))
                        else:
                            new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*lower_bound), 
                                                                      decimal.Decimal(param_dic[str(item)]*upper_bound), '0.01')))
                    elif param_dic[str(item)] == 0.0:                    
                        new_params[str(item)] = (list(float_range(0, decimal.Decimal(param_dic[str(item)]*(upper_bound-lower_bound)), '0.01')))
                        if new_params[str(item)] == []:
                            new_params[str(item)] =  [param_dic[str(item)]] 
                            
                    elif item == 'learning_rate':
                        new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*lower_bound), 
                                                                  decimal.Decimal(param_dic[str(item)]*upper_bound), '0.001')))
                        
                    else:
                        new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*lower_bound), 
                                                                  decimal.Decimal(param_dic[str(item)]*upper_bound), '0.01')))
                elif type(param_dic[str(item)]) == str:
                    new_params[str(item)] = [(param_dic[str(item)])]

                else:
                    #print('error, skipped ' + str(item))
                    continue
            #print(new_params)       

            if forrest == False:
                xgb_r = xg.XGBRegressor(seed = 123)
            else:
                xgb_r = xg.XGBRFRegressor(seed = 123)
                
            xgb_random = RandomizedSearchCV(estimator =xgb_r, param_distributions = new_params, n_iter = n_trials, cv = 5, verbose=2, random_state=42, n_jobs = -1)
            xgb_random.fit(x_train, y_train)

            
            if counter >= epoch:
                counter += 1
                #print('epoch - ' + str(counter) + ' Done')
                return xgb_random.best_params_
            else:
                counter += 1
                #print('epoch - ' + str(counter))
                top_params.append([(xgb_random.best_score_),(xgb_random.best_params_)])
                #print(xgb_random.best_score_)
                return repeater(epoch,counter,(xgb_random.best_params_))

       

        final_param = repeater(epoch,counter,param_random)
        scores = []
        params_list = []
        for item in top_params:
            scores.append(item[0])
            params_list.append(item[1])
        top = pd.DataFrame(columns= ['scores','params'])
        top['scores'] = scores
        top['params'] = params_list


        if forrest == False:
            xgb_r = xg.XGBRegressor(seed = 123)
        else:
            xgb_r = xg.XGBRFRegressor(seed = 123)

        param_dic = top['params'][list(top['scores']).index(max(list(top['scores'])))]
        new_params = {}
        for item in param_dic:
            if type(param_dic[str(item)]) == str:
                new_params[str(item)] = params[str(item)]
            elif type(param_dic[str(item)]) == int:            
                new_params[str(item)] = [(param_dic[str(item)])]               
            elif type(param_dic[str(item)]) == float:
                new_params[str(item)] = [(param_dic[str(item)])]
            
        # GridSearchCV takes param_grid and has no n_iter/random_state arguments (compare the optuna_tuner version)
        xgb_grid = GridSearchCV(estimator=xgb_r, param_grid=new_params, cv=5, verbose=2, n_jobs=-1)
        xgb_grid.fit(x_train, y_train)

        top_params.append([(xgb_grid.best_score_),(xgb_grid.best_params_)])

        self.best_grid = top['params'][list(top['scores']).index(max(list(top['scores'])))]
        self.best_params = top
        self.optuna = False
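A compact standalone sketch of the random-search step this tuner repeats each epoch; the synthetic data and the small grid are assumptions:

import xgboost as xg
from sklearn.datasets import make_regression
from sklearn.model_selection import RandomizedSearchCV

X, y = make_regression(n_samples=400, n_features=10, random_state=0)
param_distributions = {
    'n_estimators': range(50, 130, 10),
    'max_depth': range(2, 10),
    'colsample_bynode': [0.5, 0.8, 1.0],
}
# sample a handful of combinations and keep the best cross-validated one
search = RandomizedSearchCV(xg.XGBRFRegressor(random_state=123), param_distributions,
                            n_iter=10, cv=5, random_state=42, n_jobs=-1)
search.fit(X, y)
print(search.best_score_, search.best_params_)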
feature_scaler = StandardScaler()
feature_df[feat_column_name_list] = feature_scaler.fit_transform(
    feature_df[feat_column_name_list])
test_x[test_column_name_list] = feature_scaler.transform(
    test_x[test_column_name_list])

# Initialize models

clf_line = LinearRegression()
clf_ridg = Ridge(alpha=300, tol=1e-05, solver='sparse_cg', max_iter=5000)
clf_laso = Lasso(alpha=0.1, tol=1e-05, max_iter=5000)
clf_lala = LassoLars(alpha=0.001, max_iter=5000)
clf_enet = ElasticNet(alpha=0.1, tol=0.001, l1_ratio=0.2, max_iter=5000)

clf_xgbr = xgb.XGBRegressor()  # not yet
clf_xgrf = xgb.XGBRFRegressor()  # not yet

clf_rf = RandomForestRegressor(criterion='mae',
                               max_features='sqrt',
                               n_estimators=200,
                               max_depth=10)
clf_tree = ExtraTreesRegressor(criterion='mae',
                               max_features='sqrt',
                               n_estimators=200,
                               max_depth=10)
clf_ada = AdaBoostRegressor(n_estimators=3, loss='linear')
clf_grad = GradientBoostingRegressor()  # not yet
clf_svr = SVR(kernel='rbf', C=0.1)

base_model_name = [
    'LinearReg', 'Ridge', 'Lasso', 'LassoLars', 'ElasticNet', 'XgbReg',
Example #30
#Random Forest
print("Random Forest")
print(
    run_model(
        RandomForestRegressor(max_depth=5, n_estimators=50, random_state=0),
        cat_feats))

#XGBoost
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    #'learning_rate': 0.1,
    'seed': 0
}

model = xgb.XGBRFRegressor(**xgb_params)
print("XGBoost")
print(run_model(model, cat_feats))

#most influential features
X = df[cat_feats].values
Y = df['price_value'].values
m = xgb.XGBRFRegressor(max_depth=5, n_estimators=50, learning_rate=0.1, seed=0)
m.fit(X, Y)

imp = PermutationImportance(m, random_state=0).fit(X, Y)
print(eli5.show_weights(imp, feature_names=cat_feats).data)

#from above code come out the most infuential features:
feats = [
    'param_napęd__cat', 'param_skrzynia-biegów__cat', 'param_faktura-vat__cat',