def XGB(self, x_train, y_train, x_test, y_test):
    x_train, y_train = shuffle(x_train, y_train)
    xgb = XGBRegressor(max_depth=4, subsample=0.9)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test).reshape(x_test.shape[0], 1)
    loss = mean_squared_error(y_test, y_pred)
    print(loss)
    return y_pred, loss
    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
                       objective=self.objective,
                       learning_rate=self.learning_rate,
                       min_child_weight=self.min_child_weight,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       max_depth=self.max_depth,
                       n_estimators=self.n_estimators,
                       nthread=self.nthread,
                       missing=0.0,
                       seed=self.seed)
        from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
#                       basinhopping=True,
                       initial_params=self.initial_params,
                       minimizer=self.minimizer,
                       scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self
Example #3
def Stacking(real_train_tar):
    predictions_train = pd.DataFrame([np.expm1(y_lasso_predict), np.expm1(y_ridge_predict), np.expm1(y_rf_predict), np.expm1(y_xgb_predict)]).T
    sns.pairplot(predictions_train)
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
    # Minimum sum of instance weights required in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    stack_model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_stack.fit(predictions_train, real_train_tar)
    xgb_stack.best_params_
    write_pkl(xgb_stack.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl')
    
    model_stacking = XGBRegressor(**xgb_stack.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_stacking.fit(predictions_train,real_train_tar)
    end=time.time()
    print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar),np.log1p( model_stacking.predict(predictions_train))))
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    y_stack_predict=model_stacking.predict(predictions_train)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,y_stack_predict)
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
Example #4
class HousePricePredictor(BaseModel):
    def __init__(self):
        self.model = XGBRegressor()

    def predict(self, X):
        X = self._prepare_data(X)
        return self.model.predict(X)

    def _prepare_data(self, X):
        return pd.DataFrame(X, columns=FEATURES)

    def fit(self, X, y):
        model = XGBRegressor()
        clf = GridSearchCV(
            model,
            {
                'max_depth': [6, ],
                'learning_rate': [0.05, ],
                'n_estimators': [450, 470, 475, 480, 485, ]
            },
            n_jobs=4,
            cv=3,
            verbose=1
        )
        clf.fit(X, y)
        logging.info("Best Score: {}".format(clf.best_score_))
        logging.info("Best Params: {}".format(clf.best_params_))
        self.model = clf.best_estimator_

        return self.model

    def dump(self, path):
        self.model.save_model(path)

    @classmethod
    def load(cls, path):
        house_model = HousePricePredictor()
        house_model.model.load_model(path)

        return house_model
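
# --- Added usage sketch (hedged, not from the original source): FEATURES, BaseModel and the
# training data come from the surrounding project; this only illustrates how the class above
# is typically driven. ---
def train_and_persist(X_train, y_train, path="house_price.model"):
    # Fit via the internal grid search, persist the best booster, and return a reloaded predictor.
    predictor = HousePricePredictor()
    predictor.fit(X_train, y_train)
    predictor.dump(path)
    return HousePricePredictor.load(path)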
Example #5
        cur_score = get_score(clf, X_test, Y_test, features_to_keep)
        print('Cur score:', cur_score)
    features_to_keep_folds.append(save_if_good)
    print('-' * 30)
selected_features = set.intersection(*[set(i) for i in features_to_keep_folds])
print(len(selected_features))

print('TUNING HYPERPARAMS...')
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)
params = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 300, 500],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.5, 1]
}
grid = GridSearchCV(XGBRegressor(seed=0),
                    params,
                    cv=5,
                    scoring=rmsle_scorer,
                    verbose=5)
grid.fit(X[list(selected_features)], Y)
means = grid.cv_results_['mean_test_score']

stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.6f (+/-%0.03f) for %r" % (mean, std * 2, params))
print('Best Params:', grid.best_params_)
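
# The rmsle metric handed to make_scorer above is not defined in this excerpt. A common
# definition (an assumption here, not necessarily the author's exact implementation):
import numpy as np

def rmsle(y_true, y_pred):
    # Root mean squared logarithmic error; log1p keeps zero-valued targets well-defined.
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))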

# In[80]:

#tuning ridge regression hyperparameters
model2.fit(train[col], np.log1p(train['visitors'].values))
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                                                model1.predict(train[col])))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                                          model2.predict(train[col])))
#test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2
test['visitors'] = model2.predict(test[col])
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
#del train; del data;

sub1[['id', 'visitors']].to_csv(os.path.join(path_kaggle, 'naive_forecast2.csv'),
                                index = False)

from xgboost import XGBRegressor
model3 = XGBRegressor()
model3.fit(train[col], np.log1p(train['visitors'].values), verbose=False)
print('XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                              model3.predict(train[col])))

## from hklee
## https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code
#dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
#    pd.read_csv(fn)for fn in glob.glob('../input/*.csv')}
#
#for k, v in dfs.items(): locals()[k] = v
#
#wkend_holidays = date_info.apply(
#    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
#date_info.loc[wkend_holidays, 'holiday_flg'] = 0
#date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  
def models():

    extra_params_kaggle_cla = {
        'n_estimators': 1200,
        'max_features': 30,
        'criterion': 'entropy',
        'min_samples_leaf': 2,
        'min_samples_split': 2,
        'max_depth': 30,
        'n_jobs': nthread,
        'random_state': seed
    }

    extra_params_kaggle_reg = {
        'n_estimators': 1200,
        'max_features': 30,
        'criterion': 'mse',
        'min_samples_leaf': 2,
        'min_samples_split': 2,
        'max_depth': 30,
        'n_jobs': nthread,
        'random_state': seed
    }

    xgb_reg = {
        'objective': 'reg:linear',
        'max_depth': 11,
        'learning_rate': 0.01,
        'subsample': .9,
        'n_estimators': 10000,
        'colsample_bytree': 0.45,
        'nthread': nthread,
        'seed': seed
    }

    xgb_cla = {
        'objective': 'binary:logistic',
        'max_depth': 11,
        'learning_rate': 0.01,
        'subsample': .9,
        'n_estimators': 10000,
        'colsample_bytree': 0.45,
        'nthread': nthread,
        'seed': seed
    }

    #NN params
    nb_epoch = 3
    batch_size = 128
    esr = 402

    param1 = {
        'hidden_units': (256, 256),
        'activation': (advanced_activations.PReLU(),
                       advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.),
        'optimizer': RMSprop(),
        'nb_epoch': nb_epoch,
    }
    param2 = {
        'hidden_units': (1024, 1024),
        'activation': (advanced_activations.PReLU(),
                       advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.),
        'optimizer': RMSprop(),
        'nb_epoch': nb_epoch,
    }
    clfs = [
        (D2, XGBClassifier(**xgb_cla)),
        (D11, XGBClassifier(**xgb_cla)),
        (D2, XGBRegressor(**xgb_reg)),
        (D11, XGBRegressor(**xgb_reg)),
        (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),

        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
        #
        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2))
    ]
    for clf in clfs:
        yield clf
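
# --- Added usage note (hedged, not from the original source): models() yields
# (dataset, estimator) pairs; D2 and D11 are defined elsewhere and are assumed here to carry
# the training arrays as their first two elements. ---
# for dataset, estimator in models():
#     X_d, y_d = dataset[0], dataset[1]
#     estimator.fit(X_d, y_d)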
Example #8
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_lag_6', 'item_cnt_month_lag_12', 'month', 'days',
    'item_shop_first_sale', 'item_first_sale'
]]

X_train = data[data.month_idx < 21].drop(['item_cnt_month'], axis=1)
y_train = data[data.month_idx < 21]['item_cnt_month']
X_valid = data[data.month_idx == 21].drop(['item_cnt_month'], axis=1)
y_valid = data[data.month_idx == 21]['item_cnt_month']
X_test = data[data.month_idx == 22].drop(['item_cnt_month'], axis=1)

ts = time.time()

model = XGBRegressor(max_depth=8,
                     n_estimators=1000,
                     min_child_weight=300,
                     colsample_bytree=0.8,
                     eta=0.3,
                     seed=42)

model.fit(X_train,
          y_train,
          eval_metric="rmse",
          eval_set=[(X_train, y_train), (X_valid, y_valid)],
          verbose=True,
          early_stopping_rounds=10)

time.time() - ts

y_pred = model.predict(X_valid).clip(0, 20)
y_test = model.predict(X_test).clip(0, 20)
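
# --- Added check (hedged, not in the original excerpt): the model is tuned against RMSE above,
# so also report the validation-set RMSE of the clipped predictions. ---
import numpy as np
from sklearn.metrics import mean_squared_error

rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred))
print("Validation RMSE:", rmse_valid)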
df_valid = df_valid.set_index('key_0')

typ = df_valid.dtypes
df_valid.to_csv('df_valid_cat.csv', header=None, index=False)

df_train.columns
RFR = RandomForestRegressor()
RFR.fit(X_train, Y_train)

RFR_preds = pd.DataFrame(RFR.predict(X_test),columns=['salePrice'],index=Y_test.index)
print(mean_absolute_error(Y_test, RFR_preds))

RFR_new = RFR_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

XGB = XGBRegressor()
XGB.fit(X_train, Y_train, verbose=False)
XGB_preds = pd.DataFrame(XGB.predict(X_test),columns=['salePrice'],index=Y_test.index).astype(int)
print(mean_absolute_error(Y_test,XGB_preds))

XGB_new = XGB_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

GBR = GradientBoostingRegressor()
GBR.fit(X_train, Y_train)
GBR_preds = pd.DataFrame(GBR.predict(X_test),columns=['salePrice'],index=Y_test.index)
print(mean_absolute_error(Y_test,GBR_preds))

GBR_new = GBR_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

sns.swarmplot(x=GBR_preds['salePrice'],y=Y_test)
from sklearn.model_selection import KFold
Example #10
import matplotlib.pyplot as plt
import numpy as np

# Regression model
x, y = load_boston(return_X_y=True)

print(x.shape)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

model = XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-1)

model.fit(x_train, y_train)

threshold = np.sort(model.feature_importances_)

for thres in threshold:
    selection = SelectFromModel(model, threshold=thres, prefit=True)

    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = XGBRegressor(n_estimators=100,
                                   learning_rate=0.05,
                                   n_jobs=-1)
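    # --- Added continuation (hedged): the snippet is cut off above; a typical next step, using
    # the names already defined, is to fit the reduced model and report its R^2 per threshold. ---
    selection_model.fit(select_x_train, y_train)
    score = selection_model.score(select_x_test, y_test)
    print("thresh=%.3f, n=%d, R2: %.2f%%" % (thres, select_x_train.shape[1], score * 100.0))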
Example #11
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures as pf
from sklearn import linear_model as lm

train = pd.read_csv('C:\\Users\\Preetham G\\Downloads\\train.csv')
test = pd.read_csv('C:\\Users\\Preetham G\\Downloads\\test.csv')
train = train.drop(columns=['Index', 'District'])
test = test.drop(columns=['Index', 'District'])
base = [
    RandomForestRegressor(n_estimators=100, max_depth=10),
    ExtraTreesRegressor(n_estimators=90, max_depth=15),
    GradientBoostingRegressor(n_estimators=60, max_depth=5),
    XGBRegressor(n_estimators=50, max_depth=5),
    BaggingRegressor(n_estimators=50, base_estimator=lm.LinearRegression())
]
name = ['RFR', 'ETR', 'GBR', 'XGBR', 'BAR']
df1 = pd.DataFrame()
c = 0
train_x = train.drop(columns=['Rainfall'])
train_y = train['Rainfall']
test_x = test.drop(columns=['Rainfall'])
test_y = test['Rainfall']
d1 = {}
for i, j in zip(base, name):
    print(j, c)
    if j == 'BAR':
        poly = pf(degree=4)
        train_x = poly.fit_transform(train_x)
def get_xgb_imp(xgb, feat_names):
    from numpy import array
    imp_vals = xgb.booster().get_fscore()
    imp_dict = {feat_names[i]: float(imp_vals.get('f' + str(i), 0.)) for i in range(len(feat_names))}
    total = array(list(imp_dict.values())).sum()
    return {k: v / total for k, v in imp_dict.items()}
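
# --- Added usage note (hedged, not from the original source): get_xgb_imp expects a fitted
# model and the feature names in training-column order; note that booster() is the older
# xgboost API (recent releases expose get_booster() instead).
# imp = get_xgb_imp(fitted_xgb, list(train_df.columns))  # {feature: normalized importance}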

Y_train = np.log1p(train_df['price_doc'].values)
X_train = train_df.loc[:, train_df.columns != 'price_doc'].values
X_test = test_df.values

################################## XGBRegressor ###############################

#Initialize Model
xgb = XGBRegressor()

#Create cross-validation
cv = TimeSeriesSplit(n_splits=5)
#Train & Test Model
cross_val_results = cross_val_score(xgb, X_train, Y_train, cv=cv, scoring='neg_mean_squared_error')
print(cross_val_results.mean())
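
# --- Added sketch (hedged): with scoring='neg_mean_squared_error' the scores are negated MSE,
# so the per-fold RMSE is a more readable summary. ---
rmse_folds = np.sqrt(-cross_val_results)
print(rmse_folds.mean(), rmse_folds.std())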


model = xgb.fit(X_train, Y_train)
# model.feature_importances_;

from xgboost import XGBRegressor

#Get Data
Y_train = train_df['price_doc'].values
Example #13
    if df[heads[i]].dtypes == 'O':
        df[heads[i]] = lb_make.fit_transform(df[heads[i]].astype(str))

#extracting input and output features
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values
# prepare configuration for cross validation test harness

seed = 7

# prepare models

#model evaluation
models = []

models.append(('XGBoost', XGBRegressor()))
models.append(('GBR',
               ensemble.GradientBoostingRegressor(loss='quantile',
                                                  alpha=0.1,
                                                  n_estimators=250,
                                                  max_depth=3,
                                                  learning_rate=.1,
                                                  min_samples_leaf=9,
                                                  min_samples_split=9)))

models.append(('RFR', RandomForestRegressor()))

# evaluate each model in turn

results = []
names = []
Example #14
X_train = tr_user[features].replace([np.inf,np.nan], 0).reset_index(drop=True)
X_test = ts_user[features].replace([np.inf,np.nan], 0).reset_index(drop=True)
y_train = tr_user["loan_sum"].reset_index(drop=True)


# Caution! All models and parameter values are just 
# demonstrational and shouldn't be considered as recommended.
# Initialize 1-st level models.
models = [
    ExtraTreesRegressor(random_state = 0, n_jobs = -1, 
        n_estimators = 300, max_depth = 3),
        
    RandomForestRegressor(random_state = 0, n_jobs = -1, 
        n_estimators = 300, max_depth = 3),
        
    XGBRegressor(seed = 0, learning_rate = 0.05, 
        n_estimators = 300, max_depth = 3),

    LGBMRegressor(num_leaves = 8, learning_rate = 0.05, n_estimators= 300)
    ]
    
# Compute stacking features

S_train, S_test = stacking(models, X_train, y_train, X_test, regression = True, metric = mean_squared_error, n_folds = 5, shuffle = True, random_state = 0, verbose = 2)

  
# Fit 2-nd level model
model =  LGBMRegressor(num_leaves = 8, learning_rate = 0.05, n_estimators= 300)
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)

id_test = ts_user['uid']
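
# --- Added sketch (hedged, not from the original source): a plausible final step writes the
# second-level predictions next to the user ids; the column names are assumptions. ---
submission = pd.DataFrame({'uid': id_test, 'loan_sum': y_pred})
submission.to_csv('stacking_submission.csv', index=False)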
Example #15
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-16.688023353137517
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.01,
                                             max_depth=1,
                                             min_child_weight=16,
                                             n_estimators=100,
                                             nthread=1,
                                             subsample=0.55)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.9,
                                            learning_rate=0.001,
                                            loss="ls",
                                            max_depth=2,
                                            max_features=0.7500000000000001,
                                            min_samples_leaf=12,
                                            min_samples_split=17,
                                            n_estimators=100,
                                            subsample=1.0)),
    Nystroem(gamma=0.25, kernel="laplacian", n_components=10),
    GradientBoostingRegressor(alpha=0.95,
                              learning_rate=0.1,
Example #16
x_data = train.iloc[:, :71]
y_data = train.iloc[:, -4:]

x_data = x_data.fillna(x_data.mean())
test = test.fillna(test.mean())

x = x_data.values
y = y_data.values
x_pred = test.values

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=33)

model = MultiOutputRegressor(XGBRegressor())

model.fit(x_train, y_train)

y_pred1 = model.predict(x_test)

print('mae: ', mean_absolute_error(y_test, y_pred1))


## feature_importances
def plot_feature_importances(model):
    plt.figure(figsize=(10, 40))
    n_features = x_data.shape[1]  # n_features = number of columns
    plt.barh(
        np.arange(n_features),
        model.feature_importances_,  # barh: horizontal bar chart
Example #17
rf_grid = my_search.predict(X_trans)
rf_grid_rmsle = RMSLe_(y_train_trans, rf_grid)

output = output.append(
    {
        "model": "RF grid search (max_deth 8)",
        "R2 mean": ranked_res1["mean_test_score"][4],
        "R2 std": ranked_res1["std_test_score"][4],
        "RMSLE": rf_grid_rmsle
    },
    ignore_index=True)

# Gradient Boosting

xgb = XGBRegressor(n_estimators=50,
                   max_depth=5,
                   learning_rate=0.1,
                   random_state=42)

xgb.fit(X_trans, y_train_trans)

cross_val_xgb = cross_val_score(xgb, X_trans, y_train_trans, cv=5)
pred_xgb = xgb.predict(X_trans)
rmlse_xgb = RMSLe_(y_train_trans, pred_xgb)

output = output.append(
    {
        "model": "GB 0.1 ",
        "R2 mean": cross_val_xgb.mean(),
        "R2 std": cross_val_xgb.std(),
        "RMSLE": rmlse_xgb
    },
        # 'min_child_weight': np.linspace(200, 250, 5, dtype='int32'),
        ### Third param tuning
        # 'gamma': np.linspace(0.0, 0.5, 5),
        ### Fourth param tuning
        # 'subsample': np.linspace(0.6, 0.9, 4),
        # 'colsample_bytree': np.linspace(0.6, 0.9, 4),
        ### Fifth param tuning
        # 'reg_alpha': np.linspace(1e-5, 100, 5)
    }

    xgbr = XGBRegressor(
        nthread=25,
        seed=42,
        learning_rate=0.02,
        n_estimators=3000,
        max_depth=11,
        min_child_weight=225,
        gamma=0.125,
        colsample_bytree=0.6,
        subsample=0.9,
        reg_alpha=25,
    )
    """
    fit_params_xgb = {'eval_metric': 'rmse',
                      'early_stopping_rounds': 30,
                      'verbose': False,
                      'eval_set': [(X_test, y_test)],
                      }

    bag = BaggingRegressor(xgbr,
                           n_estimators=5,
                           max_samples=0.85,
class PrudentialRegressorFO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
                       objective=self.objective,
                       learning_rate=self.learning_rate,
                       min_child_weight=self.min_child_weight,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       max_depth=self.max_depth,
                       n_estimators=self.n_estimators,
                       nthread=self.nthread,
                       missing=0.0,
                       seed=self.seed)
        from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
#                       basinhopping=True,
                       initial_params=self.initial_params,
                       minimizer=self.minimizer,
                       scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
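

# --- Added usage sketch (hedged, not from the original source): the estimator above follows the
# scikit-learn fit/predict contract; X_tr/y_tr/X_te stand in for the Prudential feature matrix
# and response, and NegQWKappaScorer plus the OptimizedOffsetRegressor module must be importable.
# reg = PrudentialRegressorFO(learning_rate=0.045, n_estimators=700)
# reg.fit(X_tr, y_tr)              # fits the XGB model, then the offset post-processor
# ratings = reg.predict(X_te)      # predictions are clipped to the 1..8 rating scale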
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error");
# This line is required so the plotted figure shows on screen
plt.show()
# Save the current figure to result.png
#plt.savefig('./xgboostparams.png')
'''

# XGBRegressor 91 16889
print "XGBRegressor"  
xgb = XGBRegressor(max_depth=6,n_estimators=400)
xgb.fit(X, y)
print mean_absolute_error(val_y,xgb.predict(val_x))
print mean_squared_error(val_y,xgb.predict(val_x))

#gbdt

print "GradientBoostingRegressor"    
gbdt = GradientBoostingRegressor(n_estimators = 1000,max_leaf_nodes = 400)
gbdt.fit(X, y)#17083
#RandomForestRegressor 93  16938
#GradientBoostingRegressor 90 16866
print mean_absolute_error(val_y,gbdt.predict(val_x))
print mean_squared_error(val_y,gbdt.predict(val_x))

#xgb & gbdt
def ValidateTrainTestErrorsWithDifferentModels(cvX_train, cvX_test, cvy_train, cvy_test,X_train,y_train,X_test):
    clfs = list()
    cvClfs = list()

    print "Building RF1"
    rfShortCV = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShort = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShortCV.fit(cvX_train, cvy_train);
    print('RF1 CV Results :', mean_absolute_error(cvy_test, rfShortCV.predict(cvX_test)))
    pd.DataFrame({"Actual":cvy_test, "Predicted":rfShortCV.predict(cvX_test)}).to_csv("snehaRF.csv", index=False,header=True);
    rfShort.fit(X_train,y_train)
    cvClfs.append(rfShortCV)
    clfs.append(rfShort)
    pd.DataFrame({"ID":out_id, "Expected":rfShort.predict(X_test)}).to_csv("subRF1.csv", index=False,header=True);

    print "Building SVM"
    clfSVRCV = SVR(C=10.0)
    clfSVR = SVR(C=10.0)
    clfSVRCV.fit(cvX_train, cvy_train);
    print('SVM CV Results :', mean_absolute_error(cvy_test, clfSVRCV.predict(cvX_test)))
    pd.DataFrame({"Actual":cvy_test, "Predicted":clfSVRCV.predict(cvX_test)}).to_csv("snehaSVR.csv", index=False,header=True);

    print "Building RF2"
    rfLongCV = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLong = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLongCV.fit(cvX_train, cvy_train);
    print('RF2 CV Results :', mean_absolute_error(cvy_test, rfLongCV.predict(cvX_test)))
    rfLong.fit(X_train,y_train)
    cvClfs.append(rfLongCV)
    clfs.append(rfLong)
    pd.DataFrame({"ID":out_id, "Expected":rfLong.predict(X_test)}).to_csv("subRF2.csv", index=False,header=True);


    print "Building GB1"
    regGBCV1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGBCV1.fit(cvX_train, cvy_train);
    print('GB1 CV Results :', mean_absolute_error(cvy_test, regGBCV1.predict(cvX_test)))
    regGB1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGB1.fit(X_train,y_train)
    cvClfs.append(regGBCV1)
    clfs.append(regGB1)
    pd.DataFrame({"ID":out_id, "Expected":regGB1.predict(X_test)}).to_csv("subGB1.csv", index=False,header=True);


    print('Building GB2')
    regGBCV2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGBCV2.fit(cvX_train, cvy_train);
    print('GB2 CV Results :', mean_absolute_error(cvy_test, regGBCV2.predict(cvX_test)))
    regGB2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGB2.fit(X_train,y_train)
    cvClfs.append(regGBCV2)
    clfs.append(regGB2)
    pd.DataFrame({"ID":out_id, "Expected":regGB2.predict(X_test)}).to_csv("subGB2.csv", index=False,header=True);


    print('Feature Importances RF1:', sorted(zip(map(lambda x: round(x, 4), rfShort.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB1:', sorted(zip(map(lambda x: round(x, 4), regGB1.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances RF2:', sorted(zip(map(lambda x: round(x, 4), rfLong.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB2:', sorted(zip(map(lambda x: round(x, 4), regGB2.feature_importances_), df_final.columns), reverse=True))

    print "Building XGB1"
    xgbCV1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None,
                        learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7)
    xgbCV1.fit(cvX_train, cvy_train);
    xgb1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None,
                        learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7)
    xgb1.fit(X_train,y_train);
    print('XGB1 Model CV :', mean_absolute_error(cvy_test, xgbCV1.predict(cvX_test)))
    cvClfs.append(xgbCV1)
    clfs.append(xgb1)
    pd.DataFrame({"ID":out_id, "Expected":xgb1.predict(X_test)}).to_csv("subXGB1.csv", index=False,header=True);



    print "Building XGB2"
    params = {}
    params["objective"] = "reg:linear"
    params["learning_rate"] = 0.005
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.75
    params["silent"] = 1
    params["max_depth"] = 7
    params["n_estimators"] = 3000
    params['gamma'] = 1.25
    params['nthread'] = -1
    print('XGBoost Training Process Started')
    xgbCV2 = XGBRegressor(**params);
    xgbCV2.fit(cvX_train, cvy_train);
    print('XGB Model CV :', mean_absolute_error(cvy_test, xgbCV2.predict(cvX_test)))
    xgb2 = XGBRegressor(**params);
    xgb2.fit(X_train,y_train);
    cvClfs.append(xgbCV2)
    clfs.append(xgb2)
    pd.DataFrame({"ID":out_id, "Expected":xgb2.predict(X_test)}).to_csv("subXGB2.csv", index=False,header=True);


    # Return the cross validated models and the actual fitted models separately.
    return [clfs,cvClfs];
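

# --- Added usage sketch (hedged, not from the original source): the function above also relies on
# the globals out_id and df_final; this only illustrates the calling convention. ---
# cvX_train, cvX_test, cvy_train, cvy_test = train_test_split(X_train, y_train, test_size=0.25, random_state=0)
# clfs, cvClfs = ValidateTrainTestErrorsWithDifferentModels(cvX_train, cvX_test, cvy_train, cvy_test,
#                                                           X_train, y_train, X_test)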
###################################################################

# XGBoost Model Building

###################################################################
"""
I will build and test the models on Y2, since this is where the maximum
improvement can be made to improve performance.

Benchmark - MSE of 0.056
"""

# Basic Model Building - Ch.4

xgb_1 = XGBRegressor()
xgb_1.fit(X_train, Y_train)
y_pred = xgb_1.predict(X_test)
predictions = [round(value) for value in y_pred]
MSE_1 = mean_squared_error(Y_test, predictions)
print("MSE is " + str(MSE_1))

plot_tree(xgb_1)

# Model Using KFold Cross Validation

xgb_2 = XGBRegressor()
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(xgb_2, X_train, Y_train, cv=kfold)
xgb_2.fit(X_train, Y_train)
Y_pred = xgb_2.predict(X_test)
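
# --- Added summary (hedged, not in the original excerpt): report the cross-validation scores
# computed above and the held-out MSE, mirroring the evaluation done for xgb_1. ---
print("CV score: %.4f (+/- %.4f)" % (results.mean(), results.std()))
predictions_2 = [round(value) for value in Y_pred]
MSE_2 = mean_squared_error(Y_test, predictions_2)
print("MSE is " + str(MSE_2))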
Example #23
def get_fitted_clf(X_train, Y_train, features):
    clf = XGBRegressor(seed=0)
    clf.fit(X_train[features], Y_train)
    return clf
Example #24
train=train.drop(['total_sales','outlet_no'],1)
outlet=test.outlet_no
test=test.drop('outlet_no',1)


# In[199]:

from xgboost import XGBRegressor
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV


# In[209]:

model = XGBRegressor()
learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
n_estimators=[100,200,300,400,500]
param_grid = dict(learning_rate=learning_rate,n_estimators=n_estimators)
kfold = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="mean_absolute_error", n_jobs=-1, cv=kfold)


# In[210]:

result = grid_search.fit(train,y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))


# In[211]:
Example #25
    'max_depth': (
        2,
        6,
    ),  # default 3
    'n_estimators': (
        50,
        100,
        150,
    ),  # default 100
    'subsample': (
        0.6,
        0.4,
    ),
}]

est = XGBRegressor(random_state=69)
gs = GridSearchCV(est,
                  cv=10,
                  param_grid=hyper_params,
                  verbose=2,
                  n_jobs=n_jobs,
                  scoring='r2')

#params = {
#    "colsample_bytree": uniform(0.7, 0.3),
#    "gamma": uniform(0, 0.5),
#    "learning_rate": uniform(0.03, 0.3), # default 0.1
#    "max_depth": randint(2, 6), # default 3
#    "n_estimators": randint(100, 150), # default 100
#    "subsample": uniform(0.6, 0.4)
#}
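
# --- Added sketch (hedged): the grid search object above is built but never run in this excerpt;
# the usual next step, assuming x_train/y_train exist as elsewhere in this collection, is: ---
# gs.fit(x_train, y_train)
# print(gs.best_params_, gs.best_score_)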
Example #26
model_GBoost = GradientBoostingRegressor(n_estimators=2000,
                                         learning_rate=0.03,
                                         max_depth=3,
                                         max_features=0.4,
                                         min_samples_leaf=20,
                                         min_samples_split=10,
                                         loss='huber',
                                         random_state=seed)

model_xgb = XGBRegressor(colsample_bytree=0.35,
                         gamma=0.027,
                         learning_rate=0.03,
                         max_depth=4,
                         min_child_weight=1.7817,
                         n_estimators=3000,
                         reg_alpha=0.43,
                         reg_lambda=0.88,
                         subsample=0.5213,
                         silent=1,
                         random_state=seed)

model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=10,
                              learning_rate=0.03,
                              n_estimators=720,
                              max_bin=55,
                              bagging_fraction=0.8,
                              bagging_freq=5,
                              feature_fraction=0.2319,
                              feature_fraction_seed=9,
Example #27
    {
        "n_estimators": [90, 100, 110],
        "learning_rate": [0.001, 0.01, 0.1],
        "max_depth": [4, 5, 6],
        "colsample_bytree": [0.6, 0.9, 1]
    },
    {
        "n_estimators": [90, 110],
        "learning_rate": [0.001, 0.1, 0.5],
        "max_depth": [4, 5, 6],
        "colsample_bytree": [0.6, 0.9, 1],
        "colsample_bylevel": [0.6, 0.7, 0.9]
    },
]

model = GridSearchCV(XGBRegressor(), parameters, cv=kfold)

model.fit(x_train, y_train)

print('Best parameters :', model.best_estimator_)

y_pred = model.predict(x_test)
print('Final R2 score :', r2_score(y_test, y_pred))
print('Final R2 score :', model.score(x_test, y_test))
'''
Best parameters : XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=90, n_jobs=8, num_parallel_tree=1, random_state=0,
x, y = load_boston(return_X_y=True)  # scikit-learn returns x and y directly

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

parameter = [{
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.3, 0.001, 0.01],
    'max_depth': [4, 5, 6]
}]

model = RandomizedSearchCV(XGBRegressor(n_jobs=8), parameter)

model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('r2: ', score)

thresholds = np.sort(model.best_estimator_.feature_importances_
                     )  # column importances sorted in ascending order
# print(model.best_estimator_.feature_importances_)
print(thresholds)  # these values sum to 1 (13 columns)
# r2:  0.9188116974777065
#  0.02678531 0.03278282 0.03606399 0.04534625 0.05393368 0.27339098
#  0.4654915 ]

model = model.best_estimator_  # the search already stores best_estimator_, so no separate XGB fit is needed
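
# --- Added sketch (hedged): the excerpt stops here; the per-threshold feature-selection loop that
# usually follows in this collection, reusing the best estimator found above, would be: ---
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = XGBRegressor(n_jobs=8)
    selection_model.fit(select_x_train, y_train)
    y_pred = selection_model.predict(select_x_test)
    print("thresh=%.3f, n=%d, R2: %.2f%%" %
          (thresh, select_x_train.shape[1], r2_score(y_test, y_pred) * 100.0))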
Example #29
                                                        shuffle=False)

    X_train, X_mean, X_std = normalize(X_train)
    X_test = normalize_test(X_test, X_mean, X_std)

    y_train, y_mean, y_std = normalize(y_train)
    # y_test = normalize_test(y_test, y_mean, y_std)

    # ==============
    # MODEL CREATION
    # ==============

    svr_model = SVR()
    rf_model = RandomForestRegressor(n_estimators=100)
    adb_model = AdaBoostRegressor(n_estimators=100)
    xgb_model = XGBRegressor()

    svr_model.fit(X_train, y_train)
    joblib.dump(
        svr_model,
        path + 'models/' + str(data_interval) + 'min/svr_' + stock + '.pkl')
    # svr_model = joblib.load(path+'models/'+str(data_interval)+'min/svr_'+stock+'.pkl')

    rf_model.fit(X_train, y_train)
    joblib.dump(
        rf_model,
        path + 'models/' + str(data_interval) + 'min/rf_' + stock + '.pkl')
    # rf_model = joblib.load(path+'models/'+str(data_interval)+'min/rf_'+stock+'.pkl')

    adb_model.fit(X_train, y_train)
    joblib.dump(
Example #30
    def __init__(self):
        self.classifier_param_list = [
            {
                "model": [DecisionTreeClassifier()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [RandomForestClassifier()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [MLPClassifier()],
                "model__activation": ["identity", "logistic", "tanh", "relu"],
                "model__alpha": [0.001, 0.01, 0.1],
            },
            {
                "model": [LogisticRegression(fit_intercept=False)],
                "model__C": [1, 5, 10],
            },
            {
                "model": [BaggingClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__max_features": [0.25, 0.5, 1.0],
            },
            {
                "model": [AdaBoostClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [XGBClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [lgb.LGBMClassifier()],
                "model__learning_rate": [0.01],
            },
            {
                "model": [CatBoostClassifier()],
                "model__learning_rate": [0.01],
            },
        ]

        self.regressor_param_list = [
            {
                "model": [DecisionTreeRegressor()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [RandomForestRegressor()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [MLPRegressor()],
                "model__activation": ["identity", "logistic", "tanh", "relu"],
                "model__alpha": [0.001, 0.01, 0.1],
            },
            {
                "model": [ElasticNet(fit_intercept=False)],
                "model__alpha": [0.001, 0.01, 0.1],
                "model__l1_ratio": [0.25, 0.5, 1.0],
            },
            {
                "model": [BaggingRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__max_features": [0.25, 0.5, 1.0],
            },
            {
                "model": [AdaBoostRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [XGBRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [lgb.LGBMRegressor()],
                "model__learning_rate": [0.01],
            },
            {
                "model": [CatBoostRegressor()],
                "model__learning_rate": [0.01],
            },
        ]
Example #31
lgbm_parameter = [
    {
        'n_estimators': [10000],
        'learning_rate': [0.001, 0.01, 0.0025, 0.075]
    },
]
lgbm_fit_params = {
    'verbose': False,
    'eval_metric': ["logloss", "rmse"],
    'eval_set': [(x_train, y_train), (x_test, y_test)],
    'early_stopping_rounds': 20
}

#### XGB select
start1 = time.time()
model_XGB = XGBRegressor()
model_XGB.fit(x_train, y_train)
score = model_XGB.score(x_test, y_test)
print("r2 : ", score)

thresholds = np.sort(model_XGB.feature_importances_)

print(thresholds)
print(x_train.shape)
print("========================")

best_x_train = x_train
best_x_test = x_test
best_score = score
best_model = model_XGB
	mea = getmea(max_leaf_nodes,train_x,val_x,train_y,val_y)
	print("Max_leaf_nodes: %d ,mea: %d" %(max_leaf_nodes,mea))

'''
# clf = XGBRegressor() 17165
# XGBRegressor(n_estimators=400)  16330
'''
params = [.02,.03,.04,.05,.06,.07,.08,.09,.10]#[1:1001:50][100,200,300,400,500]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=400,learning_rate=param)
    test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("learning_rate vs CV Error" + str(params));
# This line is required so the plotted figure shows on screen
plt.show()
'''

my_model = XGBRegressor(n_estimators=400)
my_model.fit(train_X, train_y,verbose=False)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

#save model
#joblib.dump(melbourne_model,'model.pickle')

#load model
#model = joblib.load('model.pickle')

Example #33
sc_X = MinMaxScaler()
X = sc_X.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=0)

grid_params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',
    'learning_rate': 0.05,
    'max_depth': 10,
    'gamma': 0,
    'min_child_weight': 1,
    'grow_policy': 'lossguide',
    'silent': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'n_estimators': 100,
    'tree_method': 'gpu_exact',
}

estimator = XGBRegressor(**grid_params)
estimator.fit(X_train, Y_train)

Y_predict = estimator.predict(X_test)
final_score = RMSLE(Y_predict, Y_test)
print('Final score on the test set (RMSLE):')
print(final_score)
Example #34
class Blending(BaseEnsembleModel):
    def __init__(self,
                 stats,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='xgboost'):
        super().__init__(stats=stats,
                         ensemble_method='blending',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)
        try:
            from xgboost import XGBClassifier
        except ImportError:
            warnings.warn(
                "XGBoost is not installed! Blending will use a linear model instead!"
            )
            meta_learner = 'linear'

        # We use Xgboost as default meta-learner
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=150)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(max_depth=4,
                                                 learning_rate=0.05,
                                                 n_estimators=70)

    def fit(self, data):
        # Split training data for phase 1 and phase 2
        test_size = 0.2

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for idx in range(len(train_list)):
                X, y = train_list[idx].data
                if self.task_type in CLS_TASKS:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(
                        X,
                        y,
                        test_size=test_size,
                        stratify=data.data[1],
                        random_state=self.seed)
                else:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(
                        X, y, test_size=test_size, random_state=self.seed)
                for _config in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        estimator = fetch_predict_estimator(
                            self.task_type, _config, x_p1, y_p1)
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-blending-model%d' %
                                    (self.timestamp, model_cnt)), 'wb') as f:
                            pkl.dump(estimator, f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification
                                n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(x_p2)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(x_p2)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                       n_dim] = pred
                        suc_cnt += 1
                    model_cnt += 1
        self.meta_learner.fit(feature_p2, y_p2)

        return self

    def get_feature(self, data, solvers):
        # Predict the labels via blending
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for train_node in train_list:
                test_node = solvers[algo_id].optimizer['fe'].apply(
                    data, train_node)
                for _ in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-blending-model%d' %
                                    (self.timestamp, model_cnt)), 'rb') as f:
                            estimator = pkl.load(f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(test_node.data[0])
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification
                                n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(data.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred
                        else:
                            pred = estimator.predict(
                                test_node.data[0]).reshape(-1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(data.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                       n_dim] = pred
                        suc_cnt += 1
                    model_cnt += 1

        return feature_p2

    def predict(self, data, solvers):
        feature_p2 = self.get_feature(data, solvers)
        # Get predictions from meta-learner
        if self.task_type in CLS_TASKS:
            final_pred = self.meta_learner.predict_proba(feature_p2)
        else:
            final_pred = self.meta_learner.predict(feature_p2)
        return final_pred
                                   verbose=1)
    grid_search_obj.fit(x_train, y_train)

    print('The following is the best parameter setting for this problem:')
    print(grid_search_obj.best_params_)

    print('Training score on the best estimator: {}'.format(
        grid_search_obj.best_score_))

    return grid_search_obj.best_estimator_


if __name__ == '__main__':

    import pandas as pd
    from xgboost import XGBRegressor

    # Read data from a file and split into data and labels.
    path_to_file = 'OnlineNewsPopularity/OnlineNewsPopularity.csv'
    data = pd.read_csv(path_to_file, header=0).drop('url', axis=1)
    labels = pd.Series(data.pop(' shares'))

    # Split dataset into train and test sets.
    x_train, x_test, y_train, y_test = split_data(data, labels)

    # Create an XGBRegressor object
    clf = XGBRegressor(objective='reg:gamma', n_jobs=-1, random_state=241093)

    # Perform Grid Search on a parameter grid
    best_clf = grid_search(clf, x_train, y_train)
Example #36
 def __init__(self):
     self.model = XGBRegressor()
Example #37
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_selection import SelectFromModel  # selects feature columns by importance
from sklearn.metrics import r2_score, accuracy_score

x, y = load_boston(return_X_y=True)  # scikit-learn returns x and y directly

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

model = XGBRegressor(n_jobs=8)

model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('r2: ', score)

thresholds = np.sort(
    model.feature_importances_)  # column importances sorted in ascending order
print(thresholds)  # these values sum to 1 (13 columns)
# r2:  0.9221188601856797
# [0.00134153 0.00363372 0.01203115 0.01220458 0.01447935 0.01479119
#  0.0175432  0.03041655 0.04246345 0.0518254  0.06949984 0.30128643
#  0.42848358]

for thresh in thresholds:  # 13 columns, so the loop trains 13 times
    selection = SelectFromModel(
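        model, threshold=thresh, prefit=True)  # added completion (hedged): the call is cut off above; arguments mirror the identical loop earlier in this collection
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = XGBRegressor(n_jobs=8)
    selection_model.fit(select_x_train, y_train)
    y_pred = selection_model.predict(select_x_test)
    print("thresh=%.3f, n=%d, R2: %.2f%%" %
          (thresh, select_x_train.shape[1], r2_score(y_test, y_pred) * 100.0))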
from utilities import data_prep

if __name__ == '__main__':
    # Preprocess data for xgboost.
    train_xg = pd.read_csv('../data/train.csv')
    train_xg_x, train_xg_y = data_prep.data_prep_log(train_xg)

    test_xg = pd.read_csv('../data/test.csv') #TODO: need to preprocess the data just like the train set.
    test_xg_x, test_xg_y = data_prep.data_prep_log(test_xg, False)

    # Training xgboost on CV set and predict using out-of-fold prediction
    xgboosting = XGBRegressor(n_estimators=5000, \
                            learning_rate=0.05, \
                            gamma=2, \
                            max_depth=12, \
                            min_child_weight=1, \
                            colsample_bytree=0.5, \
                            subsample=0.8, \
                            reg_alpha=1, \
                            objective='reg:linear', \
                            base_score = 7.76)

#res = xgb.cv(
#           colsample_bytree = 0.5,
#           subsample = 0.8,
#           eta = 0.05, # replace this with 0.01 for local run to achieve 1113.93
#           objective = 'reg:linear',
#           max_depth = 12,
#           alpha = 1,
#           gamma = 2,
#           min_child_weight = 1,
#           base_score = 7.76
    xbin1 = np.repeat(x[(y >= 1.0) & (y < 1.5)], 7, axis=0)
    xbin2 = np.repeat(x[(y >= 1.5) & (y < 2.0)], 5, axis=0)
    xbin3 = np.repeat(x[(y >= 2.0) & (y < 2.5)], 1, axis=0)
    xbin4 = np.repeat(x[(y >= 2.5) & (y <= 3)], 1, axis=0)

    x = np.vstack((xbin1, xbin2, xbin3, xbin4))

    ybin1 = np.repeat(y[(y >= 1.0) & (y < 1.5)], 7, axis=0)
    ybin2 = np.repeat(y[(y >= 1.5) & (y < 2.0)], 5, axis=0)
    ybin3 = np.repeat(y[(y >= 2.0) & (y < 2.5)], 1, axis=0)
    ybin4 = np.repeat(y[(y >= 2.5) & (y <= 3)], 1, axis=0)

    y = np.concatenate((ybin1, ybin2, ybin3, ybin4))

    x, y = shuffle(x, y)

    x = normalize(x)
    x_test = normalize(x_test)
    est = MetaRegressor()

    #print(do_keras(*train_test_split(x, y, test_size=0.25)))

    base_cross_val(est, x, y)
    base_cross_val(XGBRegressor(n_estimators=500), x, y)
    '''
    est.fit(x, y)
    y_pred = est.predict(x_test)
    pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('new_meta_submission.csv',index=False)
    '''
Exemple #40
0
# -*- coding: utf-8 -*-

from xgboost import XGBRegressor
import pandas as pd

train = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\train.csv")
test = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\test.csv")

train.drop('ID', axis=1, inplace=True)

y_train = train.pop('target')
pred_index = test.pop('ID')

reg = XGBRegressor()
reg.fit(train, y_train)
y_pred = reg.predict(test)

submit = pd.DataFrame()
submit['ID'] = pred_index
submit['target'] = y_pred
submit.to_csv('my_XGB_prediction.csv', index=False)
Exemple #41
0
df = df_train.append(df_test, ignore_index=True)

# basic inspection
df_train.shape, df_test.shape, df_train.columns.values

#Feature Selection

X_train, y_train = df_train.loc[:, [
    'voltage_min', 'current', 'soc', 'temperature_max'
]], df_train.loc[:, ['age']]

X_test, y_test = df_test.loc[:, [
    'voltage_min', 'current', 'soc', 'temperature_max'
]], df_test.loc[:, ['age']]

xgb = XGBRegressor()
xgb.fit(X_train, y_train)
imp = pd.DataFrame(xgb.feature_importances_,
                   columns=['Importance'],
                   index=X_train.columns)
imp = imp.sort_values(['Importance'], ascending=False)

print(imp)


# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))


# Define a function to calculate negative RMSE (as a score)
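
# The comment above announces a negative-RMSE scorer that is missing from this
# snippet; a minimal sketch (the name neg_rmse is an assumption):
def neg_rmse(y_true, y_pred):
    return -rmse(y_true, y_pred)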
        encoded_cat=np.concatenate((encoded_cat, feature), axis=1)

X=np.concatenate((encoded_cat, cont), axis=1)



seed=3
test_size=.3

X_train, X_test, y_train, y_test = train_test_split(X, log_loss, test_size=test_size, random_state=seed)


model=XGBRegressor(learning_rate=0.08,
                   max_depth=10,
                   objective='reg:linear',
                   nthread=3,
                   gamma=0.2,
                   subsample=0.9,
                   n_estimators=100,
                   )
model.fit(X_train, y_train)
print(model)
y_pred=model.predict(X_test)

def mae(predicted, actual, logscale=False):
    if logscale == True:
        predexp=np.exp(predicted)
        actualexp=np.exp(actual)
        return np.mean(np.abs(predexp - actualexp))
    else:
        return np.mean(np.abs(predicted - actual))
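
# Hedged usage sketch for mae(): the target here is named log_loss, which
# suggests it was log-transformed, so both reporting modes apply.
print('MAE (log scale): %.4f' % mae(y_pred, y_test, logscale=False))
print('MAE (original scale): %.4f' % mae(y_pred, y_test, logscale=True))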
import sklearn
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import (RandomForestRegressor, AdaBoostRegressor,
                              GradientBoostingRegressor, BaggingRegressor)
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

### Algorithm list
algorithms = [
    LinearRegression(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    SGDRegressor(),
    SVR(),
    MLPRegressor(),
    KNeighborsRegressor(),
    BaggingRegressor(),
    XGBRegressor()
]

if best_algo == 'LinearRegression':
    algo = getattr(sklearn.linear_model, best_algo)()

if best_algo == 'SGDRegressor':
    algo = getattr(sklearn.linear_model, best_algo)()

if (best_algo
        == 'RandomForestRegressor') or (best_algo == 'AdaBoostRegressor') or (
            best_algo
            == 'GradientBoostingRegressor') or (best_algo
                                                == 'BaggingRegressor'):
    algo = getattr(sklearn.ensemble, best_algo)()
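
# Hedged sketch for the remaining entries in the algorithm list (assumes the
# corresponding scikit-learn submodules are importable, as above):
if best_algo == 'SVR':
    algo = getattr(sklearn.svm, best_algo)()

if best_algo == 'MLPRegressor':
    algo = getattr(sklearn.neural_network, best_algo)()

if best_algo == 'KNeighborsRegressor':
    algo = getattr(sklearn.neighbors, best_algo)()

if best_algo == 'XGBRegressor':
    algo = XGBRegressor()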
params = [100,200,300,400,500,600,700,800,1000]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error");
# plt.show() is required for the plotted figure to appear on screen
plt.show()
# save the current figure to a file
#plt.savefig('./xgboostparams.png')

# 91 16889
xgb = XGBRegressor(max_depth=6,n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y, xgb.predict(val_x)))

#gbdt
'''
print "GradientBoostingRegressor"    
gbdt = GradientBoostingRegressor(n_estimators = 1000,max_leaf_nodes = 400)
gbdt.fit(X, y)#17083
#RandomForestRegressor 93  16938
#GradientBoostingRegressor 90 16866
#XGBRegressor 100 19939 
print mean_absolute_error(val_y,gbdt.predict(val_x))
print(mean_squared_error(val_y,gbdt.predict(val_x)))
Exemple #45
0
# clf = GridSearchCV(rf, parameters, cv=5)
# clf.fit(train_data, train_labels)
# print(clf.best_params_)
# print(clf.best_score_)
# print(clf.grid_scores_)

parameter_space = [{
    # 'n_estimators': [ 1100, 1200, 1400, 1600],  # best value was 1000
    # 'max_depth': [3, 4, 5, 6],
    # 'learning_rate': [0.1, 0.2]
    'subsample': [0.5, 0.8]
}]
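
# rmsle() is used at the bottom of this snippet but not defined here; a minimal
# sketch under the usual log-RMSE definition (an assumption):
import numpy as np

def rmsle(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))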
from xgboost import XGBRegressor

xgb = XGBRegressor(learning_rate=0.1,
                   n_estimators=1200,
                   max_depth=4,
                   gamma=0,
                   subsample=0.8)
# clf = GridSearchCV(xgb, param_grid=parameter_space, cv=5)
# #
# clf.fit(train_data, train_labels)
#
# print(clf.grid_scores_)
# print(clf.best_params_)
# print(clf.best_score_)
#
xgb.fit(train_data, train_labels)
preds = xgb.predict(test_data)
print("RMSLE Value For XGB Boost: ", rmsle(test_labels, preds))
Exemple #46
0
def Model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    def evaluate(model, test_features, test_labels,train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))    
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    
    
    """
        . Ridge model
    """
    
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    
    
    """
        . Random Forest
    """
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train=train_linear_fea
    Y_train=train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0)
    
    
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    #
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    
    #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search,
    # we can explicitly specify every combination of settings to try. 
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110,120,130],
        'max_features': [2, 3],
        'min_samples_leaf': [1,2,3, 4],
        'min_samples_split': [2,4,6,8, 10, 12],
        'n_estimators': [600,700, 800, 900, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    
    best_random = grid_search.best_estimator_
    start=time.time()
    best_random.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_rf_predict=best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_rf = importance_rf.iloc[:20,]
    
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_rf=np.expm1(best_random.predict(test_linear))
    
    """
        . Xgboost
    """
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
        # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    
    return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
Exemple #47
0
y_within_cut = (~y_above_cut & ~y_below_cut)
train.fillna(0, inplace=True)
# Generate models...
ridge_1 = Ridge()
ridge_2 = Ridge()
etr = ExtraTreesRegressor(n_estimators=248,
                          max_depth=6,
                          min_samples_leaf=27,
                          max_features=0.6,
                          n_jobs=-1,
                          random_state=seed,
                          verbose=0)
xgb = XGBRegressor(n_estimators=80,
                   nthread=-1,
                   max_depth=3,
                   learning_rate=0.1,
                   reg_lambda=1,
                   subsample=1.0,
                   colsample_bytree=0.5,
                   seed=seed)

print('Training Linear Model...\n', len(linear_features), 'features')
ridge_2.fit(train.loc[y_within_cut, linear_features], train.loc[y_within_cut,
                                                                'y'])
ridge_1.fit(
    np.array(train.loc[y_within_cut, linear_features[0]]).reshape(-1, 1),
    train.loc[y_within_cut, 'y'])

print('Training XGBoost Model...\n', len(xgb_features), 'features')
xgb.fit(train[xgb_features], train.y)

print('Training ETR Model...\n', len(etr_features), 'features')
        train_target.append(target)
    else:
        test_dataset.append(row)
        test_target.append(target)


# In[41]:

#Build the model
#model=ExtraTreesRegressor()
#model=RandomForestRegressor()
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#                  'learning_rate': 0.01, 'loss': 'ls'}
params = {'n_estimators': 400, 'max_depth': 7}
#model=GradientBoostingRegressor(**params)
model=XGBRegressor(**params)
#model=GaussianNB()
#model=Ridge()
#model=KNeighborsRegressor()
#model=DecisionTreeRegressor()
model.fit(train_dataset,train_target)

#Predict with the model
predictions=model.predict(test_dataset)


# In[51]:

### Cross Validation ###

#cv = StratifiedKFold(train_dataset, n_folds=5)
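
# Hedged cross-validation sketch for the model above (uses train_dataset and
# train_target as built earlier in this snippet):
from sklearn.model_selection import cross_val_score
import numpy as np

cv_scores = cross_val_score(model, np.array(train_dataset), np.array(train_target),
                            cv=5, scoring='neg_mean_squared_error')
print('CV MSE: %.4f (+/- %.4f)' % (-cv_scores.mean(), cv_scores.std()))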
Exemple #49
0
    posterior = pm.sample(1000, tune=1000)
    try:
        pm.traceplot(posterior)
    except AttributeError:
        pass
    pm.plot_posterior(posterior)

plt.show()

# prediction
yhat = predict(X_test, posterior).T

ols_intercept, ols_theta = ols(X_train, y_train)
ols_yhat = ols_predict(X_test, ols_intercept, ols_theta)

xgr = XGBRegressor()
xgr.fit(X_train, y_train)
xgr_yhat = xgr.predict(X_test)

for i in range(3):
    n = np.random.randint(0, y_test.shape[0])
    sns.kdeplot(yhat[n], label='Bayesian Posterior Predictive_{}'.format(n))
    plt.vlines(x=ols_yhat[n],
               ymin=0,
               ymax=10,
               label='manual OLS Prediction_{}'.format(n),
               colors='blue',
               linestyles='--')
    plt.vlines(x=y_test.values[n],
               ymin=0,
               ymax=10,
Exemple #50
0
test = test.fillna(0)

train = train[train['Open'] == 1] # don't train data with open = 0

# Log and Exp
if logexp:
    train['Sales'] = np.log(train['Sales']+1)

for f in train[features]:
    if train[f].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

regressor = XGBRegressor(n_estimators=3000, nthread=-1, max_depth=12,
                   learning_rate=0.02, silent=True, subsample=0.9, colsample_bytree=0.7)

start = time.time()
if gridsearch and sample:  # only do gridsearch if we run with sampled data.
    print("Attempting GridSearchCV for XGB model")
    gscv = GridSearchCV(regressor, {
        'max_depth': [3, 5, 7, 11, 13, 17, 23],
        'n_estimators': [32, 64, 128, 512, 1024, 2048, 4096],
        'learning_rate': [0.15],
        'subsample': [0.6,0.7,0.8],
        'colsample_bytree': [0.6,0.7,0.8]},
        verbose=1, n_jobs=2)
    regressor = gscv.fit(np.array(train), train[goal])
    print(regressor.best_score_)
    print(regressor.best_params_)
else:
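    # Assumed continuation of the truncated else-branch: fit the regressor
    # directly when grid search is skipped, mirroring the branch above.
    regressor.fit(np.array(train), train[goal])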
Exemple #51
0
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import accuracy_score, r2_score

x, y = load_boston(return_X_y=True)
print(x.shape)  # (506, 13)
print(y.shape)  # (506, )

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)])
# rmse, mae, logloss, error (an error of 0.2 means an accuracy of 0.8), auc (accuracy/precision; a close relative of accuracy)

results = model.evals_result()
print("eval's results :", results)
# trained 100 times (n_estimators = 100; the number of trees plays the role of epochs)
# switching eval_metric to rmse is reflected in the output (validation_0 is the train list, validation_1 is the test list)
# with 1000 rounds, overfitting sets in around round 530 (a place to cut off with early stopping)

y_pred = model.predict(x_test)
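
# Hedged sketch of the early stopping mentioned in the comments above (the
# early_stopping_rounds argument to fit() is version-dependent in xgboost):
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
model.fit(x_train, y_train,
          verbose=True,
          eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)  # stop once rmse has not improved for 20 rounds
print('r2:', r2_score(y_test, model.predict(x_test)))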
    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self


    def predict(self, X):
        from numpy import clip
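        # The fitted offsets map the raw xgboost predictions onto the 8 ordinal
        # response levels, hence the clip to [1, 8] below.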
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn import cross_validation

train = pd.read_csv('../data/train_empty.csv')

features = ['store_nbr',  'item_nbr', #  'units',  'station_nbr',
 'tmax',  'tmin',  'tavg',  'depart',  'dewpoint',  'wetbulb',
 'heat',  'cool',  'snowfall',  'preciptotal',  'stnpressure',
 'sealevel',  'resultspeed',  'resultdir',  'avgspeed',
 'HZ',  'FU',  'UP',  'TSSN',  'VCTS',  'DZ',  'BR',  'FG',
 'BCFG',  'DU',  'FZRA',  'TS',  'RA',  'PL',  'GS',  'GR',
 'FZDZ',  'VCFG',  'PRFG',  'FG+',  'TSRA',  'FZFG',  'BLDU',
 'MIFG',  'SQ',  'BLSN',  'SN',  'SG',
#  'month',
#  'day',
 'day_length']
#  'sunset_hour',
#  'sunset_minute',
#  'sunrise_hour',
#  'sunrise_minute']

# The scikit-learn wrapper expects plain arrays (a DMatrix cannot be split or
# indexed by train_test_split); NaNs are treated as missing via the missing flag.
X = train[features].values
y = train["units"].values

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=0)
clf = XGBRegressor(silent=False, missing=np.nan)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))
Exemple #55
-1
def XgBoost(train_linear, test_linear):
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
        # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    from xgboost import XGBRegressor
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    importance_xgb=importance_xgb[importance_xgb['features']!='Id']
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    write_pkl(xgb_random.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/xgb_params.pkl')
    return test_prediction_xgb