def XGB(self, x_train, y_train, x_test, y_test):
    x_train, y_train = shuffle(x_train, y_train)
    xgb = XGBRegressor(max_depth=4, subsample=0.9)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test).reshape(x_test.shape[0], 1)
    loss = mean_squared_error(y_test, y_pred)
    print(loss)
    return y_pred, loss
def fit(self, X, y):
    from xgboost import XGBRegressor
    if not KAGGLE:
        from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

    self.xgb = XGBRegressor(
        objective=self.objective,
        learning_rate=self.learning_rate,
        min_child_weight=self.min_child_weight,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        max_depth=self.max_depth,
        n_estimators=self.n_estimators,
        nthread=self.nthread,
        missing=0.0,
        seed=self.seed)

    from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
    self.off = FullDigitizedOptimizedOffsetRegressor(
        n_buckets=self.n_buckets,
        # basinhopping=True,
        initial_params=self.initial_params,
        minimizer=self.minimizer,
        scoring=self.scoring)

    self.xgb.fit(X, y)

    # booster() was renamed get_booster() in recent xgboost releases
    tr_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.get_booster().best_iteration)
    print('Train score is:', -self.scoring(tr_y_hat, y))
    self.off.fit(tr_y_hat, y)
    print("Offsets:", self.off.params)

    return self
def Stacking(real_train_tar):
    predictions_train = pd.DataFrame([np.expm1(y_lasso_predict), np.expm1(y_ridge_predict),
                                      np.expm1(y_rf_predict), np.expm1(y_xgb_predict)]).T
    sns.pairplot(predictions_train)

    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights needed in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]

    stack_model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}

    # Make a RandomizedSearchCV object with the stacking model and specified hyperparams
    xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid,
                                   n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_stack.fit(predictions_train, real_train_tar)
    xgb_stack.best_params_
    write_pkl(xgb_stack.best_params_,
              '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl')

    model_stacking = XGBRegressor(**xgb_stack.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_stacking.fit(predictions_train, real_train_tar)
    end = time.time()
    print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar),
                                                           np.log1p(model_stacking.predict(predictions_train))))
    print('Time elapsed: %.4f seconds' % (end - start))

    y_stack_predict = model_stacking.predict(predictions_train)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, y_stack_predict)
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
class HousePricePredictor(BaseModel):
    def __init__(self):
        self.model = XGBRegressor()

    def predict(self, X):
        X = self._prepare_data(X)
        return self.model.predict(X)

    def _prepare_data(self, X):
        return pd.DataFrame(X, columns=FEATURES)

    def fit(self, X, y):
        model = XGBRegressor()
        clf = GridSearchCV(
            model,
            {
                'max_depth': [6, ],
                'learning_rate': [0.05, ],
                'n_estimators': [450, 470, 475, 480, 485, ]
            },
            n_jobs=4,
            cv=3,
            verbose=1
        )
        clf.fit(X, y)
        logging.info("Best Score: {}".format(clf.best_score_))
        logging.info("Best Params: {}".format(clf.best_params_))
        self.model = clf.best_estimator_
        return self.model

    def dump(self, path):
        self.model.save_model(path)

    @classmethod
    def load(cls, path):
        house_model = HousePricePredictor()
        house_model.model.load_model(path)
        return house_model
cur_score = get_score(clf, X_test, Y_test, features_to_keep)
print('Cur score:', cur_score)

features_to_keep_folds.append(save_if_good)
print('-' * 30)

selected_features = set.intersection(*[set(i) for i in features_to_keep_folds])
print(len(selected_features))

print('TUNING HYPERPARAMS...')
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)
params = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 300, 500],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.5, 1]
}
grid = GridSearchCV(XGBRegressor(seed=0), params, cv=5, scoring=rmsle_scorer, verbose=5)
grid.fit(X[list(selected_features)], Y)

means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, grid_params in zip(means, stds, grid.cv_results_['params']):
    print("%0.6f (+/-%0.03f) for %r" % (mean, std * 2, grid_params))
print('Best Params:', grid.best_params_)

# In[80]:

# tuning ridge regression hyperparameters
model2.fit(train[col], np.log1p(train['visitors'].values)) print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), model1.predict(train[col]))) print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), model2.predict(train[col]))) #test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2 test['visitors'] = model2.predict(test[col]) test['visitors'] = np.expm1(test['visitors']).clip(lower=0.) sub1 = test[['id','visitors']].copy() #del train; del data; sub1[['id', 'visitors']].to_csv(os.path.join(path_kaggle, 'naive_forecast2.csv'), index = False) from xgboost import XGBRegressor model3 = XGBRegressor() model3.fit(train[col], np.log1p(train['visitors'].values), verbose=False) print('XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values), model3.predict(train[col]))) ## from hklee ## https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code #dfs = { re.search('/([^/\.]*)\.csv', fn).group(1): # pd.read_csv(fn)for fn in glob.glob('../input/*.csv')} # #for k, v in dfs.items(): locals()[k] = v # #wkend_holidays = date_info.apply( # (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1) #date_info.loc[wkend_holidays, 'holiday_flg'] = 0 #date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5
def models(): extra_params_kaggle_cla = { 'n_estimators': 1200, 'max_features': 30, 'criterion': 'entropy', 'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30, 'min_samples_leaf': 2, 'n_jobs': nthread, 'random_state': seed } extra_params_kaggle_reg = { 'n_estimators': 1200, 'max_features': 30, 'criterion': 'mse', 'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30, 'min_samples_leaf': 2, 'n_jobs': nthread, 'random_state': seed } xgb_reg = { 'objective': 'reg:linear', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9, 'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed } xgb_cla = { 'objective': 'binary:logistic', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9, 'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed } #NN params nb_epoch = 3 batch_size = 128 esr = 402 param1 = { 'hidden_units': (256, 256), 'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid), 'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch, } param2 = { 'hidden_units': (1024, 1024), 'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid), 'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch, } clfs = [ (D2, XGBClassifier(**xgb_cla)), (D11, XGBClassifier(**xgb_cla)), (D2, XGBRegressor(**xgb_reg)), (D11, XGBRegressor(**xgb_reg)), (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)), (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)), (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)), (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)), # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)), # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)), # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)), # # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)), # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)), # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)) ] for clf in clfs: yield clf
'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'item_cnt_month_lag_6', 'item_cnt_month_lag_12', 'month', 'days', 'item_shop_first_sale', 'item_first_sale' ]] X_train = data[data.month_idx < 21].drop(['item_cnt_month'], axis=1) y_train = data[data.month_idx < 21]['item_cnt_month'] X_valid = data[data.month_idx == 21].drop(['item_cnt_month'], axis=1) y_valid = data[data.month_idx == 21]['item_cnt_month'] X_test = data[data.month_idx == 22].drop(['item_cnt_month'], axis=1) ts = time.time() model = XGBRegressor(max_depth=8, n_estimators=1000, min_child_weight=300, colsample_bytree=0.8, eta=0.3, seed=42) model.fit(X_train, y_train, eval_metric="rmse", eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=True, early_stopping_rounds=10) time.time() - ts y_pred = model.predict(X_valid).clip(0, 20) y_test = model.predict(X_test).clip(0, 20)
df_valid = df_valid.set_index('key_0') typ = df_valid.dtypes df_valid.to_csv('df_valid_cat.csv', header=None, index=False) df_train.columns RFR = RandomForestRegressor() RFR.fit(X_train, Y_train) RFR_preds = pd.DataFrame(RFR.predict(X_test),columns=['salePrice'],index=Y_test.index) print(mean_absolute_error(Y_test, RFR_preds)) RFR_new = RFR_preds.apply(lambda x: np.power(np.e,x).astype('int64')) XGB = XGBRegressor() XGB.fit(X_train, Y_train, verbose=False) XGB_preds = pd.DataFrame(XGB.predict(X_test),columns=['salePrice'],index=Y_test.index).astype(int) print(mean_absolute_error(Y_test,XGB_preds)) XGB_new = XGB_preds.apply(lambda x: np.power(np.e,x).astype('int64')) GBR = GradientBoostingRegressor() GBR.fit(X_train, Y_train) GBR_preds = pd.DataFrame(GBR.predict(X_test),columns=['salePrice'],index=Y_test.index) print(mean_absolute_error(Y_test,GBR_preds)) GBR_new = GBR_preds.apply(lambda x: np.power(np.e,x).astype('int64')) sns.swarmplot(x=GBR_preds['salePrice'],y=Y_test) from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np

# Regression model
x, y = load_boston(return_X_y=True)
print(x.shape)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=66)

model = XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-1)
model.fit(x_train, y_train)

threshold = np.sort(model.feature_importances_)
for thres in threshold:
    selection = SelectFromModel(model, threshold=thres, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-1)
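    # --- Added sketch, not part of the original snippet: the loop above stops after
    # --- constructing selection_model; a minimal continuation would fit it on the
    # --- reduced features and report R2 for each threshold.
    from sklearn.metrics import r2_score  # local import so the sketch stays self-contained
    selection_model.fit(select_x_train, y_train)
    select_y_pred = selection_model.predict(select_x_test)
    score = r2_score(y_test, select_y_pred)
    print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thres, select_x_train.shape[1], score * 100.0))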
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor  # needed for the BAR base model below
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures as pf
from sklearn import linear_model as lm

train = pd.read_csv('C:\\Users\\Preetham G\\Downloads\\train.csv')
test = pd.read_csv('C:\\Users\\Preetham G\\Downloads\\test.csv')
train = train.drop(columns=['Index', 'District'])
test = test.drop(columns=['Index', 'District'])

base = [
    RandomForestRegressor(n_estimators=100, max_depth=10),
    ExtraTreesRegressor(n_estimators=90, max_depth=15),
    GradientBoostingRegressor(n_estimators=60, max_depth=5),
    XGBRegressor(n_estimators=50, max_depth=5),
    BaggingRegressor(n_estimators=50, base_estimator=lm.LinearRegression())
]
name = ['RFR', 'ETR', 'GBR', 'XGBR', 'BAR']
df1 = pd.DataFrame()
c = 0

train_x = train.drop(columns=['Rainfall'])
train_y = train['Rainfall']
test_x = test.drop(columns=['Rainfall'])
test_y = test['Rainfall']
d1 = {}

for i, j in zip(base, name):
    print(j, c)
    if j == 'BAR':
        poly = pf(degree=4)
        train_x = poly.fit_transform(train_x)
def get_xgb_imp(xgb, feat_names):
    # booster()/get_fscore() are the old accessors; get_booster()/get_score() in recent xgboost
    imp_vals = xgb.get_booster().get_score(importance_type='weight')
    imp_dict = {feat_names[i]: float(imp_vals.get('f' + str(i), 0.)) for i in range(len(feat_names))}
    total = sum(imp_dict.values())
    return {k: v / total for k, v in imp_dict.items()}


Y_train = np.log1p(train_df['price_doc'].values)
X_train = train_df.loc[:, train_df.columns != 'price_doc'].values  # .ix was removed from pandas
X_test = test_df.values

################################## XGBRegressor ###############################
# Initialize model
xgb = XGBRegressor()
# Create cross-validation splitter
cv = TimeSeriesSplit(n_splits=5)
# Train & test model
cross_val_results = cross_val_score(xgb, X_train, Y_train, cv=cv, scoring='neg_mean_squared_error')
print(cross_val_results.mean())
model = xgb.fit(X_train, Y_train)
# model.feature_importances_;

from xgboost import XGBRegressor

# Get data
Y_train = train_df['price_doc'].values
if df[heads[i]].dtypes == 'O': df[heads[i]] = lb_make.fit_transform(df[heads[i]].astype(str)) #extracting input and output features X = df.iloc[:, :-1].values Y = df.iloc[:, -1].values # prepare configuration for cross validation test harness seed = 7 # prepare models #model evaluation models = [] models.append(('XGBoost', XGBRegressor())) models.append(('GBR', ensemble.GradientBoostingRegressor(loss='quantile', alpha=0.1, n_estimators=250, max_depth=3, learning_rate=.1, min_samples_leaf=9, min_samples_split=9))) models.append(('RFR', RandomForestRegressor())) # evaluate each model in turn results = [] names = []
X_train = tr_user[features].replace([np.inf,np.nan], 0).reset_index(drop=True) X_test = ts_user[features].replace([np.inf,np.nan], 0).reset_index(drop=True) y_train = tr_user["loan_sum"].reset_index(drop=True) # Caution! All models and parameter values are just # demonstrational and shouldn't be considered as recommended. # Initialize 1-st level models. models = [ ExtraTreesRegressor(random_state = 0, n_jobs = -1, n_estimators = 300, max_depth = 3), RandomForestRegressor(random_state = 0, n_jobs = -1, n_estimators = 300, max_depth = 3), XGBRegressor(seed = 0, learning_rate = 0.05, n_estimators = 300, max_depth = 3), LGBMRegressor(num_leaves = 8, learning_rate = 0.05, n_estimators= 300) ] # Compute stacking features S_train, S_test = stacking(models, X_train, y_train, X_test, regression = True, metric = mean_squared_error, n_folds = 5, shuffle = True, random_state = 0, verbose = 2) # Fit 2-nd level model model = LGBMRegressor(num_leaves = 8, learning_rate = 0.05, n_estimators= 300) model = model.fit(S_train, y_train) y_pred = model.predict(S_test) id_test = ts_user['uid']
from tpot.builtins import StackingEstimator from xgboost import XGBRegressor # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:-16.688023353137517 exported_pipeline = make_pipeline( StackingEstimator(estimator=XGBRegressor(learning_rate=0.01, max_depth=1, min_child_weight=16, n_estimators=100, nthread=1, subsample=0.55)), StackingEstimator( estimator=GradientBoostingRegressor(alpha=0.9, learning_rate=0.001, loss="ls", max_depth=2, max_features=0.7500000000000001, min_samples_leaf=12, min_samples_split=17, n_estimators=100, subsample=1.0)), Nystroem(gamma=0.25, kernel="laplacian", n_components=10), GradientBoostingRegressor(alpha=0.95, learning_rate=0.1,
x_data = train.iloc[:, :71] y_data = train.iloc[:, -4:] x_data = x_data.fillna(x_data.mean()) test = test.fillna(test.mean()) x = x_data.values y = y_data.values x_pred = test.values x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=33) model = MultiOutputRegressor(XGBRegressor()) model.fit(x_train, y_train) y_pred1 = model.predict(x_test) print('mae: ', mean_absolute_error(y_test, y_pred1)) ## feature_importances def plot_feature_importances(model): plt.figure(figsize=(10, 40)) n_features = x_data.shape[1] # n_features = column개수 plt.barh( np.arange(n_features), model.feature_importances_, # barh : 가로방향 bar chart
rf_grid = my_search.predict(X_trans)
rf_grid_rmsle = RMSLe_(y_train_trans, rf_grid)
output = output.append(
    {
        "model": "RF grid search (max_depth 8)",
        "R2 mean": ranked_res1["mean_test_score"][4],
        "R2 std": ranked_res1["std_test_score"][4],
        "RMSLE": rf_grid_rmsle
    },
    ignore_index=True)

# Gradient Boosting
xgb = XGBRegressor(n_estimators=50, max_depth=5, learning_rate=0.1, random_state=42)
xgb.fit(X_trans, y_train_trans)
cross_val_xgb = cross_val_score(xgb, X_trans, y_train_trans, cv=5)
pred_xgb = xgb.predict(X_trans)
rmsle_xgb = RMSLe_(y_train_trans, pred_xgb)
output = output.append(
    {
        "model": "GB 0.1 ",
        "R2 mean": cross_val_xgb.mean(),
        "R2 std": cross_val_xgb.std(),
        "RMSLE": rmsle_xgb
    },
#     'min_child_weight': np.linspace(200, 250, 5, dtype='int32'),
      ### Third param tuning
#     'gamma': np.linspace(0.0, 0.5, 5),
      ### Fourth param tuning
#     'subsample': np.linspace(0.6, 0.9, 4),
#     'colsample_bytree': np.linspace(0.6, 0.9, 4),
      ### Fifth param tuning
#     'reg_alpha': np.linspace(1e-5, 100, 5)
}

xgbr = XGBRegressor(
    nthread=25,
    seed=42,
    learning_rate=0.02,
    n_estimators=3000,
    max_depth=11,
    min_child_weight=225,
    gamma=0.125,
    colsample_bytree=0.6,
    subsample=0.9,
    reg_alpha=25,
)
"""

fit_params_xgb = {'eval_metric': 'rmse',
                  'early_stopping_rounds': 30,
                  'verbose': False,
                  'eval_set': [(X_test, y_test)],
                  }

bag = BaggingRegressor(xgbr, n_estimators=5, max_samples=0.85,
class PrudentialRegressorFO(BaseEstimator, RegressorMixin): def __init__(self, objective='reg:linear', learning_rate=0.045, min_child_weight=50, subsample=0.8, colsample_bytree=0.7, max_depth=7, n_estimators=700, nthread=-1, seed=0, n_buckets=8, initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6, #1., 2., 3., 4., 5., 6., 7. ], minimizer='BFGS', scoring=NegQWKappaScorer): self.objective = objective self.learning_rate = learning_rate self.min_child_weight = min_child_weight self.subsample = subsample self.colsample_bytree = colsample_bytree self.max_depth = max_depth self.n_estimators = n_estimators self.nthread = nthread self.seed = seed self.n_buckets = n_buckets self.initial_params = initial_params self.minimizer = minimizer self.scoring = scoring return def fit(self, X, y): from xgboost import XGBRegressor if not KAGGLE: from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor self.xgb = XGBRegressor( objective=self.objective, learning_rate=self.learning_rate, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, missing=0.0, seed=self.seed) from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets, # basinhopping=True, initial_params=self.initial_params, minimizer=self.minimizer, scoring=self.scoring) self.xgb.fit(X, y) tr_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration) print('Train score is:', -self.scoring(tr_y_hat, y)) self.off.fit(tr_y_hat, y) print("Offsets:", self.off.params) return self def predict(self, X): from numpy import clip te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration) return clip(self.off.predict(te_y_hat), 1, 8) pass
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)

plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error")
# This call is required to display the finished plot on screen
plt.show()
# Save the current figure to the file result.png
#plt.savefig('./xgboostparams.png')
'''

# XGBRegressor 91 16889
print("XGBRegressor")
xgb = XGBRegressor(max_depth=6, n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y, xgb.predict(val_x)))

# gbdt
print("GradientBoostingRegressor")
gbdt = GradientBoostingRegressor(n_estimators=1000, max_leaf_nodes=400)
gbdt.fit(X, y)  # 17083
# RandomForestRegressor 93 16938
# GradientBoostingRegressor 90 16866
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y, gbdt.predict(val_x)))

# xgb & gbdt
def ValidateTrainTestErrorsWithDifferentModels(cvX_train, cvX_test, cvy_train, cvy_test,X_train,y_train,X_test): clfs = list() cvClfs = list() print "Building RF1" rfShortCV = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0) rfShort = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0) rfShortCV.fit(cvX_train, cvy_train); print 'RF1 CV Results :',mean_absolute_error(cvy_test,rfShortCV.predict(cvX_test)) pd.DataFrame({"Actual":cvy_test, "Predicted":rfShortCV.predict(cvX_test)}).to_csv("snehaRF.csv", index=False,header=True); rfShort.fit(X_train,y_train) cvClfs.append(rfShortCV) clfs.append(rfShort) pd.DataFrame({"ID":out_id, "Expected":rfShort.predict(X_test)}).to_csv("subRF1.csv", index=False,header=True); print "Building SVM" clfSVRCV = SVR(C=10.0) clfSVR = SVR(C=10.0) clfSVRCV.fit(cvX_train, cvy_train); print 'SVM CV Results :',mean_absolute_error(cvy_test,clfSVRCV.predict(cvX_test)) pd.DataFrame({"Actual":cvy_test, "Predicted":clfSVRCV.predict(cvX_test)}).to_csv("snehaSVR.csv", index=False,header=True); print "Building RF2" rfLongCV = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0) rfLong = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0) rfLongCV.fit(cvX_train, cvy_train); print 'RF2 CV Results :',mean_absolute_error(cvy_test,rfLongCV.predict(cvX_test)) rfLong.fit(X_train,y_train) cvClfs.append(rfLongCV) clfs.append(rfLong) pd.DataFrame({"ID":out_id, "Expected":rfLong.predict(X_test)}).to_csv("subRF2.csv", index=False,header=True); print "Building GB1" regGBCV1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad') regGBCV1.fit(cvX_train, cvy_train); print 'GB1 CV Results :',mean_absolute_error(cvy_test,regGBCV1.predict(cvX_test)) regGB1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad') regGB1.fit(X_train,y_train) cvClfs.append(regGBCV1) clfs.append(regGB1) pd.DataFrame({"ID":out_id, "Expected":regGB1.predict(X_test)}).to_csv("subGB1.csv", index=False,header=True); print 'Building GB2' regGBCV2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad') regGBCV2.fit(cvX_train, cvy_train); print 'GB2 CV Results :',mean_absolute_error(cvy_test,regGBCV2.predict(cvX_test)) regGB2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad') regGB2.fit(X_train,y_train) cvClfs.append(regGBCV2) clfs.append(regGB2) pd.DataFrame({"ID":out_id, "Expected":regGB2.predict(X_test)}).to_csv("subGB2.csv", index=False,header=True); print 'Feature Importances RF1:',sorted(zip(map(lambda x: round(x, 4), rfShort.feature_importances_), df_final.columns),reverse=True); print 'Feature Importances GB1:',sorted(zip(map(lambda x: round(x, 4), 
regGB1.feature_importances_), df_final.columns),reverse=True); print 'Feature Importances RF2:',sorted(zip(map(lambda x: round(x, 4), rfLong.feature_importances_), df_final.columns),reverse=True); print 'Feature Importances GB2:',sorted(zip(map(lambda x: round(x, 4), regGB2.feature_importances_), df_final.columns),reverse=True); print "Building XGB1" xgbCV1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None, learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7) xgbCV1.fit(cvX_train, cvy_train); xgb1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None, learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7) xgb1.fit(X_train,y_train); print 'XGB1 Model CV :',mean_absolute_error(cvy_test,xgbCV1.predict(cvX_test)); cvClfs.append(xgbCV1) clfs.append(xgb1) pd.DataFrame({"ID":out_id, "Expected":xgb1.predict(X_test)}).to_csv("subXGB1.csv", index=False,header=True); print "Building XGB2" params = {} params["objective"] = "reg:linear" params["learning_rate"] = 0.005 params["min_child_weight"] = 6 params["subsample"] = 0.7 params["colsample_bytree"] = 0.75 params["silent"] = 1 params["max_depth"] = 7 params["n_estimators"] = 3000 params['gamma'] = 1.25 params['nthread'] = -1 print 'XGBoost Training Process Started' xgbCV2 = XGBRegressor(**params); xgbCV2.fit(cvX_train, cvy_train); print 'XGB Model CV :',mean_absolute_error(cvy_test,xgbCV2.predict(cvX_test)); xgb2 = XGBRegressor(**params); xgb2.fit(X_train,y_train); cvClfs.append(xgbCV2) clfs.append(xgb2) pd.DataFrame({"ID":out_id, "Expected":xgb2.predict(X_test)}).to_csv("subXGB2.csv", index=False,header=True); # Return the cross validated models and the actual fitted models separately. return [clfs,cvClfs];
###################################################################
# XGBoost Model Building
###################################################################
"""
I will build and test the models on Y2, since this is where the most
performance improvement can be gained.
Benchmark - MSE of 0.056
"""

# Basic Model Building - Ch.4
xgb_1 = XGBRegressor()
xgb_1.fit(X_train, Y_train)
y_pred = xgb_1.predict(X_test)
predictions = [round(value) for value in y_pred]
MSE_1 = mean_squared_error(Y_test, predictions)
print("MSE is " + str(MSE_1))
plot_tree(xgb_1)

# Model Using KFold Cross Validation
xgb_2 = XGBRegressor()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)  # random_state requires shuffle=True in recent sklearn
results = cross_val_score(xgb_2, X_train, Y_train, cv=kfold)
xgb_2.fit(X_train, Y_train)
Y_pred = xgb_2.predict(X_test)
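# --- Added sketch, not part of the original script: cross_val_score defaults to R^2 for
# --- regressors, so to compare against the 0.056 MSE benchmark quoted above, score
# --- explicitly with negative MSE and flip the sign.
from sklearn.model_selection import KFold, cross_val_score

cv_mse = -cross_val_score(XGBRegressor(), X_train, Y_train,
                          cv=KFold(n_splits=10, shuffle=True, random_state=7),
                          scoring='neg_mean_squared_error')
print("CV MSE: %.4f (+/- %.4f)" % (cv_mse.mean(), cv_mse.std()))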
def get_fitted_clf(X_train, Y_train, features):
    clf = XGBRegressor(seed=0)
    clf.fit(X_train[features], Y_train)
    return clf
train = train.drop(['total_sales', 'outlet_no'], axis=1)
outlet = test.outlet_no
test = test.drop('outlet_no', axis=1)

# In[199]:

from xgboost import XGBRegressor
# sklearn.cross_validation / grid_search were removed; use model_selection instead
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# In[209]:

model = XGBRegressor()
learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
n_estimators = [100, 200, 300, 400, 500]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
# note: StratifiedKFold expects a categorical target; the folds now receive y at split time
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, cv=kfold)

# In[210]:

result = grid_search.fit(train, y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))

# In[211]:
    'max_depth': (2, 6,),             # default 3
    'n_estimators': (50, 100, 150,),  # default 100
    'subsample': (0.6, 0.4,),
}]

est = XGBRegressor(random_state=69)
gs = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2,
                  n_jobs=n_jobs, scoring='r2')

#params = {
#    "colsample_bytree": uniform(0.7, 0.3),
#    "gamma": uniform(0, 0.5),
#    "learning_rate": uniform(0.03, 0.3),  # default 0.1
#    "max_depth": randint(2, 6),           # default 3
#    "n_estimators": randint(100, 150),    # default 100
#    "subsample": uniform(0.6, 0.4)
#}
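# --- Added sketch, not part of the original file: the commented-out scipy.stats
# --- distributions above are the form RandomizedSearchCV expects; n_iter here is an
# --- arbitrary choice for illustration.
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

rand_params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3),  # default 0.1
    "max_depth": randint(2, 6),           # default 3
    "n_estimators": randint(100, 150),    # default 100
    "subsample": uniform(0.6, 0.4),
}
rs = RandomizedSearchCV(XGBRegressor(random_state=69), param_distributions=rand_params,
                        n_iter=50, cv=10, scoring='r2', n_jobs=n_jobs, verbose=2)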
model_GBoost = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.03, max_depth=3, max_features=0.4, min_samples_leaf=20, min_samples_split=10, loss='huber', random_state=seed) model_xgb = XGBRegressor(colsample_bytree=0.35, gamma=0.027, learning_rate=0.03, max_depth=4, min_child_weight=1.7817, n_estimators=3000, reg_alpha=0.43, reg_lambda=0.88, subsample=0.5213, silent=1, random_state=seed) model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=10, learning_rate=0.03, n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9,
{ "n_estimators": [90, 100, 110], "learning_rate": [0.001, 0.01, 0.1], "max_depth": [4, 5, 6], "colsample_bytree": [0.6, 0.9, 1] }, { "n_estimators": [90, 110], "learning_rate": [0.001, 0.1, 0.5], "max_depth": [4, 5, 6], "colsample_bytree": [0.6, 0.9, 1], "colsample_bylevel": [0.6, 0.7, 0.9] }, ] model = GridSearchCV(XGBRegressor(), parameters, cv=kfold) model.fit(x_train, y_train) print('최적의 매개변수 :', model.best_estimator_) y_pred = model.predict(x_test) print('최종 정답률 :', r2_score(y_test, y_pred)) print('최종 정답률 :', model.score(x_test, y_test)) ''' 최적의 매개변수 : XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.6, colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.1, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=90, n_jobs=8, num_parallel_tree=1, random_state=0,
x, y = load_boston(return_X_y=True)  # scikit-learn returns x and y directly

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=66)

parameter = [{
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.3, 0.001, 0.01],
    'max_depth': [4, 5, 6]
}]

model = RandomizedSearchCV(XGBRegressor(n_jobs=8), parameter)
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print('r2: ', score)

thresholds = np.sort(model.best_estimator_.feature_importances_)  # feature importances sorted in ascending order
# print(model.best_estimator_.feature_importances_)
print(thresholds)  # these values sum to 1 (13 columns)

# r2:  0.9188116974777065
# 0.02678531 0.03278282 0.03606399 0.04534625 0.05393368 0.27339098
# 0.4654915 ]

model = model.best_estimator_  # the search object already holds best_estimator_, so no need to refit a new XGB model
shuffle=False) X_train, X_mean, X_std = normalize(X_train) X_test = normalize_test(X_test, X_mean, X_std) y_train, y_mean, y_std = normalize(y_train) # y_test = normalize_test(y_test, y_mean, y_std) # ============== # MODEL CREATION # ============== svr_model = SVR() rf_model = RandomForestRegressor(n_estimators=100) adb_model = AdaBoostRegressor(n_estimators=100) xgb_model = XGBRegressor() svr_model.fit(X_train, y_train) joblib.dump( svr_model, path + 'models/' + str(data_interval) + 'min/svr_' + stock + '.pkl') # svr_model = joblib.load(path+'models/'+str(data_interval)+'min/svr_'+stock+'.pkl') rf_model.fit(X_train, y_train) joblib.dump( rf_model, path + 'models/' + str(data_interval) + 'min/rf_' + stock + '.pkl') # rf_model = joblib.load(path+'models/'+str(data_interval)+'min/rf_'+stock+'.pkl') adb_model.fit(X_train, y_train) joblib.dump(
def __init__(self): self.classifier_param_list = [ { "model": [DecisionTreeClassifier()], "model__min_samples_split": [0.25, 0.5, 1.0], "model__max_depth": [5, 10, 15], }, { "model": [RandomForestClassifier()], "model__min_samples_split": [0.25, 0.5, 1.0], "model__max_depth": [5, 10, 15], }, { "model": [MLPClassifier()], "model__activation": ["identity", "logistic", "tanh", "relu"], "model__alpha": [0.001, 0.01, 0.1], }, { "model": [LogisticRegression(fit_intercept=False)], "model__C": [1, 5, 10], }, { "model": [BaggingClassifier()], "model__n_estimators": [5, 10, 15], "model__max_features": [0.25, 0.5, 1.0], }, { "model": [AdaBoostClassifier()], "model__n_estimators": [5, 10, 15], "model__learning_rate": [0.001, 0.01, 0.1], }, { "model": [XGBClassifier()], "model__n_estimators": [5, 10, 15], "model__learning_rate": [0.001, 0.01, 0.1], }, { "model": [lgb.LGBMClassifier()], "model__learning_rate": [0.01], }, { "model": [CatBoostClassifier()], "model__learning_rate": [0.01], }, ] self.regressor_param_list = [ { "model": [DecisionTreeRegressor()], "model__min_samples_split": [0.25, 0.5, 1.0], "model__max_depth": [5, 10, 15], }, { "model": [RandomForestRegressor()], "model__min_samples_split": [0.25, 0.5, 1.0], "model__max_depth": [5, 10, 15], }, { "model": [MLPRegressor()], "model__activation": ["identity", "logistic", "tanh", "relu"], "model__alpha": [0.001, 0.01, 0.1], }, { "model": [ElasticNet(fit_intercept=False)], "model__alpha": [0.001, 0.01, 0.1], "model__l1_ratio": [0.25, 0.5, 1.0], }, { "model": [BaggingRegressor()], "model__n_estimators": [5, 10, 15], "model__max_features": [0.25, 0.5, 1.0], }, { "model": [AdaBoostRegressor()], "model__n_estimators": [5, 10, 15], "model__learning_rate": [0.001, 0.01, 0.1], }, { "model": [XGBRegressor()], "model__n_estimators": [5, 10, 15], "model__learning_rate": [0.001, 0.01, 0.1], }, { "model": [lgb.LGBMRegressor()], "model__learning_rate": [0.01], }, { "model": [CatBoostRegressor()], "model__learning_rate": [0.01], }, ]
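# --- Added usage sketch, not from the original class: the "model__" prefixes above
# --- imply a Pipeline step named "model", and scikit-learn's GridSearchCV accepts a
# --- list of grids directly. `tuner` stands in for an instance of the class defined
# --- above (its name is not shown in this fragment).
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

pipe = Pipeline([("model", DecisionTreeRegressor())])
search = GridSearchCV(pipe, param_grid=tuner.regressor_param_list, cv=5, n_jobs=-1)
# search.fit(X, y); search.best_estimator_ is the winning model with its tuned settings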
lgbm_parameter = [
    {
        'n_estimators': [10000],
        'learning_rate': [0.001, 0.01, 0.0025, 0.075]
    },
]
lgbm_fit_params = {
    'verbose': False,
    'eval_metric': ["logloss", "rmse"],
    'eval_set': [(x_train, y_train), (x_test, y_test)],
    'early_stopping_rounds': 20
}

#### XGB select
start1 = time.time()

model_XGB = XGBRegressor()
model_XGB.fit(x_train, y_train)
score = model_XGB.score(x_test, y_test)
print("r2 : ", score)

thresholds = np.sort(model_XGB.feature_importances_)
print(thresholds)
print(x_train.shape)
print("========================")

best_x_train = x_train
best_x_test = x_test  # the original assigned x_test to best_x_train, discarding the train split
best_score = score
best_model = model_XGB
    mea = getmea(max_leaf_nodes, train_x, val_x, train_y, val_y)
    print("Max_leaf_nodes: %d ,mea: %d" % (max_leaf_nodes, mea))
'''

# clf = XGBRegressor() 17165
# XGBRegressor(n_estimators=400) 16330
'''
params = [.02, .03, .04, .05, .06, .07, .08, .09, .10]  # [1:1001:50][100,200,300,400,500]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=400, learning_rate=param)
    test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("learning_rate vs CV Error" + str(params))
# This call is required to display the finished plot on screen
plt.show()
'''

my_model = XGBRegressor(n_estimators=400)
my_model.fit(train_X, train_y, verbose=False)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

# save model
#joblib.dump(melbourne_model, 'model.pickle')
# load model
#model = joblib.load('model.pickle')
sc_X = MinMaxScaler()
X = sc_X.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

grid_params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',
    'learning_rate': 0.05,
    'max_depth': 10,
    'gamma': 0,
    'min_child_weight': 1,
    'grow_policy': 'lossguide',
    'silent': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'n_estimators': 100,
    'tree_method': 'gpu_exact',
}

estimator = XGBRegressor(**grid_params)
estimator.fit(X_train, Y_train)
Y_predict = estimator.predict(X_test)
final_score = RMSLE(Y_predict, Y_test)
print('Final score on the test set (RMSLE): ')
print(final_score)
class Blending(BaseEnsembleModel): def __init__(self, stats, ensemble_size: int, task_type: int, metric: _BaseScorer, output_dir=None, meta_learner='xgboost'): super().__init__(stats=stats, ensemble_method='blending', ensemble_size=ensemble_size, task_type=task_type, metric=metric, output_dir=output_dir) try: from xgboost import XGBClassifier except: warnings.warn( "Xgboost is not imported! Blending will use linear model instead!" ) meta_learner = 'linear' # We use Xgboost as default meta-learner if self.task_type in CLS_TASKS: if meta_learner == 'linear': from sklearn.linear_model.logistic import LogisticRegression self.meta_learner = LogisticRegression(max_iter=1000) elif meta_learner == 'gb': from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier self.meta_learner = GradientBoostingClassifier( learning_rate=0.05, subsample=0.7, max_depth=4, n_estimators=250) elif meta_learner == 'xgboost': from xgboost import XGBClassifier self.meta_learner = XGBClassifier(max_depth=4, learning_rate=0.05, n_estimators=150) else: if meta_learner == 'linear': from sklearn.linear_model import LinearRegression self.meta_learner = LinearRegression() elif meta_learner == 'xgboost': from xgboost import XGBRegressor self.meta_learner = XGBRegressor(max_depth=4, learning_rate=0.05, n_estimators=70) def fit(self, data): # Split training data for phase 1 and phase 2 test_size = 0.2 # Train basic models using a part of training data model_cnt = 0 suc_cnt = 0 feature_p2 = None for algo_id in self.stats["include_algorithms"]: train_list = self.stats[algo_id]['train_data_list'] configs = self.stats[algo_id]['configurations'] for idx in range(len(train_list)): X, y = train_list[idx].data if self.task_type in CLS_TASKS: x_p1, x_p2, y_p1, y_p2 = train_test_split( X, y, test_size=test_size, stratify=data.data[1], random_state=self.seed) else: x_p1, x_p2, y_p1, y_p2 = train_test_split( X, y, test_size=test_size, random_state=self.seed) for _config in configs: if self.base_model_mask[model_cnt] == 1: estimator = fetch_predict_estimator( self.task_type, _config, x_p1, y_p1) with open( os.path.join( self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt)), 'wb') as f: pkl.dump(estimator, f) if self.task_type in CLS_TASKS: pred = estimator.predict_proba(x_p2) n_dim = np.array(pred).shape[1] if n_dim == 2: # Binary classificaion n_dim = 1 # Initialize training matrix for phase 2 if feature_p2 is None: num_samples = len(x_p2) feature_p2 = np.zeros( (num_samples, self.ensemble_size * n_dim)) if n_dim == 1: feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2] else: feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred else: pred = estimator.predict(x_p2).reshape(-1, 1) n_dim = 1 # Initialize training matrix for phase 2 if feature_p2 is None: num_samples = len(x_p2) feature_p2 = np.zeros( (num_samples, self.ensemble_size * n_dim)) feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred suc_cnt += 1 model_cnt += 1 self.meta_learner.fit(feature_p2, y_p2) return self def get_feature(self, data, solvers): # Predict the labels via blending feature_p2 = None model_cnt = 0 suc_cnt = 0 for algo_id in self.stats["include_algorithms"]: train_list = self.stats[algo_id]['train_data_list'] configs = self.stats[algo_id]['configurations'] for train_node in train_list: test_node = solvers[algo_id].optimizer['fe'].apply( data, train_node) for _ in configs: if self.base_model_mask[model_cnt] == 1: with open( os.path.join( self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt)), 
'rb') as f: estimator = pkl.load(f) if self.task_type in CLS_TASKS: pred = estimator.predict_proba(test_node.data[0]) n_dim = np.array(pred).shape[1] if n_dim == 2: # Binary classificaion n_dim = 1 # Initialize training matrix for phase 2 if feature_p2 is None: num_samples = len(data.data[0]) feature_p2 = np.zeros( (num_samples, self.ensemble_size * n_dim)) if n_dim == 1: feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2] else: feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred else: pred = estimator.predict( test_node.data[0]).reshape(-1, 1) n_dim = 1 # Initialize training matrix for phase 2 if feature_p2 is None: num_samples = len(data.data[0]) feature_p2 = np.zeros( (num_samples, self.ensemble_size * n_dim)) feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred suc_cnt += 1 model_cnt += 1 return feature_p2 def predict(self, data, solvers): feature_p2 = self.get_feature(data, solvers) # Get predictions from meta-learner if self.task_type in CLS_TASKS: final_pred = self.meta_learner.predict_proba(feature_p2) else: final_pred = self.meta_learner.predict(feature_p2) return final_pred
        verbose=1)
    grid_search_obj.fit(x_train, y_train)
    print('The following is the best parameter setting for this problem:')
    print(grid_search_obj.best_params_)
    print('Training score on the best estimator: {}'.format(
        grid_search_obj.best_score_))
    return grid_search_obj.best_estimator_


if __name__ == '__main__':
    import pandas as pd
    from xgboost import XGBRegressor

    # Read data from a file and split into data and labels.
    path_to_file = 'OnlineNewsPopularity/OnlineNewsPopularity.csv'
    data = pd.read_csv(path_to_file, header=0).drop('url', axis=1)
    labels = pd.Series(data.pop(' shares'))

    # Split dataset into train and test sets.
    x_train, x_test, y_train, y_test = split_data(data, labels)

    # Create an XGBRegressor object
    clf = XGBRegressor(objective='reg:gamma', n_jobs=-1, random_state=241093)

    # Perform grid search on a parameter grid
    best_clf = grid_search(clf, x_train, y_train)
    def __init__(self):
        self.model = XGBRegressor()
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_selection import SelectFromModel  # selects feature columns
from sklearn.metrics import r2_score, accuracy_score

x, y = load_boston(return_X_y=True)  # scikit-learn returns x and y directly

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=66)

model = XGBRegressor(n_jobs=8)
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print('r2: ', score)

thresholds = np.sort(model.feature_importances_)  # feature importances sorted in ascending order
print(thresholds)  # these values sum to 1 (13 columns)

# r2: 0.9221188601856797
# [0.00134153 0.00363372 0.01203115 0.01220458 0.01447935 0.01479119
#  0.0175432  0.03041655 0.04246345 0.0518254  0.06949984 0.30128643
#  0.42848358]

for thresh in thresholds:  # 13 columns in total, so 13 training runs
    selection = SelectFromModel(
from utilities import data_prep if __name__ == '__main__': # Preprocess data for xgboost. train_xg = pd.read_csv('../data/train.csv') train_xg_x, train_xg_y = data_prep.data_prep_log(train_xg) test_xg = pd.read_csv('../data/test.csv') #TODO: need to preprocess the data just like the train set. test_xg_x, test_xg_y = data_prep.data_prep_log(test_xg, False) # Training xgboost on CV set and predict using out-of-fold prediction xgboosting = XGBRegressor(n_estimators=5000, \ learning_rate=0.05, \ gamma=2, \ max_depth=12, \ min_child_weight=1, \ colsample_bytree=0.5, \ subsample=0.8, \ reg_alpha=1, \ objective='reg:linear', \ base_score = 7.76) #res = xgb.cv( # colsample_bytree = 0.5, # subsample = 0.8, # eta = 0.05, # replace this with 0.01 for local run to achieve 1113.93 # objective = 'reg:linear', # max_depth = 12, # alpha = 1, # gamma = 2, # min_child_weight = 1, # base_score = 7.76
xbin1 = np.repeat(x[(y >= 1.0) & (y < 1.5)], 7, axis=0) xbin2 = np.repeat(x[(y >= 1.5) & (y < 2.0)], 5, axis=0) xbin3 = np.repeat(x[(y >= 2.0) & (y < 2.5)], 1, axis=0) xbin4 = np.repeat(x[(y >= 2.5) & (y <= 3)], 1, axis=0) x = np.vstack((xbin1, xbin2, xbin3, xbin4)) ybin1 = np.repeat(y[(y >= 1.0) & (y < 1.5)], 7, axis=0) ybin2 = np.repeat(y[(y >= 1.5) & (y < 2.0)], 5, axis=0) ybin3 = np.repeat(y[(y >= 2.0) & (y < 2.5)], 1, axis=0) ybin4 = np.repeat(y[(y >= 2.5) & (y <= 3)], 1, axis=0) y = np.concatenate((ybin1, ybin2, ybin3, ybin4)) x, y = shuffle(x, y) x = normalize(x) x_test = normalize(x_test) est = MetaRegressor() #print(do_keras(*train_test_split(x, y, test_size=0.25))) base_cross_val(est, x, y) base_cross_val(XGBRegressor(n_estimators=500), x, y) ''' est.fit(x, y) y_pred = est.predict(x_test) pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('new_meta_submission.csv',index=False) '''
# -*- coding: utf-8 -*-
from xgboost import XGBRegressor
import pandas as pd

train = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\train.csv")
test = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\test.csv")

train.drop('ID', axis=1, inplace=True)
y_train = train.pop('target')
pred_index = test.pop('ID')

reg = XGBRegressor()
reg.fit(train, y_train)
y_pred = reg.predict(test)

submit = pd.DataFrame()
submit['ID'] = pred_index
submit['target'] = y_pred
submit.to_csv('my_XGB_prediction.csv', index=False)
df = pd.concat([df_train, df_test], ignore_index=True)  # DataFrame.append was removed in recent pandas

# basic inspection
df_train.shape, df_test.shape, df_train.columns.values

# Feature selection
X_train, y_train = df_train.loc[:, [
    'voltage_min', 'current', 'soc', 'temperature_max'
]], df_train.loc[:, ['age']]
X_test, y_test = df_test.loc[:, [
    'voltage_min', 'current', 'soc', 'temperature_max'
]], df_test.loc[:, ['age']]

xgb = XGBRegressor()
xgb.fit(X_train, y_train)

imp = pd.DataFrame(xgb.feature_importances_, columns=['Importance'], index=X_train.columns)
imp = imp.sort_values(['Importance'], ascending=False)
print(imp)


# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))


# Define a function to calculate negative RMSE (as a score)
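# --- Added sketch (the original file is cut off at this point): one way to expose the
# --- rmse() helper above as a "higher is better" score for cross_val_score/GridSearchCV.
from sklearn.metrics import make_scorer

neg_rmse_scorer = make_scorer(rmse, greater_is_better=False)  # scorer returns -RMSE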
encoded_cat = np.concatenate((encoded_cat, feature), axis=1)
X = np.concatenate((encoded_cat, cont), axis=1)

seed = 3
test_size = .3
X_train, X_test, y_train, y_test = train_test_split(X, log_loss, test_size=test_size, random_state=seed)

model = XGBRegressor(learning_rate=0.08,
                     max_depth=10,
                     objective='reg:linear',
                     nthread=3,
                     gamma=0.2,
                     subsample=0.9,
                     n_estimators=100,
                     )
model.fit(X_train, y_train)
print(model)
y_pred = model.predict(X_test)


def mae(predicted, actual, logscale=False):
    if logscale:
        predexp = np.exp(predicted)
        actualexp = np.exp(actual)
        return np.mean(np.abs(predexp - actualexp))
    else:
        return np.mean(np.abs(predicted - actual))
from sklearn.neural_network import MLPRegressor from sklearn.neighbors import KNeighborsRegressor from xgboost import XGBRegressor ### Algorithm list algorithms = [ LinearRegression(), RandomForestRegressor(), AdaBoostRegressor(), GradientBoostingRegressor(), SGDRegressor(), SVR(), MLPRegressor(), KNeighborsRegressor(), BaggingRegressor(), XGBRegressor() ] if best_algo == 'LinearRegression': algo = getattr(sklearn.linear_model, best_algo)() if best_algo == 'SGDRegressor': algo = getattr(sklearn.linear_model, best_algo)() if (best_algo == 'RandomForestRegressor') or (best_algo == 'AdaBoostRegressor') or ( best_algo == 'GradientBoostingRegressor') or (best_algo == 'BaggingRegressor'): algo = getattr(sklearn.ensemble, best_algo)()
params = [100, 200, 300, 400, 500, 600, 700, 800, 1000]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)

plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error")
# This call is required to display the finished plot on screen
plt.show()
# Save the current figure to the file result.png
#plt.savefig('./xgboostparams.png')

# 91 16889
xgb = XGBRegressor(max_depth=6, n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y, xgb.predict(val_x)))

# gbdt
'''
print("GradientBoostingRegressor")
gbdt = GradientBoostingRegressor(n_estimators=1000, max_leaf_nodes=400)
gbdt.fit(X, y)  # 17083
# RandomForestRegressor 93 16938
# GradientBoostingRegressor 90 16866
# XGBRegressor 100 19939
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y, gbdt.predict(val_x)))
# clf = GridSearchCV(rf, parameters, cv=5)
# clf.fit(train_data, train_labels)
# print(clf.best_params_)
# print(clf.best_score_)
# print(clf.grid_scores_)

parameter_space = [{
    # 'n_estimators': [1100, 1200, 1400, 1600],  # best: 1000
    # 'max_depth': [3, 4, 5, 6],
    # 'learning_rate': [0.1, 0.2]
    'subsample': [0.5, 0.8]
}]

from xgboost import XGBRegressor

xgb = XGBRegressor(learning_rate=0.1, n_estimators=1200, max_depth=4, gamma=0, subsample=0.8)
# clf = GridSearchCV(xgb, param_grid=parameter_space, cv=5)
#
# clf.fit(train_data, train_labels)
# print(clf.grid_scores_)
# print(clf.best_params_)
# print(clf.best_score_)
#
xgb.fit(train_data, train_labels)
preds = xgb.predict(test_data)
print("RMSLE Value For XGB Boost: ", rmsle(test_labels, preds))
def Model(train_linear, test_linear): train_linear_fea=train_linear.drop(columns=['SalePrice']) train_linear_tar=train_linear.SalePrice x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0) def evaluate(model, test_features, test_labels,train_features, train_labels): predictions = model.predict(test_features) errors = abs(predictions - test_labels) mape = 100 * np.mean(errors / test_labels) accuracy = 100 - mape print('Model Performance') print('Average Error: {:0.4f} degrees.'.format(np.mean(errors))) print('Accuracy = {:0.2f}%.'.format(accuracy)) print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train))) print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test))) return accuracy real_train_tar=np.expm1(train_linear_tar) """ . Lasso model """ lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), ) lassocv.fit(train_linear_fea, train_linear_tar) lassocv_score = lassocv.score(train_linear_fea, train_linear_tar) lassocv_alpha = lassocv.alpha_ print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score) start=time.time() lasso =Lasso(normalize = True) lasso.set_params(alpha=lassocv_alpha,max_iter = 10000) lasso.fit(x_train, y_train) end=time.time() mean_squared_error(y_test, lasso.predict(x_test)) coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False) evaluate(lasso,x_test,y_test,x_train,y_train) print('Time elapsed: %.4f seconds' % (end-start)) y_lasso_predict=lasso.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_lasso_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') test_prediction_lasso=np.expm1(lasso.predict(test_linear)) """ . Ridge model """ ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400)) ridgecv.fit(x_train, y_train) ridgecv_score = ridgecv.score(x_train, y_train) ridgecv_alpha = ridgecv.alpha_ print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score) coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False) start=time.time() ridge =Ridge(normalize = True) ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000) ridge.fit(x_train, y_train) end=time.time() mean_squared_error(y_test, ridge.predict(x_test)) coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False) evaluate(ridge,x_test,y_test,x_train,y_train) print('Time elapsed: %.4f seconds' % (end-start)) y_ridge_predict=ridge.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_ridge_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') test_prediction_ridge=np.expm1(ridge.predict(test_linear)) """ . 
Random Forest """ #train=train.drop(columns=['DateSold']) #test=test.drop(columns=['DateSold']) #X_train=train.drop(columns=['SalePrice']) #Y_train=train['SalePrice'] X_train=train_linear_fea Y_train=train_linear_tar x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0) n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)] max_features = ['auto', 'sqrt'] max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] min_samples_split = [2, 5, 10] min_samples_leaf = [1, 2, 4] bootstrap = [True, False] random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap} rf = RandomForestRegressor() # Random search of parameters, using 3 fold cross validation, # search across 100 different combinations, and use all available cores # rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1) rf_random.fit(X_train, Y_train) #rf_random.fit(x_train_rf, y_train_rf) rf_random.best_params_ #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, # we can explicitly specify every combination of settings to try. param_grid = { 'bootstrap': [False], 'max_depth': [80, 90, 100, 110,120,130], 'max_features': [2, 3], 'min_samples_leaf': [1,2,3, 4], 'min_samples_split': [2,4,6,8, 10, 12], 'n_estimators': [600,700, 800, 900, 1000] } # Create a based model rf = RandomForestRegressor() # Instantiate the grid search model grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2) #grid_search.fit(x_train, y_train) grid_search.fit(X_train, Y_train) grid_search.best_params_ best_random = grid_search.best_estimator_ start=time.time() best_random.fit(x_train_rf,y_train_rf) end=time.time() evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf) print('Time elapsed: %.4f seconds' % (end-start)) y_rf_predict=best_random.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_rf_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\ sort_values('imp',ascending=False) importance_top20_rf = importance_rf.iloc[:20,] plt.barh(importance_top20_rf.features, importance_top20_rf.imp) plt.xlabel('Feature Importance') test_prediction_rf=np.expm1(best_random.predict(test_linear)) """ . 
Xgboost """ learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)] # Minimum for sum of weights for observations in a node min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] # Maximum nodes in each tree max_depth = [int(x) for x in np.linspace(1, 10, num = 10)] n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)] subsample=[0.3, 0.4,0.5,0.6, 0.7] model = xgb.XGBRegressor() random_grid = {'learning_rate': learning_rate, 'max_depth': max_depth, 'min_child_weight': min_child_weight, 'subsample': subsample, 'n_estimators':n_estimators } # Make a RandomizedSearchCV object with correct model and specified hyperparams xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1) start = time.time() # Fit models xgb_random.fit(X_train, Y_train) xgb_random.best_params_ """ best_params_={'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 900, 'subsample': 0.5} """ model_xgb = XGBRegressor(**xgb_random.best_params_) #model_xgb = XGBRegressor(**best_params_) start=time.time() model_xgb.fit(x_train_rf,y_train_rf) end=time.time() evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf) print('Time elapsed: %.4f seconds' % (end-start)) y_xgb_predict=model_xgb.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_xgb_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\ sort_values('imp',ascending=False) importance_top20_xgb = importance_xgb.iloc[:20,] plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp) plt.xlabel('Feature Importance') test_prediction_xgb=np.expm1(model_xgb.predict(test_linear)) return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
y_within_cut = (~y_above_cut & ~y_below_cut)
train.fillna(0, inplace=True)

# Generate models...
ridge_1 = Ridge()
ridge_2 = Ridge()
etr = ExtraTreesRegressor(n_estimators=248, max_depth=6, min_samples_leaf=27,
                          max_features=0.6, n_jobs=-1, random_state=seed, verbose=0)
xgb = XGBRegressor(n_estimators=80, nthread=-1, max_depth=3, learning_rate=0.1,
                   reg_lambda=1, subsample=1.0, colsample_bytree=0.5, seed=seed)

print('Training Linear Model...\n', len(linear_features), 'features')
ridge_2.fit(train.loc[y_within_cut, linear_features], train.loc[y_within_cut, 'y'])
ridge_1.fit(
    np.array(train.loc[y_within_cut, linear_features[0]]).reshape(-1, 1),
    train.loc[y_within_cut, 'y'])

print('Training XGBoost Model...\n', len(xgb_features), 'features')
xgb.fit(train[xgb_features], train.y)

print('Training ETR Model...\n', len(etr_features), 'features')
        train_target.append(target)
    else:
        test_dataset.append(row)
        test_target.append(target)

# In[41]:

# Build the model
#model=ExtraTreesRegressor()
#model=RandomForestRegressor()
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#          'learning_rate': 0.01, 'loss': 'ls'}
params = {'n_estimators': 400, 'max_depth': 7}
#model=GradientBoostingRegressor(**params)
model = XGBRegressor(**params)
#model=GaussianNB()
#model=Ridge()
#model=KNeighborsRegressor()
#model=DecisionTreeRegressor()
model.fit(train_dataset, train_target)

# Predict with the model
predictions = model.predict(test_dataset)

# In[51]:

### Cross Validation ###
#cv = StratifiedKFold(train_dataset, n_folds=5)
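# The cell above stops right after predicting. A small sketch of how those predictions could be
# scored; the choice of metrics is an assumption, not part of the original notebook:
from sklearn.metrics import mean_squared_error, r2_score

print('MSE:', mean_squared_error(test_target, predictions))
print('R^2:', r2_score(test_target, predictions))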
posterior = pm.sample(1000, tune=1000)

try:
    pm.traceplot(posterior)
except AttributeError:
    pass
pm.plot_posterior(posterior)
plt.show()

# prediction
yhat = predict(X_test, posterior).T
ols_intercept, ols_theta = ols(X_train, y_train)
ols_yhat = ols_predict(X_test, ols_intercept, ols_theta)

xgr = XGBRegressor()
xgr.fit(X_train, y_train)
xgr_yhat = xgr.predict(X_test)

for i in range(3):
    n = np.random.randint(0, y_test.shape[0])
    sns.kdeplot(yhat[n], label='Bayesian Posterior Predictive_{}'.format(n))
    plt.vlines(x=ols_yhat[n], ymin=0, ymax=10,
               label='manual OLS Prediction_{}'.format(n), colors='blue', linestyles='--')
    plt.vlines(x=y_test.values[n], ymin=0, ymax=10,
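# `predict(X_test, posterior)` is not shown in this excerpt. A minimal sketch, assuming a simple
# Bayesian linear regression whose trace holds 'intercept' and 'theta' draws (both variable names
# are assumptions about the model defined earlier in the script):
import numpy as np

def predict(X, trace):
    intercepts = trace['intercept'][:, None]   # shape (draws, 1)
    thetas = trace['theta']                    # shape (draws, n_features)
    # One prediction per posterior draw and per test row -> shape (draws, n_test).
    return intercepts + thetas @ np.asarray(X).T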
test = test.fillna(0)
train = train[train['Open'] == 1]  # don't train data with open = 0

# Log and Exp
if logexp:
    train['Sales'] = np.log(train['Sales'] + 1)

for f in train[features]:
    if train[f].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

regressor = XGBRegressor(n_estimators=3000, nthread=-1, max_depth=12,
                         learning_rate=0.02, silent=True,
                         subsample=0.9, colsample_bytree=0.7)

start = time.time()
if (gridsearch & sample):  # only do gridsearch if we run with sampled data.
    print("Attempting GridSearchCV for XGB model")
    gscv = GridSearchCV(regressor, {
        'max_depth': [3, 5, 7, 11, 13, 17, 23],
        'n_estimators': [32, 64, 128, 512, 1024, 2048, 4096],
        'learning_rate': [0.15],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.6, 0.7, 0.8]},
        verbose=1, n_jobs=2)
    regressor = gscv.fit(np.array(train), train[goal])
    print(regressor.best_score_)
    print(regressor.best_params_)
else:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import accuracy_score, r2_score

x, y = load_boston(return_X_y=True)
print(x.shape)  # (506, 13)
print(y.shape)  # (506, )

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, random_state=66)

model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

model.fit(x_train, y_train, verbose=True, eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)])
# eval_metric options: rmse, mae, logloss, error (an error of 0.2 means an accuracy of 0.8),
# auc (accuracy/precision; a close relative of accuracy)

results = model.evals_result()
print("eval's results :", results)
# Trained for n_estimators boosting rounds (the number of trees plays the role of epochs).
# Whichever eval metric is chosen is what gets reported
# (validation_0 is the history for the train set, validation_1 for the test set).
# When running 1000 rounds, overfitting sets in at around round 530 (a candidate for early stopping).

y_pred = model.predict(x_test)
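# The comments above note that overfitting sets in around round 530 of 1000. A sketch of the same
# fit with early stopping, so training halts once the validation RMSE stops improving for a while
# (the patience of 20 rounds is an assumption):
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
model.fit(x_train, y_train, verbose=True, eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

y_pred = model.predict(x_test)
print('r2:', r2_score(y_test, y_pred))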
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return

    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #                   basinhopping=True,

        """
        2 / 5 grid scores: mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}; best score: 0.65531
        3 / 5 grid scores: mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}; best score: 0.65474
        4 / 5 grid scores: mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}; best score: 0.65490

        2 / 10 grid scores: mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}; best score: 0.65688
        3 / 10 grid scores: mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}; best score: 0.65705
        4 / 10 grid scores: mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}; best score: 0.65643
        5 / 10 grid scores: mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}; best score: 0.65630
        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self

    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
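# A small usage sketch for the class above; X_train / y_train / X_test stand in for the feature
# frame and the 1-8 response labels prepared elsewhere in the script (names are assumptions):
reg = PrudentialRegressorCVO(n_estimators=700, min_child_weight=240)
reg.fit(X_train, y_train)
preds = reg.predict(X_test)   # XGB predictions passed through the averaged offsets, clipped to [1, 8]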
from sklearn import cross_validation

train = pd.read_csv('../data/train_empty.csv')

features = ['store_nbr', 'item_nbr',
            # 'units',
            'station_nbr', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb',
            'heat', 'cool', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel',
            'resultspeed', 'resultdir', 'avgspeed',
            'HZ', 'FU', 'UP', 'TSSN', 'VCTS', 'DZ', 'BR', 'FG', 'BCFG', 'DU', 'FZRA',
            'TS', 'RA', 'PL', 'GS', 'GR', 'FZDZ', 'VCFG', 'PRFG', 'FG+', 'TSRA',
            'FZFG', 'BLDU', 'MIFG', 'SQ', 'BLSN', 'SN', 'SG',
            # 'month',
            # 'day',
            'day_length']
            # 'sunset_hour',
            # 'sunset_minute',
            # 'sunrise_hour',
            # 'sunrise_minute']

import xgboost

# The sklearn wrapper takes plain arrays (np.nan is treated as missing), so keep X as an
# ndarray here instead of wrapping it in a DMatrix, which cannot be split or scored directly:
# X = xgboost.DMatrix(train[features].values, missing=np.nan)
X = train[features].values
y = train["units"].values
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=0)

clf = XGBRegressor(silent=False)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
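# If the native xgboost API is preferred (which is where DMatrix actually belongs), the
# equivalent flow looks roughly like this; the objective and number of boosting rounds are
# assumptions, not taken from the original script:
dtrain = xgboost.DMatrix(X_train, label=y_train, missing=np.nan)
dtest = xgboost.DMatrix(X_test, label=y_test, missing=np.nan)
booster = xgboost.train({'objective': 'reg:linear', 'max_depth': 6}, dtrain, num_boost_round=100)
units_pred = booster.predict(dtest)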
def XgBoost(train_linear, test_linear):
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]

    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators
                   }
    # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_

    from xgboost import XGBRegressor
    """
    best_params_={'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 900, 'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_xgb = importance_xgb[importance_xgb['features'] != 'Id']
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')

    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))
    write_pkl(xgb_random.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/xgb_params.pkl')
    return test_prediction_xgb
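# `write_pkl` is used above but never defined in this snippet. A minimal sketch of the helper it
# presumably refers to (name and behaviour are assumptions):
import pickle

def write_pkl(obj, path):
    # Serialise any Python object (here, the best hyperparameters) to the given path.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)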