def train(x_train, y_train, x_valid, y_valid, n_estimators_0, objective,
          eval_metric, scoring, rmspe_xg, kfold, esr):
    """Tune an XGBRegressor one parameter group at a time, then train a booster.

    The routine alternates between ``xgb.cv`` (to pick the number of boosting
    rounds with early stopping) and ``GridSearchCV`` (coarse, then fine search
    for each parameter group). Every accepted value is written back to the
    same estimator so later searches build on earlier results.

    Args:
        x_train, y_train: training features/labels; used for the training
            DMatrix and for every grid-search fit.
        x_valid, y_valid: validation features/labels; only used in the
            watchlist of the final ``xgb.train`` call.
        n_estimators_0: upper bound on boosting rounds for every ``xgb.cv`` run.
        objective: forwarded to ``XGBRegressor(objective=...)``.
        eval_metric: forwarded to ``XGBRegressor(eval_metric=...)``.
        scoring: forwarded to ``GridSearchCV(scoring=...)``.
        rmspe_xg: custom evaluation callable passed as ``feval``.
        kfold: number of folds (``nfold`` for ``xgb.cv``, ``cv`` for
            ``GridSearchCV``).
        esr: divisor applied to the current ``n_estimators`` to obtain
            ``early_stopping_rounds``.

    Returns:
        tuple: ``(model_res, reg)`` - the booster returned by ``xgb.train`` and
        the tuned ``XGBRegressor`` carrying the selected parameters.
    """
    # 1 - initial parameter values
    print("1-设置参数初始值")
    reg = XGBRegressor(
        # General parameters
        booster="gbtree",
        silent=1,
        nthread=-1,
        n_jobs=-1,
        # Booster parameters
        learning_rate=0.1,
        n_estimators=n_estimators_0,
        gamma=0,
        max_depth=7,
        min_child_weight=0.001,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0,
        reg_lambda=1,
        max_delta_step=0,
        scale_pos_weight=1,
        # Learning task parameters
        objective=objective,
        eval_metric=eval_metric,
        seed=0)

    def _grid_search(param_grid):
        """Grid-search ``param_grid`` on the current ``reg``; return (best_params, elapsed)."""
        search = GridSearchCV(estimator=reg,
                              param_grid=param_grid,
                              scoring=scoring,
                              n_jobs=-1,
                              iid=False,
                              cv=kfold)
        started = pd.Timestamp.now()
        fitted = search.fit(x_train, y_train)
        return fitted.best_params_, pd.Timestamp.now() - started

    def _tune_n_estimators():
        """Reset ``n_estimators`` to the upper bound and let xgb.cv pick the round count."""
        reg.set_params(n_estimators=n_estimators_0)
        params = reg.get_xgb_params()
        started = pd.Timestamp.now()
        cv_result = xgb.cv(
            params=params,
            dtrain=d_train,
            num_boost_round=params["n_estimators"],
            nfold=kfold,
            feval=rmspe_xg,
            # metrics=eval_metric,
            early_stopping_rounds=int(params["n_estimators"] / esr),
            verbose_eval=None)
        elapsed = pd.Timestamp.now() - started
        # One row per boosting round that survived early stopping.
        n_rounds = cv_result.shape[0]
        reg.set_params(n_estimators=n_rounds)
        return n_rounds, elapsed

    # 2 - best number of weak learners: n_estimators_1
    print("2-训练最优弱分类器个数:n_estimators_1")
    d_train = xgb.DMatrix(x_train, y_train)
    d_valid = xgb.DMatrix(x_valid, y_valid)
    watchlist = [(d_train, "train"), (d_valid, "valid")]
    t_begin = pd.Timestamp.now()
    n_estimators_1, elapsed = _tune_n_estimators()
    print("分类器个数:%s, 用时:%s" % (n_estimators_1, elapsed))

    # 3 - coarse search: learning_rate
    print("3-暴力搜索:learning_rate")
    best, elapsed = _grid_search({"learning_rate": [0.1, 0.2, 0.3]})
    best_param = best["learning_rate"]
    reg.set_params(learning_rate=best_param)
    print("learning_rate:%s, 用时:%s" % (best_param, elapsed))

    # 4 - coarse search: max_depth, min_child_weight
    print("4-暴力搜索:max_depth, min_child_weight")
    best, elapsed = _grid_search({
        "max_depth": [3, 5, 7, 9, 11],
        "min_child_weight": [0.001, 0.01, 0.1, 1]
    })
    best_param_1 = best["max_depth"]
    best_param_2 = best["min_child_weight"]
    # Apply the result: min_child_weight is not searched again later, so
    # without this the tuned value would be discarded.
    reg.set_params(max_depth=best_param_1, min_child_weight=best_param_2)
    print("max_depth:%s,min_child_weight:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 5 - fine search: max_depth
    print("5-精确搜索:max_depth")
    best, elapsed = _grid_search(
        {"max_depth": [best_param_1 - 1, best_param_1, best_param_1 + 1]})
    best_param_1 = best["max_depth"]
    reg.set_params(max_depth=best_param_1)
    print("max_depth:%s,用时:%s" % (best_param_1, elapsed))

    # 6 - coarse search: gamma (applied after the fine search in step 7)
    print("6-暴力搜索:gamma")
    best, elapsed = _grid_search({"gamma": [0, 0.5, 1, 1.5, 2, 2.5]})
    best_param = best["gamma"]
    print("gamma:%s,用时:%s" % (best_param, elapsed))

    # 7 - fine search: gamma
    print("7-精确搜索:gamma")
    if best_param == 0:
        param = {"gamma": [0, 0.1, 0.2, 0.3, 0.4]}
    else:
        param = {"gamma": np.arange(best_param - 0.2, best_param + 0.3, 0.1)}
    best, elapsed = _grid_search(param)
    best_param = best["gamma"]
    reg.set_params(gamma=best_param)
    print("gamma:%s,用时:%s" % (best_param, elapsed))

    # 8 - re-tune the number of weak learners: n_estimators_2
    print("8-调整最优弱分类器个数:n_estimators_2")
    n_estimators_2, elapsed = _tune_n_estimators()
    print("分类器个数:%s, 用时:%s" % (n_estimators_2, elapsed))

    # 9 - coarse search: subsample, colsample_bytree (applied in step 10)
    print("9-暴力搜索:subsample, colsample_bytree")
    best, elapsed = _grid_search({
        "subsample": [0.6, 0.7, 0.8, 0.9],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9]
    })
    best_param_1 = best["subsample"]
    best_param_2 = best["colsample_bytree"]
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 10 - fine search: subsample, colsample_bytree
    print("10-精确搜索:subsample, colsample_bytree")
    best, elapsed = _grid_search({
        "subsample": [best_param_1 - 0.05, best_param_1, best_param_1 + 0.05],
        "colsample_bytree":
        [best_param_2 - 0.05, best_param_2, best_param_2 + 0.05]
    })
    best_param_1 = best["subsample"]
    best_param_2 = best["colsample_bytree"]
    reg.set_params(subsample=best_param_1, colsample_bytree=best_param_2)
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 11 - coarse search: reg_alpha
    print("11-暴力搜索:reg_alpha")
    best, elapsed = _grid_search({"reg_alpha": [0, 1, 2, 3]})
    best_param = best["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    print("reg_alpha:%s,用时:%s" % (best_param, elapsed))

    # 12 - fine search: reg_alpha
    # NOTE(review): for a non-zero coarse optimum this grid steps around the
    # coarse value without containing it (e.g. 1 -> 0.5, 0.7, 0.9, 1.1, 1.3);
    # confirm that is intended.
    print("12-精确搜索:reg_alpha")
    if best_param == 0:
        param = {"reg_alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}
    else:
        param = {
            "reg_alpha": np.arange(best_param - 0.5, best_param + 0.5, 0.2)
        }
    best, elapsed = _grid_search(param)
    best_param = best["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    print("reg_alpha:%s,用时:%s" % (best_param, elapsed))

    # 13 - coarse search: reg_lambda
    print("13-暴力搜索:reg_lambda")
    best, elapsed = _grid_search({"reg_lambda": [1, 3, 5, 7]})
    best_param = best["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    print("reg_lambda:%s,用时:%s" % (best_param, elapsed))

    # 14 - fine search: reg_lambda
    print("14-精确搜索:reg_lambda")
    best, elapsed = _grid_search(
        {"reg_lambda": np.arange(best_param - 1, best_param + 1, 0.2)})
    best_param = best["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    print("reg_lambda:%s,用时:%s" % (best_param, elapsed))

    # 15 - search: max_delta_step, scale_pos_weight
    print("15-精确搜索:max_delta_step, scale_pos_weight")
    best, elapsed = _grid_search({
        "max_delta_step": [0, 1, 3, 5],
        "scale_pos_weight": [1, 3, 5, 7]
    })
    best_param_1 = best["max_delta_step"]
    best_param_2 = best["scale_pos_weight"]
    reg.set_params(max_delta_step=best_param_1, scale_pos_weight=best_param_2)
    print("max_delta_step:%s,scale_pos_weight:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 16 - re-tune the number of weak learners: n_estimators_3
    print("16-调整最优弱分类器个数:n_estimators_3")
    n_estimators_3, elapsed = _tune_n_estimators()
    print("分类器个数:%s, 用时:%s" % (n_estimators_3, elapsed))

    # 17 - fine search: learning_rate
    print("17-精确搜索:learning_rate")
    lr = reg.get_xgb_params()["learning_rate"]
    best, elapsed = _grid_search({"learning_rate": [lr - 0.05, lr, lr + 0.05]})
    best_param = best["learning_rate"]
    reg.set_params(learning_rate=best_param)
    print("learning_rate:%s,用时:%s" % (best_param, elapsed))

    # 18 - final training with all tuned parameters
    print("18-终极训练")
    xgb_param = reg.get_xgb_params()
    model_res = xgb.train(params=xgb_param,
                          dtrain=d_train,
                          num_boost_round=xgb_param["n_estimators"],
                          evals=watchlist,
                          feval=rmspe_xg,
                          early_stopping_rounds=int(xgb_param["n_estimators"] /
                                                    esr))
    t_end = pd.Timestamp.now()
    print("参数训练完毕,总用时:%s" % (t_end - t_begin))
    return model_res, reg
#######################################################cv调参 xgb1 = XGBRegressor(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:gamma', nthread=4, scale_pos_weight=1, seed=1024) #####parameter 1max_depth xgb_param = xgb1.get_xgb_params() cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='rmse', early_stopping_rounds=50) xgb1.set_params(n_estimators=cvresult.shape[0]) param_test1 = { 'max_depth': [3, 4, 5, 6, 7], 'min_child_weight': [3, 4, 5, 6, 7] } gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=1000, gamma=0,
class Xgb:
    """Preprocess a DataFrame, fit an XGBoost estimator on it and predict.

    ``target_type`` selects the estimator: ``'binary'`` builds an
    ``XGBClassifier`` scored with ``'auc'``; ``'linear'`` builds an
    ``XGBRegressor`` scored with ``'rmse'``.
    """

    def __init__(self, df, target_column='', id_column='', target_type='binary',
                 categorical_columns=None, drop_columns=None,
                 numeric_columns=None, num_training_rounds=500, verbose=1,
                 early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): upper bound on boosting rounds
        - verbose (bool): verbosity of printouts
        - early_stopping_rounds (int or None): forwarded to xgb.cv

        Problems with ``df`` or ``target_column`` are reported with a printed
        message; in that case the instance is left only partially initialised.
        """
        if isinstance(df, pd.DataFrame):
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                # Copy the lists so instances never share (or mutate) the
                # caller's objects; None stands for "no columns".
                self.categorical_columns = list(categorical_columns or [])
                self.numeric_columns = list(numeric_columns or [])
                self.drop_columns = list(drop_columns or [])
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=num_training_rounds,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        scale_pos_weight=1,
                        seed=123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                        n_estimators=num_training_rounds,
                        objective='reg:linear')
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def _run_cv(self, xgb_param, xgtrain):
        """Run xgb.cv, trying the verbosity keyword names of different xgboost releases.

        Only ``TypeError`` (unknown keyword) triggers the next attempt; any
        other failure propagates to the caller.
        """
        common = dict(num_boost_round=self.num_training_rounds,
                      nfold=5,
                      metrics=[self.scoring],
                      early_stopping_rounds=self.early_stopping_rounds)
        for verbosity_name in ('show_progress', 'verbose_eval'):
            kwargs = dict(common)
            kwargs[verbosity_name] = self.verbose
            try:
                return xgb.cv(xgb_param, xgtrain, **kwargs)
            except TypeError:
                continue
        return xgb.cv(xgb_param, xgtrain, **common)

    def train(self):
        """Preprocess ``self.df``, pick the round count by 5-fold CV and fit ``self.clf``.

        Prints training-set metrics (accuracy and AUC for 'binary', MSE and
        RMSE for 'linear').
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns
                           if x not in [self.target_column, self.id_column]]
        features = self.df[self.predictors]
        labels = self.df[self.target_column]

        xgb_param = self.clf.get_xgb_params()
        xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)
        # Always start from the configured upper bound so that calling
        # train() again is not capped by the previous run's result.
        cvresult = self._run_cv(xgb_param, xgtrain)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(features, labels, eval_metric=self.scoring)

        # Predict training set:
        train_df_predictions = self.clf.predict(features)

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(features)[:, 1]
            print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, train_df_predprob))
        elif self.target_type == 'linear':
            mse = metrics.mean_squared_error(labels.values, train_df_predictions)
            print("Mean squared error: %f" % mse)
            print("Root mean squared error: %f" % np.sqrt(mse))

    def predict(self, test_df):
        """Preprocess ``test_df`` like the training data and return predictions.

        Returns positive-class probabilities for 'binary' and raw predictions
        for 'linear'. The result is also stored in ``self.output`` and the
        processed frame in ``self.test_df``.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        ids = test_df[self.id_column] if self.id_column in test_df else None
        targets = (test_df[self.target_column]
                   if self.target_column in test_df.columns else None)

        self.test_df = self.preprocess(test_df, train=False)
        # Re-attach id/target in case preprocessing removed them.
        if ids is not None:
            self.test_df[self.id_column] = ids
        if targets is not None:
            self.test_df[self.target_column] = targets

        # Predictors seen in training but absent here are passed as missing.
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:, 1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def _get_booster(self):
        """Return the fitted Booster via ``get_booster()`` when present, else ``booster()``."""
        getter = getattr(self.clf, 'get_booster', None)
        if callable(getter):
            return getter()
        return self.clf.booster()

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` features with the highest f-score."""
        feature_importance = sorted(self._get_booster().get_fscore().items(),
                                    key=operator.itemgetter(1), reverse=True)
        # Naming the columns in the constructor also works for an empty list.
        impt = pd.DataFrame(feature_importance, columns=['feature', 'importance'])
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        """Return a model-ready copy of ``df``.

        Steps: one-hot encode ``categorical_columns``; when ``train`` is true,
        decide which columns to remove (more than 60% missing, or zero
        standard deviation) and remember them in ``self.cols_to_remove``;
        remove those columns; coerce ``numeric_columns`` to numeric; drop
        ``drop_columns``; finally drop every column that is not bool, integer
        or float. Columns listed for removal or dropping but absent from
        ``df`` are ignored.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(df[col]).rename(columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            self.cols_to_remove = []
            # drop columns that are too sparse to be informative
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                # arbitrary cutoff, if more than 60% missing then drop
                if df[col].isnull().mean() > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if col == self.target_column or col in self.cols_to_remove:
                    # never remove the label; avoid listing a column twice
                    continue
                if df[col].dtype.kind in 'iuf' and df[col].std() == 0:
                    print('will drop', col)
                    self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # errors='ignore': a test frame may lack columns seen during training.
        df = df.drop(self.cols_to_remove, axis=1, errors='ignore')

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in df.columns:
                continue
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1, errors='ignore')

        # drop everything that is not bool / integer / float of any width
        print('## dropping non-numerical columns')
        non_numeric = [col for col in df.columns
                       if df[col].dtype.kind not in 'biuf']
        if self.verbose:
            for col in non_numeric:
                print('dropping because not int, float, or bool:', col)
        return df.drop(non_numeric, axis=1)

    def _to_int(self, num):
        """Return ``int(num)``, or None when the value cannot be converted."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the value cannot be converted."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        ids = self.test_df[self.id_column].values
        actuals = self.test_df[self.target_column].values if include_actual else None
        # Text mode with newline='' is what the csv module expects.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            # Pair rows by position, not index label, so any index works.
            for pos, test_output in enumerate(self.output):
                to_write = [ids[pos], test_output]
                if include_actual:
                    to_write.append(actuals[pos])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        """Serialise this whole object (data included) to ``filename`` with joblib."""
        joblib.dump(self, filename)
gamma=0, subsample=0.7, colsample_bytree=0.7, nthread=4, scale_pos_weight=1, seed=27) grid = GridSearchCV(estimator=xgb_best, param_grid=param_test, cv=5) grid.fit(source_X, source_y) grid.grid_scores_ grid.best_estimator_ xgb_best.fit(train_X, train_y) xgb_best.score(test_X, test_y) print(xgb_best.score(test_X, test_y)) xgb_param = xgb_best.get_xgb_params() xgb.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=15, metrics=['auc'], early_stopping_rounds=50, stratified=True, seed=1301) full_xy = pd.concat([source_X, source_y], axis=1) target = 'count' def modelfit(alg, dtrain,
min_child_weight=3, gamma=0, reg_alpha=100, subsample=0.8, colsample_bytree=0.7, objective='reg:linear', base_score=0.5, nthread=-1, scale_pos_weight=1, silent=0) #xgbParams = {'booster':'gbtree','objective':'reg:linear','gamma':0,'max_depth':5,'lambda':100,'subsample':0.8,'colsample_bytree':0.7,'min_child_weight':3,'eta':0.1} dtrain = xgb.DMatrix(train[train_features].values, label=np.log1p(train[target].values)) watchlist = [(dtrain, 'train')] model = xgb.train(xgbParams.get_xgb_params(), dtrain, 5000, watchlist, early_stopping_rounds=50) model.save_model('./../sberbank.model') #model = xgb.Booster({'nthread':-1}) #model.load_model('./../sberbank.model') dtest = xgb.DMatrix(test[test_features].values) pred = model.predict(dtest) answer = pd.DataFrame({'id': test[IDCol], 'price_doc': np.expm1(pred)}) answer.to_csv('./../answer.csv') #gsearch = GridSearchCV(estimator = xgbParams, param_grid = param,scoring = 'neg_mean_squared_error', cv=5) #gsearch.fit(train[train_features], train[target]) #print("best_params: %s"%gsearch.best_params_) #print('best_score: %s'%gsearch.best_score_)
subsample=0.8, learning_rate=0.05, random_state=42) param_xgb = {'min_child_weight': [1, 2, 3, 4, 5]} xgbc = model_fit(xgbc, xtrain, ytrain_casual, param_xgb, False) xgbc = model_fit(xgbc, xtrain, ytrain_registered, param_xgb, False) ypred_xgb_count = Test_Set_Report("XGBoost", xgbc) result_xgb = pd.concat([ytest_count, pd.Series(ypred_xgb_count)], axis=1) # xgb.cv is used to get the actual number of n_estimators required based on the learning rate, # it uses early_stopping_rounds to get the optimal value xdtrain = xgb.DMatrix(xtrain, label=ytrain_casual) cvresult_xgb = xgb.cv(xgbc.get_xgb_params(), xdtrain, nfold=5, num_boost_round=5000, metrics='rmse', early_stopping_rounds=50) bestpred_xgb = print_feature_importance(xgbc) ''' Storing the predicted results along with the actual prediction for each phone number in a csv file''' FinalResult_Python = FinalResult_Python.reset_index() FinalResult_Python.drop(columns=['index'], axis=1, inplace=True) FinalResult_Python = pd.concat([FinalResult_Python, result_xgb[0]], axis=1) FinalResult_Python.rename(columns={0: 'Predicted Count'}, inplace=True) FinalResult_Python.to_csv("PredictedRentalCount_Python.csv", index=False)
class Xgb:
    """Preprocess a DataFrame, fit XGBoost on it (optionally on random sub-samples) and predict.

    ``target_type`` selects the estimator: ``'binary'`` and ``'multiclass'``
    build an ``XGBClassifier``; ``'linear'`` builds an ``XGBRegressor``.
    When ``sample_fraction`` is below 1, ``train`` fits one model per random
    sample and pickles the object after each fit as ``<prefix>_<idx>.pkl``;
    ``predict`` then loads those files and averages their outputs.
    """

    def __init__(self,
                 df,
                 target_column='',
                 id_column='',
                 target_type='binary',
                 categorical_columns=None,
                 drop_columns=None,
                 numeric_columns=None,
                 num_training_rounds=500,
                 verbose=1,
                 sample_fraction=1.0,
                 n_samples=1,
                 early_stopping_rounds=None,
                 prefix='xgb_model',
                 scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear', 'binary' or 'multiclass'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): upper bound on boosting rounds
        - verbose (bool): verbosity of printouts
        - sample_fraction (float): fraction of rows per training sample
        - n_samples (int): number of samples to train on
        - early_stopping_rounds (int or None): forwarded to xgb.cv
        - prefix (string): file-name prefix for the per-sample pickles
        - scoring (string or None): metric overriding the per-type default

        Problems with ``df`` or ``target_column`` are reported with a printed
        message; in that case the instance is left only partially initialised.
        """
        # checks for sampling: fix the fraction first, then cap n_samples so
        # that the samples never need more rows than exist
        sample_fraction = float(sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        elif sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = int(round(1.0 / sample_fraction))

        # check if data is dataframe
        if isinstance(df, pd.DataFrame):
            # if sample_fraction results in a sample smaller than one row
            if len(df) > 0 and round(sample_fraction * len(df)) == 0:
                sample_fraction = 1.0 / len(df)

            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                # Copy the lists so instances never share (or mutate) the
                # caller's objects; None stands for "no columns".
                self.categorical_columns = list(categorical_columns or [])
                self.numeric_columns = list(numeric_columns or [])
                self.drop_columns = list(drop_columns or [])
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix

                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='binary:logistic',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='multi:softmax',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(n_estimators=num_training_rounds,
                                            objective='reg:linear')
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def _run_cv(self, xgb_param, xgtrain):
        """Run xgb.cv, trying the verbosity keyword names of different xgboost releases.

        Only ``TypeError`` (unknown keyword) triggers the next attempt; any
        other failure propagates to the caller.
        """
        common = dict(num_boost_round=self.num_training_rounds,
                      nfold=5,
                      metrics=[self.scoring],
                      early_stopping_rounds=self.early_stopping_rounds)
        for verbosity_name in ('show_progress', 'verbose_eval'):
            kwargs = dict(common)
            kwargs[verbosity_name] = self.verbose
            try:
                return xgb.cv(xgb_param, xgtrain, **kwargs)
            except TypeError:
                continue
        return xgb.cv(xgb_param, xgtrain, **common)

    def train(self):
        """Preprocess ``self.df`` and fit one model per training sample.

        For every sample the round count is chosen by 5-fold CV, ``self.clf``
        is refitted, training-set metrics are printed and the object is saved
        to ``<prefix>_<idx>.pkl``.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [
            x for x in self.df.columns
            if x not in [self.target_column, self.id_column]
        ]
        xgb_param = self.clf.get_xgb_params()
        if self.target_type == 'multiclass':
            # The native API needs num_class for multi:softmax. Count over the
            # full frame so a sample missing a class still gets the full range.
            xgb_param['num_class'] = len(self.df[self.target_column].unique())

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
        if self.verbose:
            print(df_list)

        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            labels = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)
            # Each sample starts from the configured upper bound; reading
            # n_estimators back from clf would cap it at the previous
            # sample's early-stopped round count.
            cvresult = self._run_cv(xgb_param, xgtrain)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            print('fitting model')
            self.clf.fit(features, labels, eval_metric=self.scoring)

            # Predict training set:
            train_df_predictions = self.clf.predict(features)

            if self.target_type in ('binary', 'multiclass'):
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    labels.values, train_df_predictions))
                if self.target_type == 'binary':
                    train_df_predprob = self.clf.predict_proba(features)[:, 1]
                    print("AUC Score (Train): %f" %
                          metrics.roc_auc_score(labels, train_df_predprob))
            elif self.target_type == 'linear':
                mse = metrics.mean_squared_error(labels.values,
                                                 train_df_predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))

            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def _predict_with(self, clf, features):
        """Return ``clf``'s output for ``features``; None for target types without a prediction rule."""
        if self.target_type == 'binary':
            return clf.predict_proba(features)[:, 1]
        if self.target_type == 'linear':
            return clf.predict(features)
        return None

    def predict(self,
                test_df,
                return_multi_outputs=False,
                return_mean_std=False):
        """Preprocess ``test_df`` like the training data and return predictions.

        With ``n_samples == 1`` the in-memory ``self.clf`` is used; otherwise
        each ``<prefix>_<idx>.pkl`` is loaded (missing files are skipped) and
        the per-model outputs are averaged. Only 'binary' (positive-class
        probability) and 'linear' produce an output.

        Returns:
            ``self.multi_outputs`` if ``return_multi_outputs``; else
            ``(mean, std)`` if ``return_mean_std``; else ``self.output``.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        ids = test_df[self.id_column] if self.id_column in test_df else None
        targets = (test_df[self.target_column]
                   if self.target_column in test_df.columns else None)

        self.test_df = self.preprocess(test_df, train=False)
        # Re-attach id/target in case preprocessing removed them.
        if ids is not None:
            self.test_df[self.id_column] = ids
        if targets is not None:
            self.test_df[self.target_column] = targets

        # Predictors seen in training but absent here are passed as missing.
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        features = self.test_df[self.predictors]
        if self.n_samples == 1:
            output = self._predict_with(self.clf, features)
            self.output = output
            self.multi_outputs = None if output is None else [list(output)]
        else:
            output_list = []
            for idx in range(self.n_samples):
                filename = self.prefix + '_' + str(idx) + '.pkl'
                try:
                    xgb_load = self.load(filename)
                except IOError:
                    print('no file found, skipping')
                    continue
                output = self._predict_with(xgb_load.clf, features)
                if output is not None:
                    output_list.append(list(output))
            # average the outputs if n_samples is more than one
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list

        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            # Use the collected outputs in both modes (a single model gives a
            # spread of zero rather than the std of an empty list).
            spread_source = self.multi_outputs if self.multi_outputs is not None else []
            return (self.output, np.std(spread_source, axis=0))
        else:
            return self.output

    def _get_booster(self):
        """Return the fitted Booster via ``get_booster()`` when present, else ``booster()``."""
        getter = getattr(self.clf, 'get_booster', None)
        if callable(getter):
            return getter()
        return self.clf.booster()

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` features with the highest f-score."""
        feature_importance = sorted(self._get_booster().get_fscore().items(),
                                    key=operator.itemgetter(1),
                                    reverse=True)
        # Naming the columns in the constructor also works for an empty list.
        impt = pd.DataFrame(feature_importance,
                            columns=['feature', 'importance'])
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature",
                                  "importance",
                                  kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        """Return a model-ready copy of ``df``.

        Steps: one-hot encode ``categorical_columns``; when ``train`` is true,
        decide which columns to remove (more than 60% missing, or zero
        standard deviation, never the target) and remember them in
        ``self.cols_to_remove``; remove those columns; coerce
        ``numeric_columns`` to numeric; drop ``drop_columns``; finally drop
        every column that is not bool, integer or float. Columns listed for
        removal or dropping but absent from ``df`` are ignored.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(
                df[col]).rename(columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            self.cols_to_remove = []
            # drop columns that are too sparse to be informative
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                # arbitrary cutoff, if more than 60% missing then drop
                if df[col].isnull().mean() > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                # Compare by value: identity ("is not") is unreliable for strings.
                if col == self.target_column or col in self.cols_to_remove:
                    continue
                if df[col].dtype.kind in 'iuf' and df[col].std() == 0:
                    print('will drop', col)
                    self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # errors='ignore': a test frame may lack columns seen during training.
        df = df.drop(self.cols_to_remove, axis=1, errors='ignore')

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in df.columns:
                # removed above, or simply not present in this frame
                continue
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)

        df = df.drop(self.drop_columns, axis=1, errors='ignore')

        # drop everything that is not bool / integer / float of any width
        print('## dropping non-numerical columns')
        non_numeric = [
            col for col in df.columns if df[col].dtype.kind not in 'biuf'
        ]
        if self.verbose:
            for col in non_numeric:
                print('dropping because not int, float, or bool:', col)
        return df.drop(non_numeric, axis=1)

    def _to_int(self, num):
        """Return ``int(num)``, or None when the value cannot be converted."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the value cannot be converted."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)

        The rows are shuffled once and cut into consecutive, non-overlapping
        chunks of ``round(fraction * len(df))`` rows (at least one row); the
        last chunk may be shorter. Rows are selected by position, so any
        index is supported.
        """
        print('constructing random samples')
        num_rows = len(df)
        # At least one row per sample, otherwise the chunking cannot advance.
        len_sample = max(1, int(round(fraction * num_rows)))

        # create list of slice index lists
        indices = list(range(num_rows))
        if self.verbose:
            print('INDICES', indices)
        random.shuffle(indices)
        slice_list = [
            indices[start:start + len_sample]
            for start in range(0, num_rows, len_sample)
        ]
        if n_samples is not None:
            slice_list = slice_list[:max(0, int(n_samples))]

        # get slices
        return [df.iloc[rows, :] for rows in slice_list]

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them

        Failures while writing the data rows are reported with a printed
        message instead of being raised.
        """
        # Text mode with newline='' is what the csv module expects.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                ids = self.test_df[self.id_column].values
                actuals = (self.test_df[self.target_column].values
                           if include_actual else None)
                # Pair rows by position, not index label, so any index works.
                for pos, test_output in enumerate(self.output):
                    to_write = [ids[pos], test_output]
                    if include_actual:
                        to_write.append(actuals[pos])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except (KeyError, IndexError, TypeError, ValueError, csv.Error,
                    IOError) as err:
                print('write_csv failed: %s' % err)

    def save(self, filename='xgb.pkl'):
        """Serialise this whole object (data included) to ``filename`` with joblib."""
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        """Load and return an object previously written by ``save``.

        NOTE(security): joblib.load unpickles the file and can execute
        arbitrary code; only load files from a trusted source.
        """
        return joblib.load(model_file)
scale_pos_weight=1, seed=27) # In[ ]: # results = cross_val_score(xgb_model, train_df_drop, log_target_df, cv=kfold, n_jobs=1, scoring='neg_mean_squared_error', verbose=True) # results = np.sqrt( np.abs( results ) ) #MSLE TO RMSLE # In[ ]: # print(np.sqrt(np.abs(results))) # In[ ]: xgtrain = xgb.DMatrix(train_df_drop, log_target_df) xgb_param = xgb_model.get_xgb_params() # In[ ]: results = xgb.cv(xgb_param, xgtrain, num_boost_round=n_estimators, nfold=cv, metrics='rmse', early_stopping_rounds=early_stopping_rounds, verbose_eval=20) # In[ ]: xgb_model.fit(X_train, y_train,
# Evaluation sets reported by xgb.train during boosting.
watchlist = [(d_train, "train"), (d_valid, "valid")]

# model: the sklearn wrapper is only used to assemble the parameter dict;
# the booster itself is trained through the native xgb.train API.
reg = XGBRegressor(booster="gbtree",
                   silent=1,
                   n_jobs=-1,
                   learning_rate=learning_rate,
                   n_estimators=n_estimators,
                   max_depth=max_depth,
                   subsample=subsample,
                   colsample_bytree=colsample_bytree,
                   objective=objective,
                   eval_metric=eval_metric,
                   seed=0)
xgb_param = reg.get_xgb_params()
model_allstores = xgb.train(params=xgb_param,
                            dtrain=d_train,
                            num_boost_round=xgb_param["n_estimators"],
                            evals=watchlist,
                            feval=fd.rmspe_xg,
                            early_stopping_rounds=1000)

# Persist the trained booster; the context manager closes the file even if
# pickling fails.
with open(txt_path + "model_allstores.txt", "wb") as f:
    pickle.dump(model_allstores, f)

# Model validation: predictions are mapped back with expm1 before scoring.
y_hat_valid = model_allstores.predict(d_valid)
y_hat_valid = np.expm1(y_hat_valid).astype(np.int64)
fd.rmspe(np.expm1(y_valid), y_hat_valid)  # validation-set RMSPE: 0.13831
class Xgb:
    """Convenience wrapper around the XGBoost scikit-learn estimators.

    The wrapper one-hot encodes and filters the columns of a DataFrame, picks
    the number of boosting rounds with ``xgb.cv``, fits one model per random
    sub-sample of the data, stores each fitted wrapper with joblib and can
    average the predictions of the stored models.
    """

    def __init__(self, df, target_column='', id_column='', target_type='binary',
                 categorical_columns=None, drop_columns=None, numeric_columns=None,
                 num_training_rounds=500, verbose=1, sample_fraction=1.0,
                 n_samples=1, early_stopping_rounds=None, prefix='xgb_model',
                 scoring=None):
        """Store the configuration and create the underlying estimator.

        Args:
            df (DataFrame): training data.
            target_column (str): name of the target column (required).
            id_column (str): name of the id column.
            target_type (str): 'binary', 'multiclass' or 'linear'.
            categorical_columns (list): columns to one-hot encode.
            drop_columns (list): columns to drop.
            numeric_columns (list): columns to convert to a numeric dtype.
            num_training_rounds (int): upper bound on boosting rounds.
            verbose: verbosity of printouts.
            sample_fraction (float): fraction of rows used per sample.
            n_samples (int): number of sub-sampled models to train.
            early_stopping_rounds: forwarded to ``xgb.cv``.
            prefix (str): file-name prefix for the models saved by ``train``.
            scoring (str): metric overriding the per-target-type default.

        When ``df`` is not a DataFrame or ``target_column`` is empty a message
        is printed and the instance is left only partially initialised.
        """
        # Validate the frame first: the sampling checks below need len(df).
        if not isinstance(df, pd.DataFrame):
            print('please provide pandas dataframe')
            return
        self.df = df
        self.early_stopping_rounds = early_stopping_rounds
        if not target_column:
            print('please provide target column name')
            return

        sample_fraction, n_samples = self._normalise_sampling(
            sample_fraction, n_samples, len(df))

        self.target_column = target_column
        self.id_column = id_column
        self.target_type = target_type
        # Copy the lists so instances never share (or mutate) a default object.
        self.categorical_columns = list(categorical_columns) if categorical_columns is not None else []
        self.numeric_columns = list(numeric_columns) if numeric_columns is not None else []
        self.drop_columns = list(drop_columns) if drop_columns is not None else []
        self.verbose = verbose
        self.sample_fraction = sample_fraction
        self.n_samples = n_samples
        self.num_training_rounds = num_training_rounds
        self.prefix = prefix

        # init the classifier:
        if self.target_type == 'binary':
            self.scoring = 'auc'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='binary:logistic',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'multiclass':
            self.scoring = 'merror'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='multi:softmax',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'linear':
            self.scoring = 'rmse'
            self.clf = XGBRegressor(
                n_estimators=num_training_rounds,
                objective='reg:linear')
        # if preferred scoring metric is stated:
        if scoring:
            self.scoring = scoring

    @staticmethod
    def _normalise_sampling(sample_fraction, n_samples, num_rows):
        """Clamp the sampling settings to values that can actually be drawn."""
        sample_fraction = float(sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        if sample_fraction > 1:
            sample_fraction = 1.0
        # A fraction that selects less than one row is raised to one row.
        if num_rows > 0 and round(sample_fraction * num_rows) == 0:
            sample_fraction = 1.0 / num_rows
        # The samples are disjoint, so at most 1/fraction of them exist.
        if sample_fraction * n_samples > 1:
            n_samples = int(round(1.0 / sample_fraction))
        return sample_fraction, n_samples

    def train(self):
        """Preprocess ``self.df`` and fit (and save) one model per sample.

        For every sample the number of boosting rounds is chosen with 5-fold
        ``xgb.cv``, the estimator is fitted, training-set scores are printed
        and the wrapper is saved as ``<prefix>_<idx>.pkl``.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)
        print('#### training ####')
        excluded = [self.target_column, self.id_column]
        self.predictors = [x for x in self.df.columns if x not in excluded]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
            print(df_list)

        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            labels = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)
            cvresult = self._cross_validate(xgb_param, xgtrain, labels)
            # One row per boosting round that survived early stopping.
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(features, labels, eval_metric=self.scoring)

            # Predict training set:
            train_df_predictions = self.clf.predict(features)
            if self.target_type == 'binary' or self.target_type == 'multiclass':
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    labels.values, train_df_predictions))
                if self.target_type == 'binary':
                    train_df_predprob = self.clf.predict_proba(features)[:, 1]
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(
                        labels, train_df_predprob))
            elif self.target_type == 'linear':
                mse = metrics.mean_squared_error(labels.values, train_df_predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))

            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def _cross_validate(self, xgb_param, xgtrain, labels):
        """Run ``xgb.cv``, falling back across keyword variants.

        The attempts are, in order: with ``show_progress``, with
        ``verbose_eval``, and finally with ``num_class`` added to the
        parameters and no verbosity argument. ``xgb_param`` is modified in
        place by the last attempt.
        """
        cv_kwargs = dict(
            num_boost_round=self.clf.get_params()['n_estimators'],
            nfold=5,
            metrics=[self.scoring],
            early_stopping_rounds=self.early_stopping_rounds)
        try:
            return xgb.cv(xgb_param, xgtrain, show_progress=self.verbose, **cv_kwargs)
        except Exception:
            # Deliberate fallback chain; KeyboardInterrupt/SystemExit propagate.
            try:
                return xgb.cv(xgb_param, xgtrain, verbose_eval=self.verbose, **cv_kwargs)
            except Exception:
                xgb_param['num_class'] = len(labels.unique())
                return xgb.cv(xgb_param, xgtrain, **cv_kwargs)

    def _predict_with(self, model, features):
        """Return ``model``'s predictions, or None for unsupported target types."""
        if self.target_type == 'binary':
            return model.clf.predict_proba(features)[:, 1]
        if self.target_type == 'linear':
            return model.clf.predict(features)
        return None

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        """Preprocess ``test_df`` and predict with the trained model(s).

        With ``n_samples == 1`` the in-memory estimator is used; otherwise the
        models saved by ``train`` are loaded from ``<prefix>_<idx>.pkl`` and
        their outputs are averaged. Missing files are skipped.

        Returns:
            ``self.multi_outputs`` if ``return_multi_outputs`` is true, else a
            ``(mean, std)`` tuple if ``return_mean_std`` is true, else
            ``self.output``.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        has_ids = self.id_column in test_df
        has_targets = self.target_column in test_df.columns
        # Preprocessing may drop these columns, so keep them aside first.
        ids = test_df[self.id_column] if has_ids else None
        targets = test_df[self.target_column] if has_targets else None
        self.test_df = self.preprocess(test_df, train=False)
        if has_ids:
            self.test_df[self.id_column] = ids
        if has_targets:
            self.test_df[self.target_column] = targets
        # Predictors seen in training but absent here are filled with NaN.
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        features = self.test_df[self.predictors]
        output_list = []
        output = None
        if self.n_samples == 1:
            output = self._predict_with(self, features)
        else:
            for idx in range(self.n_samples):
                filename = self.prefix + '_' + str(idx) + '.pkl'
                try:
                    model = self.load(filename)
                except IOError:
                    print('no file found, skipping')
                    continue
                output = self._predict_with(model, features)
                if output is not None:
                    output_list.append(list(output))

        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            self.multi_outputs = [list(output)] if output is not None else None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list

        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` highest f-scores."""
        # Current xgboost exposes get_booster(); older releases used booster().
        get_booster = getattr(self.clf, 'get_booster', None)
        booster = get_booster() if callable(get_booster) else self.clf.booster()
        ranked = sorted(booster.get_fscore().items(),
                        key=operator.itemgetter(1), reverse=True)
        # Passing columns= also copes with an empty score table.
        impt = pd.DataFrame(ranked, columns=['feature', 'importance'])
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        """Return a copy of ``df`` reduced to model-ready columns.

        Steps: one-hot encode ``categorical_columns``; when ``train`` is true
        decide which columns to remove (more than 60% missing, or numeric with
        zero standard deviation) and remember them in ``self.cols_to_remove``;
        remove those columns; convert ``numeric_columns`` with
        ``pd.to_numeric``; drop ``drop_columns``; finally drop every column
        whose dtype is neither numeric nor boolean.

        Columns scheduled for removal that are absent from ``df`` are ignored,
        so a test frame does not need every training column.
        """
        is_numeric = pd.api.types.is_numeric_dtype
        is_bool = pd.api.types.is_bool_dtype

        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(df[col]).rename(
                columns=lambda x, col=col: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                # arbitrary cutoff, if more than 60% missing then drop
                if df[col].isnull().mean() > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)
            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if is_numeric(df[col]) and not is_bool(df[col]):
                    if df[col].std() == 0:
                        print('will drop', col)
                        if col not in self.cols_to_remove:
                            self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        df = df.drop([c for c in self.cols_to_remove if c in df.columns], axis=1)
        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop([c for c in self.drop_columns if c in df.columns], axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        non_numeric = [col for col in df.columns if not is_numeric(df[col])]
        for col in non_numeric:
            if self.verbose:
                print('dropping because not int, float, or bool:', col)
        return df.drop(non_numeric, axis=1)

    def _to_int(self, num):
        """Return ``int(num)``, or None when the conversion is impossible."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the conversion is impossible."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """Split ``df`` into disjoint random samples.

        Args:
            df: DataFrame to split.
            fraction: share of the rows placed in each sample (default 0.2);
                every sample holds at least one row.
            n_samples: how many samples to return (default: all of them).

        Returns:
            A list of DataFrames; the last sample may be shorter than the rest.
        """
        print('constructing random samples')
        num_rows = len(df)
        # max(1, ...) keeps the chunking well defined for tiny fractions.
        len_sample = max(1, int(round(fraction * num_rows)))

        # One shuffle followed by fixed-size chunks yields a random partition.
        indices = list(range(num_rows))
        random.shuffle(indices)
        slice_list = [indices[start:start + len_sample]
                      for start in range(0, num_rows, len_sample)]
        if n_samples is not None:
            slice_list = slice_list[:max(0, int(n_samples))]

        # The indices are row positions, hence iloc rather than loc.
        return [df.iloc[rows, :] for rows in slice_list]

    def write_csv(self, filename, include_actual=False):
        """Write the most recent predictions to a CSV file.

        The header row is ``[id_column, target_column]`` plus ``'actual'``
        when ``include_actual`` is true; one row follows per entry of
        ``self.output``. Failures while writing the data rows are reported on
        stdout rather than raised.

        Args:
            filename: path of the CSV file to create.
            include_actual: also write the target values found in
                ``self.test_df`` (use when they are known for the test set).
        """
        import sys  # local import: only needed to pick the csv file mode

        # csv.writer needs a text-mode file with newline='' on Python 3 and a
        # binary-mode file on Python 2; 'wb' alone raises TypeError on Python 3.
        if sys.version_info[0] >= 3:
            csvfile = open(filename, 'w', newline='')
        else:
            csvfile = open(filename, 'wb')
        with csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                ids = self.test_df[self.id_column]
                actuals = self.test_df[self.target_column] if include_actual else None
                for idx, test_output in enumerate(self.output):
                    # self.output is positional, so look rows up by position.
                    to_write = [ids.iloc[idx], test_output]
                    if include_actual:
                        to_write.append(actuals.iloc[idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except Exception as err:
                # Best-effort export: report the cause instead of hiding it.
                print('write_csv failed')
                print(err)

    def save(self, filename='xgb.pkl'):
        """Serialise this whole wrapper object to ``filename`` with joblib."""
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        """Return the object stored in ``model_file`` by joblib.

        NOTE(review): joblib files are pickle-based; only load files that come
        from a trusted source.
        """
        loaded = joblib.load(model_file)
        return loaded
max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:gamma', nthread=4, scale_pos_weight=1, seed=1024)  # closes a call that is opened before this line

##### parameter 1: max_depth
# Cross-validate with xgb1's current parameters (5 folds, RMSE, early stopping
# after 50 rounds) and set n_estimators to the number of rows in the result.
# NOTE(review): presumably one row per boosting round kept - confirm.
xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='rmse', early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])
# Grid of candidate values for the first search.
param_test1 = { 'max_depth':[3,4,5,6,7], 'min_child_weight':[3,4,5,6,7] }
# NOTE(review): this call is cut off at the end of the line; its remaining
# arguments are not visible here.
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1,