def def_model(self, parameters: dict = None):
    """Create a fresh ``XGBRegressor`` and store it on ``self._model``.

    :param parameters: optional mapping of estimator parameters. When it is
        not ``None`` it is forwarded to ``set_params`` before the estimator
        is stored.
    """
    regressor = XGBRegressor()
    if parameters is not None:
        regressor.set_params(**parameters)
    self._model = regressor
class XGBWrapper_regr(object):
    """
    A wrapper for xgboost model so that we will have a single api for various models.

    After ``fit`` the instance exposes ``best_score_`` and
    ``feature_importances_`` in addition to the wrapped ``model``.
    """

    def __init__(self):
        self.model = XGBRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):
        """Fit the wrapped regressor and record the evaluation scores.

        :param X_train: training features.
        :param y_train: training target.
        :param X_valid: optional validation features, added to the eval set.
        :param y_valid: optional validation target.
        :param X_holdout: optional holdout features, added to the eval set.
        :param y_holdout: optional holdout target.
        :param params: optional dict of estimator parameters. The keys
            ``'verbose'`` and ``'early_stopping_rounds'`` are additionally
            forwarded to ``fit``; when absent, ``verbose`` falls back to
            ``True`` and ``early_stopping_rounds`` to ``None``.
        """
        # The signature advertises params=None, so None must not crash on
        # ``**params`` or on the key lookups below.
        params = {} if params is None else params
        self.model = self.model.set_params(**params)

        # The training data is always evaluated; validation/holdout are optional.
        eval_set = [(X_train, y_train)]
        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_metric='rmse',
                       verbose=params.get('verbose', True),
                       early_stopping_rounds=params.get('early_stopping_rounds'))

        scores = self.model.evals_result()
        # Keeps the LAST recorded value of every metric for every eval set.
        self.best_score_ = {k: {m: m_v[-1] for m, m_v in v.items()} for k, v in scores.items()}
        # self.best_score_ = {k: {m: n if m != 'cappa' else -n for m, n in v.items()} for k, v in self.best_score_.items()}

        self.feature_importances_ = self.model.feature_importances_

    def predict(self, X_test):
        """Return the wrapped model's predictions for ``X_test``."""
        return self.model.predict(X_test)
def objective(params):
    """Return the cross-validated mean absolute error for one parameter triple.

    Reads the module-level names ``learner_choice``, ``X``, ``y``,
    ``cv_folds`` and ``mean``.

    :param params: iterable ``(learning_rate, max_depth, n_estimators)``.
    :return: the negated mean of the ``neg_mean_absolute_error`` fold scores,
        i.e. a value where lower is better.
    :raises ValueError: if ``learner_choice`` is neither ``'xgbR'`` nor
        ``'xgbC'``.
    """
    learning_rate, max_depth, n_estimators = params

    if learner_choice == 'xgbR':
        learner = XGBRegressor()
    elif learner_choice == 'xgbC':
        learner = XGBClassifier()
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``learner``.
        raise ValueError(
            "Unsupported learner_choice %r; expected 'xgbR' or 'xgbC'" % (learner_choice,))

    learner.set_params(booster='gbtree',
                       learning_rate=learning_rate,
                       max_depth=max_depth,
                       n_estimators=n_estimators,
                       subsample=0.75)

    fold_scores = cross_val_score(learner, X, y, cv=cv_folds, n_jobs=-1,
                                  scoring="neg_mean_absolute_error")
    return -mean(fold_scores)
def xgb(X_train, X_test, y_train, y_test):
    """Fit an ``XGBRegressor`` using the best-scoring ``n_estimators`` value.

    Candidate values ``1, 11, ..., 191`` are each fitted on the training
    split and scored with ``score`` on the test split. The first candidate
    that reaches the maximum score is used to fit the regressor that is
    returned.

    :param X_train: training features.
    :param X_test: features used to score each candidate.
    :param y_train: training target.
    :param y_test: target used to score each candidate.
    :return: the regressor refitted on the training split.
    """
    candidates = np.arange(1, 200, 10)
    probe = XGBRegressor(learning_rate=0.2, objective='reg:squarederror')

    estim, scores = [], []
    for n_trees in candidates:
        probe.set_params(n_estimators=n_trees)
        probe.fit(X_train, y_train)
        scores.append(probe.score(X_test, y_test))
        estim.append(n_trees)

    xdf = pd.DataFrame({'Estimator': estim, 'Score': scores})
    top_score = max(xdf['Score'])
    winners = xdf['Estimator'][xdf['Score'] == top_score]
    # First candidate with the top score; None when nothing matches.
    best = next(iter(winners), None)

    xgbr = XGBRegressor(n_estimators=best, learning_rate=0.2, objective='reg:squarederror')
    xgbr.fit(X_train, y_train)
    return xgbr
def train_model(X_train, y_train, X_test, y_test, estimator):
    """
    Fit the estimator selected by name on the training data.

    :param X_train: Training features.
    :param y_train: Training targets.
    :param X_test: Test features (not used by this function).
    :param y_test: Test targets (not used by this function).
    :param estimator: One of 'DecisionTreeRegressor', 'SGDRegressor',
        'GradientBoostingRegressor' or 'XGBRegressor'.
    :return: model: The fitted model, or None when the estimator name is
        not one of the names above.
    """
    logger.info("Start train_model()")

    if estimator == 'DecisionTreeRegressor':
        model = DecisionTreeRegressor()
    elif estimator == 'SGDRegressor':
        model = MultiOutputRegressor(SGDRegressor())
    elif estimator == 'GradientBoostingRegressor':
        model = MultiOutputRegressor(GradientBoostingRegressor())
    elif estimator == 'XGBRegressor':
        best_params = {
            'colsample_bytree': 0.5,
            'gamma': 0.0,
            'learning_rate': 0.1,
            'max_depth': 5,
            'min_child_weight': 5,
            'n_estimators': 50,
            'nthread': -1,
            # 'num_boost_round': 45,
            'objective': 'reg:squarederror'
        }
        base_regressor = XGBRegressor(n_jobs=-1)
        base_regressor.set_params(**best_params)
        model = MultiOutputRegressor(base_regressor)
    else:
        # Unknown name: nothing is fitted.
        return None

    model.fit(X_train, y_train)
    return model
class Xgb:
    """Preprocess a DataFrame, fit an XGBoost estimator on it and predict.

    The estimator is an ``XGBClassifier`` for ``target_type='binary'`` and an
    ``XGBRegressor`` for ``target_type='linear'``.
    """

    def __init__(self, df, target_column='', id_column='', target_type='binary',
                 categorical_columns=None, drop_columns=None, numeric_columns=None,
                 num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): ``n_estimators`` given to the estimator
        - verbose (bool): verbosity of printouts
        - early_stopping_rounds (int or None): forwarded to ``xgb.cv``

        When ``df`` is not a DataFrame or ``target_column`` is empty, a
        message is printed and the instance is left only partly initialised.
        """
        if not isinstance(df, pd.DataFrame):
            print('please provide pandas dataframe')
            return
        self.df = df
        self.early_stopping_rounds = early_stopping_rounds

        if not target_column:
            print('please provide target column name')
            return
        self.target_column = target_column
        self.id_column = id_column
        self.target_type = target_type
        # None stands for "no columns"; avoids sharing one list between instances.
        self.categorical_columns = [] if categorical_columns is None else categorical_columns
        self.numeric_columns = [] if numeric_columns is None else numeric_columns
        self.drop_columns = [] if drop_columns is None else drop_columns
        self.verbose = verbose
        self.num_training_rounds = num_training_rounds

        # init the classifier
        if self.target_type == 'binary':
            self.scoring = 'auc'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='binary:logistic',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'linear':
            self.scoring = 'rmse'
            self.clf = XGBRegressor(
                n_estimators=num_training_rounds,
                objective='reg:linear'
            )

    def _cross_validate(self, xgb_param, xgtrain):
        """Run 5-fold ``xgb.cv``, trying each known verbosity keyword in turn."""
        cv_kwargs = dict(
            num_boost_round=self.clf.get_params()['n_estimators'],
            nfold=5,
            metrics=[self.scoring],
            early_stopping_rounds=self.early_stopping_rounds,
        )
        # The verbosity keyword was renamed between xgboost releases; try
        # both spellings and finally no verbosity argument at all.
        for verbosity_kwarg in ('show_progress', 'verbose_eval'):
            try:
                return xgb.cv(xgb_param, xgtrain,
                              **{verbosity_kwarg: self.verbose}, **cv_kwargs)
            except Exception:
                continue
        return xgb.cv(xgb_param, xgtrain, **cv_kwargs)

    def train(self):
        """Preprocess ``self.df``, pick the round count by CV and fit ``self.clf``.

        Sets ``self.predictors`` and prints metrics computed on the training data.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        features = self.df[self.predictors]
        target = self.df[self.target_column]
        xgtrain = xgb.DMatrix(features, label=target, missing=np.nan)
        cvresult = self._cross_validate(xgb_param, xgtrain)

        # One row per boosting round that the cross-validation kept.
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(features, target, eval_metric=self.scoring)

        # Predict training set:
        train_df_predictions = self.clf.predict(features)

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(features)[:, 1]
            print("Accuracy : %.4g" % metrics.accuracy_score(target.values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(target, train_df_predprob))
        elif self.target_type == 'linear':
            mse = metrics.mean_squared_error(target.values, train_df_predictions)
            print("Mean squared error: %f" % mse)
            print("Root mean squared error: %f" % np.sqrt(mse))

    def predict(self, test_df):
        """Preprocess ``test_df`` and return predictions for it.

        Returns positive-class probabilities for 'binary' and raw
        predictions for 'linear'. The result is also stored in
        ``self.output`` and the processed frame in ``self.test_df``.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        has_ids = self.id_column in test_df
        has_targets = self.target_column in test_df.columns
        # Keep id/target aside: preprocessing may drop them (e.g. non-numeric ids).
        if has_ids:
            ids = test_df[self.id_column]
        if has_targets:
            targets = test_df[self.target_column]

        self.test_df = self.preprocess(test_df, train=False)

        if has_ids:
            self.test_df[self.id_column] = ids
        if has_targets:
            self.test_df[self.target_column] = targets

        # Predictors that the test set lacks are filled with NaN.
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:, 1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` highest f-scores."""
        # Newer xgboost exposes get_booster(); older releases used booster().
        get_booster = getattr(self.clf, 'get_booster', None) or self.clf.booster
        fscores = get_booster().get_fscore()
        feature_importance = sorted(fscores.items(), key=operator.itemgetter(1), reverse=True)
        # Naming the columns here also works when no feature has a score.
        impt = pd.DataFrame(feature_importance, columns=['feature', 'importance'])
        print(impt[:num_print])
        if display and not impt.empty:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))

    @staticmethod
    def _count_nan(values):
        """Count entries for which ``np.isnan`` is true, skipping unsupported entries."""
        count = 0
        for value in values:
            try:
                if np.isnan(value):
                    count += 1
            except (TypeError, ValueError):
                # np.isnan rejects strings and other non-numeric objects.
                continue
        return count

    def preprocess(self, df, train=True):
        """One-hot encode, prune and coerce columns; return the new frame.

        With ``train=True`` the columns to remove (more than 60% NaN, or zero
        standard deviation) are determined and stored in
        ``self.cols_to_remove``; with ``train=False`` the stored list is reused.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(df[col]).rename(columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            n_rows = len(df)
            for col in df.columns:
                # arbitrary cutoff, if more than 60% missing then drop
                if n_rows and self._count_nan(df[col]) / float(n_rows) > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # errors='ignore': at prediction time a one-hot column seen during
        # training may simply not exist in the frame.
        df = df.drop(self.cols_to_remove, axis=1, errors='ignore')

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping (some may already be gone)
        df = df.drop(self.drop_columns, axis=1, errors='ignore')

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                continue
            if self.verbose:
                print('dropping because not int, float, or bool:', col)
            df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        """Return ``int(num)``, or None when the conversion fails."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the conversion fails."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        headers = [self.id_column, self.target_column]
        if include_actual:
            headers.append('actual')
        ids = self.test_df[self.id_column]
        actuals = self.test_df[self.target_column] if include_actual else None

        # csv.writer needs a text-mode file opened with newline=''.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                # Positional lookup: predictions are ordered like the rows
                # of test_df, whatever its index labels are.
                to_write = [ids.iloc[idx], value]
                if include_actual:
                    to_write.append(actuals.iloc[idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        """Dump this whole object to ``filename`` with joblib."""
        joblib.dump(self, filename)
subsample=0.8, colsample_bytree=0.8, objective='reg:gamma', nthread=4, scale_pos_weight=1, seed=1024) #####parameter 1max_depth xgb_param = xgb1.get_xgb_params() cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='rmse', early_stopping_rounds=50) xgb1.set_params(n_estimators=cvresult.shape[0]) param_test1 = { 'max_depth':[3,4,5,6,7], 'min_child_weight':[3,4,5,6,7] } gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=1000, gamma=0, subsample=0.8,
gamma=0.15, reg_alpha=2.5, reg_lambda=10, subsample=1, colsample_bytree=0.1, colsample_bylevel=0.1, objective= 'reg:logistic', nthread=-1, scale_pos_weight=1, tree_method= 'gpu_exact', gpu_id= 0, seed=0) print('start cv') result=xgb.cv(params, xgb.DMatrix(train_X,label=train_y), num_boost_round=1000, nfold=8, stratified=False, maximize=False, early_stopping_rounds=10,as_pandas=True, verbose_eval=None, show_stdv=True, seed=0, callbacks=None, shuffle=True) model.set_params(n_estimators=int(result.shape[0])) model.fit(train_X, train_y) print('start predict') y=[] X=train_X[-1,:].reshape(1,train_X.shape[1]) X = np.concatenate((X, train_y[-1].reshape(1, 1)), axis=1) for i in range(len(test)): X=np.concatenate((X[:,3:],test[i].reshape(1,2)), axis=1).reshape((1, train_X.shape[1])) y.append(model.predict(X)[0]*(1+i/len(test))) X = np.concatenate((X.reshape(1,train_X.shape[1]), y[-1].reshape(1, 1)), axis=1) print(y) # yhat = model.predict(test_X) # print(np.shape(yhat)) # test_X = test_X.reshape((test_X.shape[0], test_X.shape[2])) # # invert scaling for forecast inv_yhat = scaler.inverse_transform(y)
def instantiate_model(self, params):
    """Return a new ``XGBRegressor`` configured with ``params``.

    :param params: mapping of estimator parameters, forwarded to
        ``set_params``.
    """
    regressor = XGBRegressor()
    regressor.set_params(**params)
    return regressor
'min_child_weight':range(4,11,1) } gsearch2 = GridSearchCV(estimator = xgb1, param_grid = param_test2, scoring='r2', n_jobs=-1, iid=False, cv=5) gsearch2.fit(train[predictors],y) gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_ # max_depth: 2, min_child_weight: 9 param_test2b = { 'min_child_weight':range(19, 31) } xgb1.set_params(max_depth = 2) gsearch2b = GridSearchCV(estimator = xgb1, param_grid = param_test2b, scoring='r2', n_jobs=-1, iid=False, cv=5) gsearch2b.fit(train[predictors],y) gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_ # max_depth = 2, min_child_weight = 27 param_test2c = { 'max_depth': [2,3,4,5], 'min_child_weight':range(4,31) } gsearch2c = GridSearchCV(estimator = xgb1,
class Xgb:
    """Preprocess a DataFrame, fit XGBoost on it (optionally on random
    sub-samples) and predict.

    The estimator is an ``XGBClassifier`` for ``target_type`` 'binary' or
    'multiclass' and an ``XGBRegressor`` for 'linear'. Every fitted state is
    saved to ``<prefix>_<idx>.pkl``.
    """

    def __init__(self, df, target_column='', id_column='', target_type='binary',
                 categorical_columns=None, drop_columns=None, numeric_columns=None,
                 num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1,
                 early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear', 'binary' or 'multiclass'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): ``n_estimators`` given to the estimator
        - verbose (bool): verbosity of printouts
        - sample_fraction (float): fraction of rows in each random sample
        - n_samples (int): number of random samples to train on
        - early_stopping_rounds (int or None): forwarded to ``xgb.cv``
        - prefix (string): file name prefix of the saved models
        - scoring (string or None): overrides the default CV metric

        When ``df`` is not a DataFrame or ``target_column`` is empty, a
        message is printed and the instance is left only partly initialised.
        """
        # check if data is dataframe
        if not isinstance(df, pd.DataFrame):
            print('please provide pandas dataframe')
            return

        # checks for sampling; the fraction is settled first so that the
        # n_samples clamp sees its final value
        sample_fraction = float(sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        if sample_fraction > 1:
            sample_fraction = 1.0
        # if sample_fraction results in a sample smaller than one row
        # (skipped for an empty frame, which would divide by zero)
        if len(df) > 0 and round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0 / len(df)
        if sample_fraction * n_samples > 1:
            n_samples = int(round(1.0 / sample_fraction))

        self.df = df
        self.early_stopping_rounds = early_stopping_rounds

        if not target_column:
            print('please provide target column name')
            return
        self.target_column = target_column
        self.id_column = id_column
        self.target_type = target_type
        # None stands for "no columns"; avoids sharing one list between instances.
        self.categorical_columns = [] if categorical_columns is None else categorical_columns
        self.numeric_columns = [] if numeric_columns is None else numeric_columns
        self.drop_columns = [] if drop_columns is None else drop_columns
        self.verbose = verbose
        self.sample_fraction = sample_fraction
        self.n_samples = n_samples
        self.num_training_rounds = num_training_rounds
        self.prefix = prefix

        # init the classifier:
        if self.target_type == 'binary':
            self.scoring = 'auc'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='binary:logistic',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'multiclass':
            self.scoring = 'merror'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='multi:softmax',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'linear':
            self.scoring = 'rmse'
            self.clf = XGBRegressor(
                n_estimators=num_training_rounds,
                objective='reg:linear'
            )
        # if preferred scoring metric is stated:
        if scoring:
            self.scoring = scoring

    def _cross_validate(self, xgb_param, xgtrain, target):
        """Run 5-fold ``xgb.cv``, trying each known verbosity keyword in turn."""
        cv_kwargs = dict(
            # Always start from the configured budget: n_estimators on the
            # estimator is overwritten after every sample.
            num_boost_round=self.num_training_rounds,
            nfold=5,
            metrics=[self.scoring],
            early_stopping_rounds=self.early_stopping_rounds,
        )
        # The verbosity keyword was renamed between xgboost releases.
        for verbosity_kwarg in ('show_progress', 'verbose_eval'):
            try:
                return xgb.cv(xgb_param, xgtrain,
                              **{verbosity_kwarg: self.verbose}, **cv_kwargs)
            except Exception:
                continue
        # Last attempt: state the number of classes and drop the verbosity argument.
        xgb_param['num_class'] = len(target.unique())
        return xgb.cv(xgb_param, xgtrain, **cv_kwargs)

    def train(self):
        """Preprocess ``self.df`` and fit ``self.clf`` on every sample.

        With ``sample_fraction == 1.0`` the whole frame is the only sample.
        After each fit, training metrics are printed and the object is saved
        to ``<prefix>_<idx>.pkl``.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        print(df_list)

        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) + ', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            target = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=target, missing=np.nan)
            cvresult = self._cross_validate(xgb_param, xgtrain, target)

            # One row per boosting round that the cross-validation kept.
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(features, target, eval_metric=self.scoring)

            # Predict training set:
            train_df_predictions = self.clf.predict(features)

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(features)[:, 1]
                print("Accuracy : %.4g" % metrics.accuracy_score(target.values, train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, train_df_predprob))
            elif self.target_type == 'linear':
                mse = metrics.mean_squared_error(target.values, train_df_predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))

            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def _predict_with(self, model):
        """Predict the processed test set with ``model.clf``.

        Returns None for target types other than 'binary' and 'linear'.
        """
        features = self.test_df[self.predictors]
        if self.target_type == 'binary':
            return model.clf.predict_proba(features)[:, 1]
        if self.target_type == 'linear':
            return model.clf.predict(features)
        return None

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        """Preprocess ``test_df`` and return predictions for it.

        With ``n_samples == 1`` this object's own estimator is used;
        otherwise every saved ``<prefix>_<idx>.pkl`` is loaded and the
        predictions are averaged.

        - return_multi_outputs: return the list of per-model outputs
        - return_mean_std: return ``(mean, std)`` over the per-model outputs
        """
        print('### predicting ###')
        print('## preprocessing test set')
        has_ids = self.id_column in test_df
        has_targets = self.target_column in test_df.columns
        # Keep id/target aside: preprocessing may drop them (e.g. non-numeric ids).
        if has_ids:
            ids = test_df[self.id_column]
        if has_targets:
            targets = test_df[self.target_column]

        self.test_df = self.preprocess(test_df, train=False)

        if has_ids:
            self.test_df[self.id_column] = ids
        if has_targets:
            self.test_df[self.target_column] = targets

        # Predictors that the test set lacks are filled with NaN.
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        if self.n_samples == 1:
            output = self._predict_with(self)
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except TypeError:
                # output is None for unsupported target types
                self.multi_outputs = None
        else:
            for idx in range(self.n_samples):
                filename = self.prefix + '_' + str(idx) + '.pkl'
                try:
                    model = self.load(filename)
                except IOError:
                    print('no file found, skipping')
                    continue
                output_list.append(list(self._predict_with(model)))
            # average the outputs if n_samples is more than one
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list

        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            # Use the collected outputs so a single model gives a zero
            # spread instead of the std of an empty list.
            spread_source = self.multi_outputs if self.multi_outputs is not None else output_list
            return (self.output, np.std(spread_source, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` highest f-scores."""
        # Newer xgboost exposes get_booster(); older releases used booster().
        get_booster = getattr(self.clf, 'get_booster', None) or self.clf.booster
        fscores = get_booster().get_fscore()
        feature_importance = sorted(fscores.items(), key=operator.itemgetter(1), reverse=True)
        # Naming the columns here also works when no feature has a score.
        impt = pd.DataFrame(feature_importance, columns=['feature', 'importance'])
        print(impt[:num_print])
        if display and not impt.empty:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))

    @staticmethod
    def _count_nan(values):
        """Count entries for which ``np.isnan`` is true, skipping unsupported entries."""
        count = 0
        for value in values:
            try:
                if np.isnan(value):
                    count += 1
            except (TypeError, ValueError):
                # np.isnan rejects strings and other non-numeric objects.
                continue
        return count

    def preprocess(self, df, train=True):
        """One-hot encode, prune and coerce columns; return the new frame.

        With ``train=True`` the columns to remove (more than 60% NaN, or zero
        standard deviation) are determined and stored in
        ``self.cols_to_remove``; with ``train=False`` the stored list is reused.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(df[col]).rename(columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            n_rows = len(df)
            for col in df.columns:
                # arbitrary cutoff, if more than 60% missing then drop
                if n_rows and self._count_nan(df[col]) / float(n_rows) > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # errors='ignore': at prediction time a one-hot column seen during
        # training may simply not exist in the frame.
        df = df.drop(self.cols_to_remove, axis=1, errors='ignore')

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            # A column removed above can no longer be converted.
            if col in self.cols_to_remove:
                continue
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)

        # drop those marked for dropping (some may already be gone)
        df = df.drop(self.drop_columns, axis=1, errors='ignore')

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                continue
            if self.verbose:
                print('dropping because not int, float, or bool:', col)
            df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        """Return ``int(num)``, or None when the conversion fails."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the conversion fails."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)

        The rows are shuffled once and cut into consecutive chunks, so the
        returned samples never share a row. The last chunk may be shorter.
        """
        print('constructing random samples')
        num_rows = len(df)
        # At least one row per sample; a length of 0 could never use up the rows.
        len_sample = max(1, int(round(fraction * num_rows)))

        # create list of slice index lists
        indices = list(range(num_rows))
        random.shuffle(indices)
        slice_list = [indices[start:start + len_sample]
                      for start in range(0, num_rows, len_sample)]
        if n_samples is not None:
            slice_list = slice_list[:max(int(n_samples), 0)]

        # get slices; positional lookup works for any index labels
        return [df.iloc[rows, :] for rows in slice_list]

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them

        A failure while writing the data rows is reported with a printed
        message instead of an exception.
        """
        headers = [self.id_column, self.target_column]
        if include_actual:
            headers.append('actual')

        # csv.writer needs a text-mode file opened with newline=''.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)
            try:
                ids = self.test_df[self.id_column]
                actuals = self.test_df[self.target_column] if include_actual else None
                for idx, value in enumerate(self.output):
                    # Positional lookup: predictions are ordered like the
                    # rows of test_df, whatever its index labels are.
                    to_write = [ids.iloc[idx], value]
                    if include_actual:
                        to_write.append(actuals.iloc[idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except Exception:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        """Dump this whole object to ``filename`` with joblib."""
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        """Load and return an object previously written by ``save``."""
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        return joblib.load(model_file)
def xgbm_model_fit(random_search_flag, x_train, y_train, x_test, y_test, modeltype,
                   multi_label, log_y, num_boost_round=100):
    """Tune or train an XGBoost model for single- or multi-label problems.

    What is returned depends on the flags:

    - ``multi_label`` and ``random_search_flag``: the ``best_estimator_`` of
      a ``RandomizedSearchCV`` over a multi-output XGBoost estimator.
    - ``multi_label`` without search: ``num_boost_round`` itself after
      calling ``fit`` on it.
    - single label with ``random_search_flag``: the ``argmin`` position of
      the cross-validated test metric.
    - single label without search: the booster returned by ``xgb.train``,
      or ``None`` when training fails.

    :param random_search_flag: run the hyper-parameter / round search.
    :param x_train: training features.
    :param y_train: training target(s).
    :param x_test: not used by this function.
    :param y_test: not used by this function.
    :param modeltype: 'Regression', 'Binary_Classification', or anything
        else for multi-class classification.
    :param multi_label: whether the target has several label columns.
    :param log_y: not used by this function.
    :param num_boost_round: number of boosting rounds; in the multi-label,
        no-search path it is used as the estimator to fit.
    """
    start_time = time.time()
    # Defined up front so the final ``return model`` can never hit an
    # unbound local when training fails.
    model = None
    if multi_label and not random_search_flag:
        # NOTE(review): this path calls ``.fit`` on num_boost_round, so the
        # caller presumably passes an estimator here - confirm.
        model = num_boost_round

    if modeltype == 'Regression':
        objective = 'reg:squarederror'
        eval_metric = 'rmse'
        shuffle = False
        stratified = False
        num_class = 0
        score_name = 'Score'
        scale_pos_weight = 1
    elif modeltype == 'Binary_Classification':
        objective = 'binary:logistic'
        eval_metric = 'error'  ## dont foolishly change to auc or aucpr since it doesnt work in finding feature imps later
        shuffle = True
        stratified = True
        num_class = 1
        score_name = 'Error Rate'
        scale_pos_weight = get_scale_pos_weight(y_train)
    else:
        objective = 'multi:softprob'
        eval_metric = 'merror'  ## dont foolishly change to auc or aucpr since it doesnt work in finding feature imps later
        shuffle = True
        stratified = True
        if multi_label:
            num_class = y_train.nunique().max()
        elif isinstance(y_train, np.ndarray):
            num_class = np.unique(y_train).max() + 1
        elif isinstance(y_train, pd.Series):
            num_class = y_train.nunique()
        else:
            num_class = y_train.nunique().max()
        score_name = 'Multiclass Error Rate'
        scale_pos_weight = 1  ### use sample_weights in multi-class settings ##

    final_params = {
        'booster': 'gbtree',
        'colsample_bytree': 0.5,
        'alpha': 0.015,
        'gamma': 4,
        'learning_rate': 0.01,
        'max_depth': 8,
        'min_child_weight': 2,
        'reg_lambda': 0.5,
        'subsample': 0.7,
        'random_state': 99,
        'objective': objective,
        'eval_metric': eval_metric,
        'verbosity': 0,
        'n_jobs': -1,
        'scale_pos_weight': scale_pos_weight,
        'num_class': num_class,
        'silent': True
    }

    ####### This is where we split into single and multi label ############
    if multi_label:
        ###### This is for Multi_Label problems ############
        if not random_search_flag:
            try:
                model.fit(x_train, y_train)
            except Exception:
                print('Multi_label XGBoost model is crashing during training. Please check your inputs and try again...')
            return model

        rand_params = {
            'estimator__learning_rate': [0.1, 0.5, 0.01, 0.05],
            'estimator__n_estimators': [50, 100, 150, 200, 250],
            'estimator__gamma': [2, 4, 8, 16, 32],
            'estimator__max_depth': [3, 5, 8, 12],
        }
        if modeltype == 'Regression':
            clf = XGBRegressor(n_jobs=-1, random_state=999, max_depth=6)
            clf.set_params(**final_params)
            model = MultiOutputRegressor(clf, n_jobs=-1)
            scoring = 'neg_mean_squared_error'
        else:
            clf = XGBClassifier(n_jobs=-1, random_state=999, max_depth=6)
            clf.set_params(**final_params)
            model = MultiOutputClassifier(clf, n_jobs=-1)
            scoring = 'precision'
        model = RandomizedSearchCV(model,
                                   param_distributions=rand_params,
                                   n_iter=15,
                                   return_train_score=True,
                                   random_state=99,
                                   n_jobs=-1,
                                   cv=3,
                                   refit=True,
                                   scoring=scoring,
                                   verbose=False)
        model.fit(x_train, y_train)
        print('Time taken for Hyper Param tuning of multi_label XGBoost (in minutes) = %0.1f' % (
            (time.time() - start_time) / 60))
        cv_results = pd.DataFrame(model.cv_results_)
        print('Mean cross-validated test %s = %0.04f' % (score_name, cv_results['mean_test_score'].mean()))
        ### In this case, there is no boost rounds so just return the default num_boost_round
        return model.best_estimator_

    #### This is for Single Label Problems #############
    if modeltype == 'Multi_Classification':
        wt_array = get_sample_weight_array(y_train)
        dtrain = xgb.DMatrix(x_train, label=y_train, weight=wt_array)
    else:
        dtrain = xgb.DMatrix(x_train, label=y_train)

    ######## Now let's perform randomized search to find best hyper parameters ######
    if random_search_flag:
        cv_results = xgb.cv(final_params, dtrain, num_boost_round=num_boost_round, nfold=5,
                            stratified=stratified, metrics=eval_metric, early_stopping_rounds=10,
                            seed=999, shuffle=shuffle)
        # Update best eval_metric
        best_eval = 'test-' + eval_metric + '-mean'
        best_metric = cv_results[best_eval].min()
        # NOTE(review): argmin is a 0-based position; if callers use it as a
        # round count it may be one too small - confirm against the caller.
        boost_rounds = cv_results[best_eval].argmin()
        print("Cross-validated %s = %0.3f in num rounds = %s" % (score_name, best_metric, boost_rounds))
        print('Time taken for Hyper Param tuning of XGBoost (in minutes) = %0.1f' % (
            (time.time() - start_time) / 60))
        return boost_rounds

    try:
        model = xgb.train(
            final_params,
            dtrain,
            num_boost_round=num_boost_round,
            verbose_eval=False,
        )
    except Exception:
        # Best effort: report the failure and hand back None.
        print('XGBoost model is crashing. Please check your inputs and try again...')
    return model
class Xgb:
    """Preprocess a pandas DataFrame, then train and apply xgboost models.

    Preprocessing one-hot encodes the categorical columns and drops sparse,
    constant and non-numeric columns.  Training optionally splits the rows
    into disjoint random samples, trains one model per sample (the number of
    boosting rounds is chosen with ``xgb.cv``) and pickles the wrapper to
    ``<prefix>_<idx>.pkl`` after each sample.
    """

    # A column is dropped during training when more than this share of its
    # values is NaN (arbitrary cutoff).
    _SPARSITY_CUTOFF = 0.6

    def __init__(self,
                 df,
                 target_column='',
                 id_column='',
                 target_type='binary',
                 categorical_columns=None,
                 drop_columns=None,
                 numeric_columns=None,
                 num_training_rounds=500,
                 verbose=1,
                 sample_fraction=1.0,
                 n_samples=1,
                 early_stopping_rounds=None,
                 prefix='xgb_model',
                 scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'binary', 'multiclass' or 'linear'
        - categorical_columns (list): list of column names of categorical
          data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): upper bound on the boosting rounds
        - verbose (bool): verbosity of printouts
        - sample_fraction (float): share of the rows in each random sample
        - n_samples (int): number of random samples (one model per sample)
        - early_stopping_rounds (int): forwarded to ``xgb.cv``
        - prefix (string): file name prefix of the pickled models
        - scoring (string): evaluation metric overriding the default one

        Invalid input is reported with a printed message instead of an
        exception; the instance is then left only partially initialised.
        """
        # Validate the type first: the sampling checks below need len(df).
        if not isinstance(df, pd.DataFrame):
            print('please provide pandas dataframe')
            return

        self.df = df
        self.early_stopping_rounds = early_stopping_rounds
        if not target_column:
            print('please provide target column name')
            return

        sample_fraction, n_samples = self._normalise_sampling(
            sample_fraction, n_samples, len(df))

        self.target_column = target_column
        self.id_column = id_column
        self.target_type = target_type
        # None instead of [] as default: a list default would be shared by
        # every instance created without that argument.
        self.categorical_columns = (
            [] if categorical_columns is None else categorical_columns)
        self.numeric_columns = (
            [] if numeric_columns is None else numeric_columns)
        self.drop_columns = [] if drop_columns is None else drop_columns
        self.verbose = verbose
        self.sample_fraction = sample_fraction
        self.n_samples = n_samples
        self.num_training_rounds = num_training_rounds
        self.prefix = prefix

        # init the classifier; an unknown target_type leaves self.clf unset
        if self.target_type == 'binary':
            self.scoring = 'auc'
            self.clf = XGBClassifier(learning_rate=0.1,
                                     n_estimators=num_training_rounds,
                                     subsample=0.8,
                                     colsample_bytree=0.8,
                                     objective='binary:logistic',
                                     scale_pos_weight=1,
                                     seed=123)
        elif self.target_type == 'multiclass':
            self.scoring = 'merror'
            self.clf = XGBClassifier(learning_rate=0.1,
                                     n_estimators=num_training_rounds,
                                     subsample=0.8,
                                     colsample_bytree=0.8,
                                     objective='multi:softmax',
                                     scale_pos_weight=1,
                                     seed=123)
        elif self.target_type == 'linear':
            self.scoring = 'rmse'
            # NOTE(review): newer xgboost releases name this objective
            # 'reg:squarederror' - confirm against the installed version.
            self.clf = XGBRegressor(n_estimators=num_training_rounds,
                                    objective='reg:linear')

        # if preferred scoring metric is stated:
        if scoring:
            self.scoring = scoring

    @staticmethod
    def _normalise_sampling(sample_fraction, n_samples, num_rows):
        """Clamp the sampling settings so that every sample is non-empty."""
        sample_fraction = float(sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        if sample_fraction > 1:
            sample_fraction = 1.0
        # a sample must hold at least one row (skipped for an empty frame,
        # which would otherwise divide by zero)
        if num_rows > 0 and round(sample_fraction * num_rows) == 0:
            sample_fraction = 1.0 / num_rows
        # the samples are disjoint, so at most 1/fraction of them exist;
        # checked last so it sees the corrected fraction
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0 / sample_fraction)
        return sample_fraction, n_samples

    def train(self):
        """Preprocess ``self.df`` and fit one model per random sample.

        After each sample the wrapper is pickled to ``<prefix>_<idx>.pkl``
        and training-set metrics are printed.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        excluded = (self.target_column, self.id_column)
        self.predictors = [x for x in self.df.columns if x not in excluded]
        xgb_param = self.clf.get_xgb_params()
        # Remember the configured cap: n_estimators is overwritten with the
        # cross-validated round count below and must not limit later samples.
        max_rounds = self.clf.get_params()['n_estimators']

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
        print(df_list)

        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            labels = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)

            if self.target_type == 'multiclass':
                # the native cv API needs the class count spelled out
                xgb_param['num_class'] = len(labels.unique())
            cv_kwargs = {
                'num_boost_round': max_rounds,
                'nfold': 5,
                'metrics': [self.scoring],
                'early_stopping_rounds': self.early_stopping_rounds,
            }
            try:
                cvresult = xgb.cv(xgb_param, xgtrain,
                                  verbose_eval=self.verbose, **cv_kwargs)
            except TypeError:
                # fallback for xgboost releases that only know the older
                # ``show_progress`` keyword
                cvresult = xgb.cv(xgb_param, xgtrain,
                                  show_progress=self.verbose, **cv_kwargs)
            self.clf.set_params(n_estimators=cvresult.shape[0])

            print('fitting model')
            self.clf.fit(features, labels, eval_metric=self.scoring)

            # Predict training set:
            train_df_predictions = self.clf.predict(features)
            if self.target_type in ('binary', 'multiclass'):
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    labels.values, train_df_predictions))
                if self.target_type == 'binary':
                    # probability of the positive class; only meaningful
                    # (and only needed) for the binary AUC
                    train_df_predprob = self.clf.predict_proba(features)[:, 1]
                    print("AUC Score (Train): %f" %
                          metrics.roc_auc_score(labels, train_df_predprob))
            elif self.target_type == 'linear':
                mse = metrics.mean_squared_error(labels.values,
                                                 train_df_predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))

            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def _predict_scores(self, clf, features):
        """Return clf's output for ``features``, or None for target types
        other than 'binary' and 'linear'."""
        if self.target_type == 'binary':
            return clf.predict_proba(features)[:, 1]
        if self.target_type == 'linear':
            return clf.predict(features)
        return None

    def predict(self, test_df, return_multi_outputs=False,
                return_mean_std=False):
        """Preprocess ``test_df`` and predict with the trained model(s).

        With ``n_samples == 1`` the in-memory model is used; otherwise the
        models pickled by ``train`` are loaded and their outputs averaged.
        Only the 'binary' and 'linear' target types produce an output.

        Returns ``self.multi_outputs`` if ``return_multi_outputs``, else
        ``(mean, std)`` if ``return_mean_std``, else the (mean) output.
        Mean and std are None when no model produced an output.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        # preprocessing may drop the id/target columns, so keep copies
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        # predictors unseen in the test data are filled with NaN
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        features = self.test_df[self.predictors]
        output_list = []
        if self.n_samples == 1:
            output = self._predict_scores(self.clf, features)
            self.output = output
            self.multi_outputs = None if output is None else [list(output)]
        else:
            for idx in range(self.n_samples):
                filename = self.prefix + '_' + str(idx) + '.pkl'
                try:
                    xgb_load = self.load(filename)
                except IOError:
                    print('no file found, skipping')
                    continue
                output = self._predict_scores(xgb_load.clf, features)
                if output is not None:
                    output_list.append(list(output))
            # average the outputs of the individual models
            self.output = np.mean(output_list, axis=0) if output_list else None
            self.multi_outputs = output_list

        if return_multi_outputs:
            return self.multi_outputs
        if return_mean_std:
            # use multi_outputs so a single model reports a spread of 0
            # instead of the NaN of an empty list
            spread = (np.std(self.multi_outputs, axis=0)
                      if self.multi_outputs else None)
            return (self.output, spread)
        return self.output

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` top f-scores."""
        # ``booster()`` is the accessor of old xgboost releases; newer ones
        # expose ``get_booster()`` instead.
        get_booster = getattr(self.clf, 'get_booster', None)
        booster = get_booster() if callable(get_booster) else self.clf.booster()
        ranked = sorted(booster.get_fscore().items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        # passing the column names here also works for an empty score table
        impt = pd.DataFrame(ranked, columns=['feature', 'importance'])
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature",
                                  "importance",
                                  kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        """Return a numeric-only copy of ``df`` ready for xgboost.

        With ``train=True`` the columns to remove (sparse or constant) are
        determined and stored in ``self.cols_to_remove``; with
        ``train=False`` the stored list is re-applied.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(df[col]).rename(
                columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except (TypeError, ValueError):
                        # non-numeric value: cannot be NaN
                        pass
                if len(df[col]) and \
                        nan_cnt / float(len(df[col])) > self._SPARSITY_CUTOFF:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                # compare by value: identity of equal strings is not reliable
                if col == self.target_column:
                    continue
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # errors='ignore': a test set may lack some of the training columns
        # (e.g. a one-hot level that does not occur in it)
        df = df.drop(self.cols_to_remove, axis=1, errors='ignore')
        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in df.columns:
                # already removed above
                continue
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)

        df = df.drop(self.drop_columns, axis=1, errors='ignore')

        # drop all those that are object type
        print('## dropping non-numerical columns')
        non_numeric = []
        for col, dtype in df.dtypes.items():
            # accept every numeric/bool dtype, not just int64/float64:
            # get_dummies yields uint8 columns on older pandas releases
            if pd.api.types.is_numeric_dtype(dtype):
                continue
            if self.verbose:
                print('dropping because not int, float, or bool:', col)
            non_numeric.append(col)
        df = df.drop(non_numeric, axis=1)
        return df

    def _to_int(self, num):
        """Return ``int(num)``, or None when the conversion fails."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the conversion fails."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample
          (default = 0.2)

        Returns a list of disjoint DataFrames; the last one may be shorter.
        """
        print('constructing random samples')
        num_rows = len(df)
        # at least one row per sample, otherwise no progress would be made
        len_sample = max(1, round(fraction * num_rows))

        # create list of slice index lists
        indices = list(range(num_rows))
        print('INDICES', indices)
        random.shuffle(indices)
        slice_list = [
            indices[start:start + len_sample]
            for start in range(0, num_rows, len_sample)
        ]

        if n_samples is None:
            n_samples = len(slice_list)
        # the indices are row positions, hence iloc rather than loc
        return [df.iloc[rows, :] for rows in slice_list[:max(n_samples, 0)]]

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we
          want to print them
        """
        headers = [self.id_column, self.target_column]
        if include_actual:
            headers.append('actual')
        # the csv module needs a text-mode file opened with newline=''
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)
            try:
                ids = self.test_df[self.id_column]
                actuals = (self.test_df[self.target_column]
                           if include_actual else None)
                for idx, value in enumerate(self.output):
                    # positional access: the test index need not be 0..n-1
                    to_write = [ids.iloc[idx], value]
                    if include_actual:
                        to_write.append(actuals.iloc[idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except Exception as err:
                # best effort, as before, but say what went wrong
                print('write_csv failed: %s' % err)

    def save(self, filename='xgb.pkl'):
        """Pickle this wrapper (including ``self.df``) with joblib."""
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        """Return the wrapper stored in ``model_file``.

        NOTE: joblib.load unpickles arbitrary objects; only load files
        that come from a trusted source.
        """
        loaded = joblib.load(model_file)
        return loaded
'max_depth': [1,2,3,4], 'min_child_weight':range(4,12,1) } gsearch2 = GridSearchCV(estimator = xgb1, param_grid = param_test2, scoring='r2', n_jobs=-1, iid=False, cv=5) gsearch2.fit(train[predictors],train['y']) gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_ param_test2b = { 'min_child_weight':range(11, 30) } xgb1.set_params(max_depth = 3) gsearch2b = GridSearchCV(estimator = xgb1, param_grid = param_test2b, scoring='r2', n_jobs=-1, iid=False, cv=5) gsearch2b.fit(train[predictors],train['y']) gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_ # max_depth = 3, min_child_weight = 17 xgb1.set_params(min_child_weight = 17) param_test3 = { 'gamma': [x / 10 for x in range(11, 30)] }
# step 2.2: fine tune max_depth, min_child_weight param_test2 = {'max_depth': [2, 3, 4], 'min_child_weight': range(4, 7, 1)} gs2 = GridSearchCV(xgb1, param_grid=param_test2, scoring='neg_mean_squared_error', n_jobs=-1, iid=False, cv=outer_cv) gs2.fit(X, y) print_grid_scores(gs2) print('Best parameters: %r' % gs2.best_params_) print('Best mean test RMSE: %.4f' % (np.sqrt(-gs2.best_score_))) xgb1.set_params(max_depth=3, min_child_weight=4) # step 3: tune gamma #param_test3 = { # 'gamma':[i/10.0 for i in range(0,5)] #} #param_test3 = { # 'gamma':[i/100.0 for i in range(0,10)] #} param_test3 = {'gamma': range(6)} gs3 = GridSearchCV(xgb1, param_grid=param_test3, scoring='neg_mean_squared_error',
reg_lambda=1, # [默认是1] 权重的L2正则化项 max_depth=10, # [默认是6] 树的最大深度,这个值也是用来避免过拟合的3-10 min_child_weight= 1, # [默认是1]决定最小叶子节点样本权重和。当它的值较大时,可以避免模型学习到局部的特殊样本。但如果这个值过高,会导致欠拟合。 n_jobs=1) """ dtrain = xgb.DMatrix(X_train, y_train) xgb_params = clf.get_xgb_params() cvresult = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=2000, early_stopping_rounds=50) #clf_xgb = xgb.train(xgb_params, dtrain, num_boost_round=cvresult.shape[0]) #fscore = clf_xgb.get_fscore() #print(cvresult.shape[0], fscore) print(cvresult.shape[0]) """ clf.set_params(n_estimators=28) """ param_test1 = { 'max_depth': [i for i in range(3, 12, 2)], 'min_child_weight': [i for i in range(1, 10, 2)] } best_max_depth = 5 best_min_child_weight = 1 param_test2 = { 'max_depth': [best_max_depth-1,best_max_depth,best_max_depth+1], 'min_child_weight': [best_min_child_weight,best_min_child_weight+1] } """ clf.set_params(max_depth=5, min_child_weight=1) """ param_test3 = {
gamma=0.1, reg_alpha=2.5, reg_lambda=5, subsample=0.8, colsample_bytree=0.5, objective= 'reg:logistic', nthread=-1, scale_pos_weight=1, silent=True, tree_method= 'gpu_exact', gpu_id= 0, seed=0) X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2, random_state=0) result=xgb.cv(params, xgb.DMatrix(X,label=y), num_boost_round=1000, nfold=8, stratified=False, folds=None,maximize=False, early_stopping_rounds=10,as_pandas=True, verbose_eval=None, show_stdv=True, seed=0, callbacks=None, shuffle=True,feval=loss) xgb1.set_params(n_estimators=result.shape[0]) xgb1.fit(X_train, Y_train) preds=xgb1.predict(X_train)*(maxy-miny)+miny # preds=preds+(preds-np.mean(preds))*0.5 cha=(preds - Y_train*(maxy-miny)-miny) print('train',np.dot(cha,cha.T)/len(cha)) preds=xgb1.predict(X_test)*(maxy-miny)+miny # preds=preds+(preds-np.mean(preds))*0.5 cha=(preds - Y_test*(maxy-miny)-miny) print('test',np.dot(cha,cha.T)/len(cha)) print(np.min(preds),np.max(preds),np.mean(preds))
def run_find(x_train, y_train, i, x_predict):
    """Tune an XGBRegressor stage by stage, fit it and predict ``x_predict``.

    Each stage runs a coarse search followed by a fine search around the
    coarse optimum (both through ``find_params``); the fine result is written
    back to the estimator.  The number of estimators is taken from
    ``modelfit`` before the searches and again after the learning rate has
    been lowered.

    :param x_train: training features
    :param y_train: training targets
    :param i: model identifier, only used in the progress printouts
    :param x_predict: features to predict after tuning
    :return: ``(y_predict, fscore)`` where ``fscore`` is the second value
        returned by the final ``modelfit`` call
    """
    # starting point for the search
    clf = XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,  # lower values reduce over-fitting; 0.01-0.2 is typical
        gamma=0,  # minimum loss reduction needed to split a node; larger is more conservative
        subsample=0.8,  # share of rows sampled per tree
        colsample_bytree=0.8,  # share of features sampled per tree
        reg_alpha=1,  # L1 regularisation on the weights
        reg_lambda=1,  # L2 regularisation on the weights
        max_depth=10,  # maximum tree depth, limits over-fitting
        min_child_weight=1,  # minimum sum of instance weight in a leaf; too high under-fits
    )
    nums, fscore = modelfit(clf,
                            x_train,
                            y_train,
                            cv_folds=5,
                            early_stopping_rounds=30,
                            feval=evalerror)
    print('test_estimators:', nums)
    clf.set_params(n_estimators=nums)

    def search(grid):
        """Run one grid search on clf, report it and return the best params."""
        best_params, best_score = find_params(grid, clf, x_train, y_train)
        print('model', i, ':')
        print(best_params, ':best_score:', best_score)
        return best_params

    # 1. max_depth and min_child_weight, the two most influential parameters
    ## coarse
    best_params = search({
        'max_depth': list(range(3, 12, 2)),
        'min_child_weight': list(range(1, 10, 2)),
    })
    ## fine
    max_d = best_params['max_depth']
    min_cw = best_params['min_child_weight']
    best_params = search({
        'max_depth': [max_d - 1, max_d, max_d + 1],
        'min_child_weight': [min_cw - 1, min_cw, min_cw + 1],
    })
    clf.set_params(max_depth=best_params['max_depth'],
                   min_child_weight=best_params['min_child_weight'])

    # 2. gamma
    ## coarse
    best_params = search({'gamma': [step / 10.0 for step in range(0, 10, 2)]})
    ## fine
    b_gamma = best_params['gamma']
    best_params = search({'gamma': [b_gamma, b_gamma + 0.1, b_gamma + 0.2]})
    clf.set_params(gamma=best_params['gamma'])

    # 3. subsample and colsample_bytree
    ## coarse
    best_params = search({
        'subsample': [step / 10.0 for step in range(6, 10)],
        'colsample_bytree': [step / 10.0 for step in range(6, 10)],
    })
    ## fine
    b_subsample = best_params['subsample']
    b_colsample_bytree = best_params['colsample_bytree']
    best_params = search({
        'subsample': [b_subsample - 0.05, b_subsample, b_subsample + 0.05],
        'colsample_bytree': [
            b_colsample_bytree - 0.05, b_colsample_bytree,
            b_colsample_bytree + 0.05
        ],
    })
    clf.set_params(subsample=best_params['subsample'],
                   colsample_bytree=best_params['colsample_bytree'])

    # 4. reg_alpha and reg_lambda
    ## coarse
    best_params = search({
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 2],
        'reg_lambda': [1e-5, 1e-2, 0.1, 1, 2],
    })
    ## fine: search around the coarse optimum (the coarse grid used to be
    ## searched a second time here by mistake)
    b_alp = best_params['reg_alpha']
    b_lam = best_params['reg_lambda']
    best_params = search({
        'reg_alpha': [b_alp, 2 * b_alp, 3 * b_alp],
        'reg_lambda': [b_lam, 2 * b_lam, 3 * b_lam],
    })
    clf.set_params(reg_alpha=best_params['reg_alpha'],
                   reg_lambda=best_params['reg_lambda'])

    # 5. lower the learning rate and re-determine the number of estimators
    clf.set_params(learning_rate=0.01)
    nums, fscore = modelfit(clf,
                            x_train,
                            y_train,
                            cv_folds=5,
                            early_stopping_rounds=50,
                            feval=evalerror)
    clf.set_params(n_estimators=nums)

    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_predict)
    return y_predict, fscore
'min_child_weight':range(1,6,2) } grid_search(xgb1, param_test1) # max_depth: 3, min_child_weight: 5 param_test2 = { 'max_depth': [1,2,3,4], 'min_child_weight':range(4,10) } grid_search(xgb1, param_test2) # max_depth: 3, min_child_weight: 4 param_test2b = { 'min_child_weight':range(19, 31) } xgb1.set_params(max_depth = 2) gsearch2b = GridSearchCV(estimator = xgb1, param_grid = param_test2b, scoring='r2', n_jobs=-1, iid=False, cv=5) gsearch2b.fit(train[predictors],y) gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_ # max_depth = 2, min_child_weight = 27 param_test2c = { 'max_depth': [2,3,4,5], 'min_child_weight':range(4,31) } gsearch2c = GridSearchCV(estimator = xgb1,
def get_XgbRegressor(train_data,
                     train_target,
                     test_data,
                     feature_names,
                     parameters,
                     early_stopping_rounds,
                     num_folds,
                     eval_metric,
                     model_name='model',
                     stratified=False):
    '''
    Train an XGBRegressor with K-fold cross validation and export the results.

    Written to disk: the out-of-fold and averaged test predictions
    (``./sub/val_<model_name>.csv`` / ``./sub/test_<model_name>.csv``), the
    tree-leaf features of the validation and test rows
    (``./gbdt_newfeature/*.npy``) and the per-fold feature importances
    (through ``save_importances``).

    :param train_data: training features; must be a numpy array because the
        rows are selected by positional fold indices
    :param train_target: training targets, indexed the same way
    :param test_data: features to predict with every fold model
    :param feature_names: feature names, used for the importance table
    :param parameters: keyword parameters applied with ``set_params``
    :param early_stopping_rounds: forwarded to ``fit``
    :param num_folds: number of CV folds
    :param eval_metric: custom callable or built-in metric name
    :param model_name: suffix of the exported files
    :param stratified: use StratifiedKFold instead of KFold
    :return: ``(reg, sub_preds)`` - the estimator as fitted on the last fold
        and the test predictions averaged over the folds
    '''
    reg = XGBRegressor()
    reg.set_params(**parameters)

    # accumulators
    oof_preds = np.zeros((train_data.shape[0], ))
    sub_preds = np.zeros((test_data.shape[0], ))
    feature_importance_df = pd.DataFrame()
    cv_result = []

    # K-fold
    splitter = StratifiedKFold if stratified else KFold
    folds = splitter(n_splits=num_folds, shuffle=True, random_state=1234)

    # Leaf features of the validation rows, stacked fold after fold (i.e. in
    # fold order, not in the original row order).
    X_train_newfeature = None
    for n_flod, (train_index,
                 val_index) in enumerate(folds.split(train_data,
                                                     train_target)):
        train_X = train_data[train_index]
        val_X = train_data[val_index]
        train_Y = train_target[train_index]
        val_Y = train_target[val_index]

        # Early stopping watches the metric on the LAST entry of eval_set,
        # so the validation fold has to come last.
        watchlist = [(train_X, train_Y), (val_X, val_Y)]
        reg.fit(train_X,
                train_Y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=watchlist,
                eval_metric=eval_metric)

        ## build the GBDT leaf features of this fold
        new_feature = reg.apply(val_X)
        fold_feature = mergeToOne(val_X, new_feature)
        if X_train_newfeature is None:
            X_train_newfeature = fold_feature
        else:
            # append to what the earlier folds produced (previously the
            # accumulated rows were overwritten and the second operand was
            # merged with swapped arguments)
            X_train_newfeature = np.concatenate(
                (X_train_newfeature, fold_feature), axis=0)
        print(X_train_newfeature)

        # out-of-fold prediction for the validation rows
        val_pred = reg.predict(val_X)
        oof_preds[val_index] = val_pred
        # summed here, divided by the number of folds after the loop
        sub_preds += reg.predict(test_data)

        # the score is a mean absolute error, so label it as such
        result = mean_absolute_error(val_Y, val_pred)
        print('Fold %2d MAE : %.6f' % (n_flod + 1, result))
        cv_result.append(round(result, 5))
        gc.collect()

        # importance type is whatever the estimator was configured with
        gain = reg.feature_importances_
        fold_importance_df = pd.DataFrame({
            'feature': feature_names,
            'gain': 100 * gain / gain.sum(),
            'fold': n_flod,
        }).sort_values('gain', ascending=False)
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

    # export
    sub_preds = sub_preds / folds.n_splits
    new_feature = reg.apply(test_data)
    X_test_newfeature = mergeToOne(test_data, new_feature)

    os.makedirs('./sub', exist_ok=True)
    pd.DataFrame(oof_preds,
                 columns=['class']).to_csv('./sub/val_{}.csv'.format(model_name),
                                           index=False)
    pd.DataFrame(sub_preds,
                 columns=['class']).to_csv('./sub/test_{}.csv'.format(model_name),
                                           index=False)
    print('cv_result', cv_result)

    os.makedirs('./gbdt_newfeature', exist_ok=True)
    np.save("./gbdt_newfeature/train_newfeature.npy", X_train_newfeature)
    np.save("./gbdt_newfeature/test_newfeature.npy", X_test_newfeature)
    save_importances(feature_importance_df, model_name)
    return reg, sub_preds
def cross_validation(dtrain, ytrain, predictors):
    """Tune an XGBRegressor one parameter at a time and pickle the result.

    The parameters are grid-searched in the order learning_rate, max_depth,
    min_child_weight, gamma, colsample_bytree.  ``modelfit`` is called
    between the searches (the original note says it re-determines the number
    of rounds after each tuned parameter).  The tuned estimator is written
    to ``../models/autogridsearch_xgb_<timestamp>.model``.

    :param dtrain: DataFrame holding the training features
    :param ytrain: training targets
    :param predictors: names of the columns of ``dtrain`` to train on
    :return: the tuned XGBRegressor
    """
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
        learning_rate=0.5,
        max_depth=20,
        n_estimators=100,
        min_child_weight=1,
        gamma=0,
        objective='reg:linear',
        nthread=4,
    )

    def tune(param_grid):
        """Grid-search ``param_grid`` and store the winner on xgb_model."""
        gsearch = GridSearchCV(estimator=xgb_model,
                               param_grid=param_grid,
                               scoring='neg_mean_squared_error',
                               n_jobs=4,
                               iid=False,
                               cv=5)
        gsearch.fit(dtrain.values, ytrain)
        xgb_model.set_params(**gsearch.best_params_)
        return gsearch.best_params_

    modelfit(xgb_model, dtrain, ytrain)

    print('tunning learning rate...')
    print(tune({'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1]}))

    print('tunning max_depth...')
    print(xgb_model.get_params()['n_estimators'])
    print(tune({'max_depth': [3, 5, 7, 9]}))
    # choose best num_round
    modelfit(xgb_model, dtrain, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tunning min_child_weight...')
    tune({'min_child_weight': [1, 3, 5, 7]})
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tunning gamma...')
    tune({'gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]})
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    # colsample_bylevel is not tuned; the printout of that stage is kept
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bylevel:%f' %
          xgb_model.get_params()['n_estimators'])

    print('tunning colsample_bytree...')
    tune({'colsample_bytree': [0.6, 0.7, 0.8, 1]})
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bytree:%f' %
          xgb_model.get_params()['n_estimators'])

    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    model_path = '../models/autogridsearch_xgb_' + cur_time + '.model'
    # context manager: the file handle used to be left open
    with open(model_path, 'wb') as model_file:
        pickle.dump(xgb_model, model_file)
    cv_score(xgb_model, dtrain.values, ytrain)
    return xgb_model
reg_lambda=1, # [默认是1] 权重的L2正则化项 max_depth=10, # [默认是6] 树的最大深度,这个值也是用来避免过拟合的3-10 min_child_weight=1, # [默认是1]决定最小叶子节点样本权重和。当它的值较大时,可以避免模型学习到局部的特殊样本。但如果这个值过高,会导致欠拟合。 n_jobs=1 ) """ dtrain = xgb.DMatrix(X_train, y_train) xgb_params = clf.get_xgb_params() cvresult = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=2000, early_stopping_rounds=50) #clf_xgb = xgb.train(xgb_params, dtrain, num_boost_round=cvresult.shape[0]) #fscore = clf_xgb.get_fscore() #print(cvresult.shape[0], fscore) print(cvresult.shape[0]) """ clf.set_params(n_estimators=10) """ param_test1 = { 'max_depth': [i for i in range(3, 12, 2)], 'min_child_weight': [i for i in range(1, 10, 2)] } best_max_depth = 3 best_min_child_weight = 9 param_test2 = { 'max_depth': [i for i in range(3, 12, 2)], 'min_child_weight': [i for i in range(1, 10, 2)] } """ clf.set_params(max_depth=3,min_child_weight=9) """ param_test3 = {
def fit(self, inputs_train, labels_train, fit_options=None):
    """Tune an XGBRegressor in five grid-search stages and fit the winner.

    Every stage searches the candidate lists found in ``self.options`` for
    the parameters it tunes, while all other parameters stay pinned to a
    single value: the initial one, or the best value found by the earlier
    stage that tuned it.  The final regressor is stored in
    ``self.regressor`` and its importances in ``self.feature_importances_``.

    :param inputs_train: training features
    :param labels_train: training targets
    :param fit_options: accepted for interface compatibility, not used
    """
    options = self.options
    xgb_reg = XGBRegressor(random_state=options['seed'])

    # Single values used for a parameter until its own stage has tuned it;
    # updated with the best parameters after every stage.
    pinned = {
        "learning_rate": 0.1,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "gamma": 0,
    }
    # (progress message, parameters tuned in that stage), in execution order
    stages = [
        ('Starting with low learning rate and tuning: '
         'max_depth, min_child_weight, n_estimators',
         ("max_depth", "min_child_weight", "n_estimators")),
        ('Tuning: gamma', ("gamma", )),
        ('Tuning: colsample_bytree, subsample',
         ("colsample_bytree", "subsample")),
        ('Tuning: reg_alpha, reg_lambda', ("reg_alpha", "reg_lambda")),
        ('Tuning: learning_rate', ("learning_rate", )),
    ]

    for message, tuned in stages:
        print(message)
        params = {name: [value] for name, value in pinned.items()}
        params.update((name, options[name]) for name in tuned)
        GSCV = GridSearchCV(xgb_reg,
                            params,
                            cv=options['cv'],
                            scoring=options['scoring'],
                            n_jobs=options['n_jobs'],
                            verbose=options['verbose'],
                            return_train_score=True)
        GSCV.fit(inputs_train, labels_train)
        print('best_params_:', GSCV.best_params_)
        print('best_score_:', GSCV.best_score_)
        # best_params_ has one entry per grid key, so this both records the
        # tuned values and re-confirms the pinned ones
        pinned.update(GSCV.best_params_)

    print('Final model')
    regressor = XGBRegressor(random_state=options['seed'])
    regressor.set_params(**pinned)
    trained_regressor = regressor.fit(inputs_train, labels_train)
    self.regressor = trained_regressor
    self.feature_importances_ = self.regressor.feature_importances_
class XGBoost(BaseModel):
    """XGBoost regressor whose hyperparameters are tuned with hyperopt."""

    def __init__(self,
                 XGBoost_objective,
                 tuning_metric,
                 trials='trials',
                 bottom_coding=None,
                 transform=None,
                 **kwargs):
        """Store the settings, pick the trials backend and build the space.

        ``trials='trials'`` selects an in-memory ``Trials`` object; any other
        value selects ``MongoTrials`` on the hard-coded local URL.
        """
        super().__init__(bottom_coding=bottom_coding, transform=transform)
        # the estimator class (not an instance) until tune() has run
        self.model = XGBRegressor
        self.tuning_metric = tuning_metric
        self.objective = XGBoost_objective
        if trials == 'trials':
            self.trials = Trials()
        else:
            self.trials = MongoTrials('mongo://localhost:1234/foo_db/jobs',
                                      exp_key='exp1')
        self.set_parameters()

    def set_parameters(self):
        """Define the hyperopt search space in ``self.space``."""
        space = {}
        # searched parameters
        space['n_estimators'] = hp.choice('n_estimators',
                                          list(range(100, 5000, 900)))
        space['max_depth'] = hp.choice('max_depth', list(range(3, 10, 3)))
        space['min_child_weight'] = hp.choice('min_child_weight',
                                              list(range(1, 10, 4)))
        space['subsample'] = hp.choice(
            'subsample', [i / 100.0 for i in range(75, 100, 10)])
        space['gamma'] = hp.choice('gamma',
                                   [i / 10.0 for i in range(0, 5, 2)])
        space['colsample_bytree'] = hp.quniform('colsample_bytree', 0.75, 1,
                                                0.05)
        # fixed parameters
        space['objective'] = self.objective
        space['booster'] = 'dart'
        space['tree_method'] = 'gpu_exact'
        space['n_gpu'] = 1
        space['silent'] = 1
        space['learning_rate'] = 0.1
        space['scale_pos_weight'] = 1
        self.space = space

    def tune(self, training_set, logger=None, saver=None):
        """Search the space with TPE, then fit a model on the best point.

        ``logger`` and ``saver`` are accepted but not used here.
        """
        self.training_set = training_set
        objective = generate_objective(self.training_set, self.model)
        space = self.space
        best_point = fmin(fn=objective,
                          space=self.space,
                          trials=self.trials,
                          algo=tpe.suggest,
                          max_evals=self.max_evals)
        best = space_eval(space, best_point)
        print(f'Best hyperparams: {best}')

        self.model = XGBRegressor()
        self.model.set_params(**best)
        self.model.fit(training_set.X, training_set.y)

    def instantiate_model(self, params):
        """Return a new, unfitted XGBRegressor configured with ``params``."""
        regressor = XGBRegressor()
        regressor.set_params(**params)
        return regressor
def cross_validation(dtrain, ytrain, predictors):
    """Tune an ``XGBRegressor`` one hyper-parameter at a time and pickle it.

    The parameters are grid-searched in this order: ``learning_rate``,
    ``max_depth``, ``min_child_weight``, ``gamma`` and ``colsample_bytree``.
    Each winning value is written back onto the model before the next search.
    ``modelfit`` is called between stages; per the original author's note it
    re-determines the number of boosting rounds after a parameter is tuned
    (its implementation is outside this block - confirm).

    Args:
        dtrain: Training frame; it is indexed with ``predictors`` and its
            ``.values`` are fed to the grid searches.
        ytrain: Training targets.
        predictors: Column selection applied to ``dtrain``.

    Returns:
        The tuned ``XGBRegressor`` instance (also pickled under
        ``../models/`` with a timestamped file name).
    """
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
        learning_rate=0.5,
        max_depth=20,
        n_estimators=100,
        min_child_weight=1,
        gamma=0,
        objective='reg:linear',
        nthread=4,
    )

    def _search_and_apply(param_grid):
        """Grid-search ``param_grid``, apply the winner to the model, return it."""
        # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn
        # 0.24 - confirm the pinned version still accepts it.
        gsearch = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                               scoring='neg_mean_squared_error', n_jobs=4,
                               iid=False, cv=5)
        gsearch.fit(dtrain.values, ytrain)
        # best_params_ only holds the keys of param_grid, so this sets
        # exactly the parameter(s) that were searched.
        xgb_model.set_params(**gsearch.best_params_)
        return gsearch.best_params_

    modelfit(xgb_model, dtrain, ytrain)

    print('tunning learning rate...')
    best = _search_and_apply({'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1]})
    print(best)

    print('tunning max_depth...')
    print(xgb_model.get_params()['n_estimators'])
    best = _search_and_apply({'max_depth': [3, 5, 7, 9]})
    print(best)
    # Choose the best number of rounds for the new depth.
    modelfit(xgb_model, dtrain, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tunning min_child_weight...')
    _search_and_apply({'min_child_weight': [1, 3, 5, 7]})
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tunning gamma...')
    _search_and_apply({'gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]})
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    # The colsample_bylevel search is disabled; only the refit and the
    # report that used to follow it are still executed.
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bylevel:%f'
          % xgb_model.get_params()['n_estimators'])

    print('tunning colsample_bytree...')
    _search_and_apply({'colsample_bytree': [0.6, 0.7, 0.8, 1]})
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bytree:%f'
          % xgb_model.get_params()['n_estimators'])

    # Save and return the model. The context manager guarantees the file is
    # flushed and closed even if pickling fails part-way.
    cur_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    model_path = '../models/autogridsearch_xgb_' + cur_time + '.model'
    with open(model_path, 'wb') as model_file:
        pickle.dump(xgb_model, model_file)
    cv_score(xgb_model, dtrain.values, ytrain)
    return xgb_model
def train(x_train, y_train, x_valid, y_valid, n_estimators_0, objective,
          eval_metric, scoring, rmspe_xg, kfold, esr):
    """Tune an ``XGBRegressor`` stage by stage, then train the final booster.

    The stages alternate between ``xgb.cv`` runs that pick the number of
    boosting rounds and ``GridSearchCV`` runs that pick one or two
    hyper-parameters at a time (coarse grid first, then a finer grid around
    the coarse winner). Progress and timings are printed after every stage.

    Args:
        x_train: Training features (used for grid search and the DMatrix).
        y_train: Training targets.
        x_valid: Validation features; only used in the final watchlist.
        y_valid: Validation targets; only used in the final watchlist.
        n_estimators_0: Upper bound on boosting rounds for every ``xgb.cv``
            run and the initial ``n_estimators`` of the regressor.
        objective: Passed to ``XGBRegressor`` as ``objective``.
        eval_metric: Passed to ``XGBRegressor`` as ``eval_metric``.
        scoring: Passed to ``GridSearchCV`` as ``scoring``.
        rmspe_xg: Custom evaluation callable passed as ``feval`` to
            ``xgb.cv`` and ``xgb.train``.
        kfold: Passed as ``nfold`` to ``xgb.cv`` and ``cv`` to ``GridSearchCV``.
        esr: Divisor; early stopping rounds are ``int(n_estimators / esr)``.

    Returns:
        A ``(model_res, reg)`` tuple: the result of ``xgb.train`` and the
        tuned ``XGBRegressor`` holding the selected parameters.
    """

    def _grid_search(param_grid):
        """Grid-search ``param_grid`` on the current ``reg``.

        Returns ``(best_params, elapsed)``; ``reg`` itself is not modified.
        """
        # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn
        # 0.24 - confirm the pinned version still accepts it.
        search = GridSearchCV(estimator=reg, param_grid=param_grid,
                              scoring=scoring, n_jobs=-1, iid=False, cv=kfold)
        started = pd.Timestamp.now()
        fitted = search.fit(x_train, y_train)
        finished = pd.Timestamp.now()
        return fitted.best_params_, finished - started

    def _cv_round_count():
        """Run ``xgb.cv`` with ``reg``'s current params; return its row count."""
        cv_params = reg.get_xgb_params()
        # The custom `feval` is used instead of the built-in `metrics`.
        cv_result = xgb.cv(
            params=cv_params,
            dtrain=d_train,
            num_boost_round=cv_params["n_estimators"],
            nfold=kfold,
            feval=rmspe_xg,
            early_stopping_rounds=int(cv_params["n_estimators"] / esr),
            verbose_eval=None)
        return cv_result.shape[0]

    # 1 - initial parameter values
    print("1-设置参数初始值")
    reg = XGBRegressor(
        # General Parameters
        booster="gbtree",
        silent=1,
        nthread=-1,
        n_jobs=-1,
        # Booster Parameters
        learning_rate=0.1,
        n_estimators=n_estimators_0,
        gamma=0,
        max_depth=7,
        min_child_weight=0.001,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0,
        reg_lambda=1,
        max_delta_step=0,
        scale_pos_weight=1,
        # Learning Task Parameters
        objective=objective,
        eval_metric=eval_metric,
        seed=0)

    # 2 - best number of weak learners: n_estimators_1
    print("2-训练最优弱分类器个数:n_estimators_1")
    d_train = xgb.DMatrix(x_train, y_train)
    d_valid = xgb.DMatrix(x_valid, y_valid)
    watchlist = [(d_train, "train"), (d_valid, "valid")]
    t_begin = pd.Timestamp.now()
    n_estimators_1 = _cv_round_count()
    t1 = pd.Timestamp.now()
    reg.set_params(n_estimators=n_estimators_1)
    print("分类器个数:%s, 用时:%s" % (n_estimators_1, (t1 - t_begin)))

    # 3 - coarse search: learning_rate
    print("3-暴力搜索:learning_rate")
    best, elapsed = _grid_search({"learning_rate": [0.1, 0.2, 0.3]})
    best_param = best["learning_rate"]
    reg.set_params(learning_rate=best_param)
    print("learning_rate:%s, 用时:%s" % (best_param, elapsed))

    # 4 - coarse search: max_depth, min_child_weight
    print("4-暴力搜索:max_depth, min_child_weight")
    best, elapsed = _grid_search({
        "max_depth": [3, 5, 7, 9, 11],
        "min_child_weight": [0.001, 0.01, 0.1, 1]
    })
    best_param_1 = best["max_depth"]
    best_param_2 = best["min_child_weight"]
    # Apply the winning min_child_weight; no later stage refines it, so
    # without this the searched value would be discarded and the initial
    # 0.001 would be used for every following stage and the final model.
    reg.set_params(min_child_weight=best_param_2)
    print("max_depth:%s,min_child_weight:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 5 - fine search: max_depth
    print("5-精确搜索:max_depth")
    best, elapsed = _grid_search(
        {"max_depth": [best_param_1 - 1, best_param_1, best_param_1 + 1]})
    best_param_1 = best["max_depth"]
    reg.set_params(max_depth=best_param_1)
    print("max_depth:%s,用时:%s" % (best_param_1, elapsed))

    # 6 - coarse search: gamma (applied only after the fine search below)
    print("6-暴力搜索:gamma")
    best, elapsed = _grid_search({"gamma": [0, 0.5, 1, 1.5, 2, 2.5]})
    best_param = best["gamma"]
    print("gamma:%s,用时:%s" % (best_param, elapsed))

    # 7 - fine search: gamma
    print("7-精确搜索:gamma")
    if best_param == 0:
        # Cannot step below zero, so only look upwards.
        param = {"gamma": [0, 0.1, 0.2, 0.3, 0.4]}
    else:
        param = {"gamma": np.arange(best_param - 0.2, best_param + 0.3, 0.1)}
    best, elapsed = _grid_search(param)
    best_param = best["gamma"]
    reg.set_params(gamma=best_param)
    print("gamma:%s,用时:%s" % (best_param, elapsed))

    # 8 - re-tune the number of weak learners: n_estimators_2
    print("8-调整最优弱分类器个数:n_estimators_2")
    reg.set_params(n_estimators=n_estimators_0)
    t0 = pd.Timestamp.now()
    n_estimators_2 = _cv_round_count()
    t1 = pd.Timestamp.now()
    reg.set_params(n_estimators=n_estimators_2)
    print("分类器个数:%s, 用时:%s" % (n_estimators_2, (t1 - t0)))

    # 9 - coarse search: subsample, colsample_bytree
    print("9-暴力搜索:subsample, colsample_bytree")
    best, elapsed = _grid_search({
        "subsample": [0.6, 0.7, 0.8, 0.9],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9]
    })
    best_param_1 = best["subsample"]
    best_param_2 = best["colsample_bytree"]
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 10 - fine search: subsample, colsample_bytree
    print("10-精确搜索:subsample, colsample_bytree")
    best, elapsed = _grid_search({
        "subsample": [best_param_1 - 0.05, best_param_1, best_param_1 + 0.05],
        "colsample_bytree":
        [best_param_2 - 0.05, best_param_2, best_param_2 + 0.05]
    })
    best_param_1 = best["subsample"]
    best_param_2 = best["colsample_bytree"]
    reg.set_params(subsample=best_param_1, colsample_bytree=best_param_2)
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 11 - coarse search: reg_alpha
    print("11-暴力搜索:reg_alpha")
    best, elapsed = _grid_search({"reg_alpha": [0, 1, 2, 3]})
    best_param = best["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    print("reg_alpha:%s,用时:%s" % (best_param, elapsed))

    # 12 - fine search: reg_alpha
    print("12-精确搜索:reg_alpha")
    if best_param == 0:
        param = {"reg_alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}
    else:
        param = {
            "reg_alpha": np.arange(best_param - 0.5, best_param + 0.5, 0.2)
        }
    best, elapsed = _grid_search(param)
    best_param = best["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    print("reg_alpha:%s,用时:%s" % (best_param, elapsed))

    # 13 - coarse search: reg_lambda
    print("13-暴力搜索:reg_lambda")
    best, elapsed = _grid_search({"reg_lambda": [1, 3, 5, 7]})
    best_param = best["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    print("reg_lambda:%s,用时:%s" % (best_param, elapsed))

    # 14 - fine search: reg_lambda
    print("14-精确搜索:reg_lambda")
    best, elapsed = _grid_search(
        {"reg_lambda": np.arange(best_param - 1, best_param + 1, 0.2)})
    best_param = best["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    print("reg_lambda:%s,用时:%s" % (best_param, elapsed))

    # 15 - search: max_delta_step, scale_pos_weight
    print("15-精确搜索:max_delta_step, scale_pos_weight")
    best, elapsed = _grid_search({
        "max_delta_step": [0, 1, 3, 5],
        "scale_pos_weight": [1, 3, 5, 7]
    })
    best_param_1 = best["max_delta_step"]
    best_param_2 = best["scale_pos_weight"]
    reg.set_params(max_delta_step=best_param_1, scale_pos_weight=best_param_2)
    print("max_delta_step:%s,scale_pos_weight:%s,用时:%s" %
          (best_param_1, best_param_2, elapsed))

    # 16 - re-tune the number of weak learners: n_estimators_3
    print("16-调整最优弱分类器个数:n_estimators_3")
    reg.set_params(n_estimators=n_estimators_0)
    t0 = pd.Timestamp.now()
    n_estimators_3 = _cv_round_count()
    t1 = pd.Timestamp.now()
    reg.set_params(n_estimators=n_estimators_3)
    print("分类器个数:%s, 用时:%s" % (n_estimators_3, (t1 - t0)))

    # 17 - fine search: learning_rate, centred on the current value
    print("17-精确搜索:learning_rate")
    lr = reg.get_xgb_params()["learning_rate"]
    best, elapsed = _grid_search({"learning_rate": [lr - 0.05, lr, lr + 0.05]})
    best_param = best["learning_rate"]
    reg.set_params(learning_rate=best_param)
    print("learning_rate:%s,用时:%s" % (best_param, elapsed))

    # 18 - final training with every tuned parameter
    print("18-终极训练")
    xgb_param = reg.get_xgb_params()
    model_res = xgb.train(params=xgb_param,
                          dtrain=d_train,
                          num_boost_round=xgb_param["n_estimators"],
                          evals=watchlist,
                          feval=rmspe_xg,
                          early_stopping_rounds=int(xgb_param["n_estimators"] /
                                                    esr))
    t_end = pd.Timestamp.now()
    print("参数训练完毕,总用时:%s" % (t_end - t_begin))
    return model_res, reg
'max_depth': [4, 5, 6], # initial best is 5, check around 5 'min_child_weight': range(2, 5, 1) # initial best is 3, check 2,3,4 } gs2 = GridSearchCV(xgb1, param_grid=param_test2, scoring='neg_mean_squared_error', n_jobs=-1, iid=False, cv=outer_cv) gs2.fit(X, y) print_grid_scores(gs2) print('Best parameters: %r' % gs2.best_params_) print('Best mean test RMSE: %.5f' % (np.sqrt(-gs2.best_score_))) xgb1.set_params(max_depth=5, min_child_weight=3) # step 3: tune gamma #param_test3 = { # 'gamma':[i/10.0 for i in range(0,5)] #} param_test3 = {'gamma': [i / 100.0 for i in range(0, 10)]} #param_test3 = { # 'gamma':range(6) #} gs3 = GridSearchCV(xgb1, param_grid=param_test3, scoring='neg_mean_squared_error',
colsample_bytree=0.8, objective='reg:gamma', nthread=4, scale_pos_weight=1, seed=1024) #####parameter 1max_depth xgb_param = xgb1.get_xgb_params() cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='rmse', early_stopping_rounds=50) xgb1.set_params(n_estimators=cvresult.shape[0]) param_test1 = { 'max_depth': [3, 4, 5, 6, 7], 'min_child_weight': [3, 4, 5, 6, 7] } gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=1000, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:gamma', nthread=4, scale_pos_weight=1, seed=27), param_grid=param_test1,
reg_lambda=1, # [默认是1] 权重的L2正则化项 max_depth=10, # [默认是6] 树的最大深度,这个值也是用来避免过拟合的3-10 min_child_weight= 1, # [默认是1]决定最小叶子节点样本权重和。当它的值较大时,可以避免模型学习到局部的特殊样本。但如果这个值过高,会导致欠拟合。 n_jobs=1) """ dtrain = xgb.DMatrix(X_train, y_train) xgb_params = clf.get_xgb_params() cvresult = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=2000, early_stopping_rounds=50) #clf_xgb = xgb.train(xgb_params, dtrain, num_boost_round=cvresult.shape[0]) #fscore = clf_xgb.get_fscore() #print(cvresult.shape[0], fscore) print(cvresult.shape[0]) """ clf.set_params(n_estimators=4) """ param_test1 = { 'max_depth': [i for i in range(3, 17, 2)], 'min_child_weight': [i for i in range(1, 10, 2)] } best_max_depth = 13 best_min_child_weight = 1 param_test2 = { 'max_depth': [best_max_depth-1,best_max_depth,best_max_depth+1], 'min_child_weight': [best_min_child_weight,best_min_child_weight+1] } """ clf.set_params(max_depth=13, min_child_weight=1) """