max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:gamma', nthread=4, scale_pos_weight=1, seed=1024) #####parameter 1max_depth xgb_param = xgb1.get_xgb_params() cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='rmse', early_stopping_rounds=50) xgb1.set_params(n_estimators=cvresult.shape[0]) param_test1 = { 'max_depth': [3, 4, 5, 6, 7], 'min_child_weight': [3, 4, 5, 6, 7] } gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=1000, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:gamma',
# Train a booster through the native xgboost API using the hyper-parameters of
# a scikit-learn style regressor, then plot the per-round train/test RMSE.
d_train = xgb.DMatrix(x_train, y_train)
d_test = xgb.DMatrix(x_test, y_test)
# Variable name kept as-is (sic) in case code outside this snippet refers to it.
# Early stopping is evaluated on the last entry of this list (the test set).
wathchlist = [(d_train, "train"), (d_test, "test")]
reg = XGBRegressor(learning_rate=0.3,
                   n_estimators=100,
                   objective="reg:linear",
                   eval_metric="rmse",
                   min_child_weight=1,
                   subsample=0.8,
                   colsample_bytree=0.8,
                   reg_alpha=0,
                   reg_lambda=1,
                   n_jobs=-1,
                   nthread=-1,
                   seed=3)
# get_xgb_params() yields the booster-level parameters; get_params() also
# carries wrapper-only keys that xgb.train does not use.
params = reg.get_xgb_params()
evals_res = {}
# xgb.train ignores `n_estimators` and defaults to 10 boosting rounds, so the
# requested number of rounds has to be passed explicitly.
model_sklearn = xgb.train(params=params,
                          dtrain=d_train,
                          num_boost_round=reg.get_params()["n_estimators"],
                          evals=wathchlist,
                          evals_result=evals_res,
                          early_stopping_rounds=10,
                          verbose_eval=True)
y_pred = model_sklearn.predict(d_test)
# One row per boosting round, one column per evaluation set.
df_evals = pd.DataFrame({
    "loss_train": evals_res["train"]["rmse"],
    "loss_test": evals_res["test"]["rmse"],
})
df_evals.plot()
model.loss_reg(y_test, y_pred)
# Tune the model, fit it on the training data, predict the test data and write
# a numbered submission file plus a matching parameter log.
opt.optimize_n_estimators(model, train, predictors, labels)
param_test = {'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 6, 2)}
opt.optimize_params(model, train, predictors, param_test)
model.fit(train[predictors], labels)
res = model.predict(test[predictors])
print(res)

# Bump the persistent run counter; `with` guarantees the handles are closed
# even if parsing or writing fails.
with open("./run_num.txt", mode='r') as f:
    run_num = int(f.read())
run_num += 1
with open("./run_num.txt", mode='w') as f:
    f.write(str(run_num))

# Ids are consecutive starting at 1461 (presumably the first test-set Id --
# TODO confirm against the data).
ids = range(1461, (1461 + len(res)))
result_df = pd.DataFrame({"Id": ids, "SalePrice": res})
result_df.to_csv("./submission_" + str(run_num) + ".csv", index=False)

# Record the settings that produced this submission.
with open("./params_" + str(run_num) + ".txt", mode='w') as f:
    f.write(str(model.get_params()))
    f.write("\n")
    f.write("tresh_deviation: " + str(_tresh_deviation) + "\n")
    f.write("tresh_importance: " + str(_tresh_importance) + "\n")
    f.write("tresh_similarity: " + str(_tresh_similarity) + "\n")
def cross_validation(dtrain,ytrain,predictors):
    """Tune an XGBRegressor one hyper-parameter at a time and return it.

    Parameters are searched in this order: learning_rate, max_depth,
    min_child_weight, gamma, colsample_bytree. Each search is a 5-fold
    GridSearchCV scored with negative mean squared error, and the best value
    is written back into the model before the next search starts.

    dtrain     -- DataFrame holding the training features
    ytrain     -- training targets
    predictors -- column names of ``dtrain`` to train on

    The tuned model is pickled under ``../models/`` with a timestamped name,
    passed to ``cv_score`` and returned. ``modelfit`` and ``cv_score`` are
    defined elsewhere in the file.
    """
    # After each parameter is tuned, the number of boosting rounds is
    # re-determined (via modelfit).
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
        learning_rate=0.5,
        max_depth=20,
        n_estimators=100,
        min_child_weight=1,
        gamma=0,
        objective='reg:linear',
        nthread=4,
    )

    def tune(name, candidates):
        """Grid-search one parameter, apply the best value, return best_params_."""
        gsearch = GridSearchCV(estimator=xgb_model, param_grid={name: candidates},
                               scoring='neg_mean_squared_error', n_jobs=4,
                               iid=False, cv=5)
        gsearch.fit(dtrain.values, ytrain)
        xgb_model.set_params(**{name: gsearch.best_params_[name]})
        return gsearch.best_params_

    modelfit(xgb_model, dtrain, ytrain)

    print('tunning learning rate...')
    print(tune('learning_rate', [0.01, 0.015, 0.025, 0.05, 0.1]))

    print('tunning max_depth...')
    print(xgb_model.get_params()['n_estimators'])
    print(tune('max_depth', [3, 5, 7, 9]))
    # choose best num_round
    modelfit(xgb_model, dtrain, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tunning min_child_weight...')
    tune('min_child_weight', [1, 3, 5, 7])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tunning gamma...')
    tune('gamma', [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    # The colsample_bylevel search ([0.6, 0.8, 1]) is disabled; the refit and
    # the report that followed it are still executed.
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bylevel:%f'%xgb_model.get_params()['n_estimators'])

    print('tunning colsample_bytree...')
    tune('colsample_bytree', [0.6, 0.7, 0.8, 1])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bytree:%f'%xgb_model.get_params()['n_estimators'])

    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    # Context manager so the pickle file is flushed and closed deterministically.
    with open('../models/autogridsearch_xgb_'+cur_time+'.model', 'wb') as model_file:
        pickle.dump(xgb_model, model_file)
    cv_score(xgb_model, dtrain.values, ytrain)
    return xgb_model
class Xgb:
    """Preprocess a DataFrame and train an XGBoost model on it.

    The number of boosting rounds is chosen with ``xgb.cv`` before the
    scikit-learn style estimator is fitted on the full frame.
    """

    def __init__(self, df, target_column='', id_column='', target_type='binary',
                 categorical_columns=None, drop_columns=None, numeric_columns=None,
                 num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): upper bound on boosting rounds
        - verbose (bool): verbosity of printouts
        - early_stopping_rounds (int or None): forwarded to xgb.cv

        Invalid input is reported with a printed message; the instance is
        then left only partially initialised.
        """
        if not isinstance(df, pd.DataFrame):
            print('please provide pandas dataframe')
            return
        self.df = df
        self.early_stopping_rounds = early_stopping_rounds
        if not target_column:
            print('please provide target column name')
            return

        self.target_column = target_column
        self.id_column = id_column
        self.target_type = target_type
        # Copies, so neither a shared default nor the caller's list is aliased.
        self.categorical_columns = list(categorical_columns or [])
        self.numeric_columns = list(numeric_columns or [])
        self.drop_columns = list(drop_columns or [])
        self.verbose = verbose
        self.num_training_rounds = num_training_rounds

        # init the classifier
        if self.target_type == 'binary':
            self.scoring = 'auc'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='binary:logistic',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'linear':
            self.scoring = 'rmse'
            self.clf = XGBRegressor(
                n_estimators=num_training_rounds,
                objective='reg:linear')

    def _run_cv(self, xgb_param, xgtrain):
        """Run 5-fold xgb.cv, trying both spellings of the verbosity keyword.

        Only TypeError (unknown keyword argument) triggers the next attempt;
        any other failure is raised immediately.
        """
        common = dict(num_boost_round=self.clf.get_params()['n_estimators'],
                      nfold=5,
                      metrics=[self.scoring],
                      early_stopping_rounds=self.early_stopping_rounds)
        for verbosity in ({'show_progress': self.verbose},
                          {'verbose_eval': self.verbose}):
            try:
                return xgb.cv(xgb_param, xgtrain, **dict(common, **verbosity))
            except TypeError:
                continue
        return xgb.cv(xgb_param, xgtrain, **common)

    def train(self):
        """Preprocess ``self.df``, pick the round count by CV, fit and report."""
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)
        print('#### training ####')
        self.predictors = [x for x in self.df.columns
                           if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()
        features = self.df[self.predictors]
        labels = self.df[self.target_column]
        xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)
        cvresult = self._run_cv(xgb_param, xgtrain)
        # One row per boosting round that the cross-validation kept.
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(features, labels, eval_metric=self.scoring)

        # Predict training set:
        train_df_predictions = self.clf.predict(features)
        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(features)[:, 1]
            print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, train_df_predprob))
        elif self.target_type == 'linear':
            mse = metrics.mean_squared_error(labels.values, train_df_predictions)
            print("Mean squared error: %f" % mse)
            print("Root mean squared error: %f" % np.sqrt(mse))

    def predict(self, test_df):
        """Preprocess ``test_df`` like the training data and predict on it.

        Returns positive-class probabilities for 'binary' and predicted
        values for 'linear'. The result is also stored in ``self.output``
        and the processed frame in ``self.test_df``.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        has_id = self.id_column in test_df
        has_target = self.target_column in test_df.columns
        if has_id:
            ids = test_df[self.id_column]
        if has_target:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        # Preprocessing may have dropped these columns; put them back.
        if has_id:
            self.test_df[self.id_column] = ids
        if has_target:
            self.test_df[self.target_column] = targets
        # Predictors that do not exist in the test frame are filled with NaN.
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan
        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:, 1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` highest f-scores."""
        # `booster()` was renamed to `get_booster()` in later xgboost releases.
        if hasattr(self.clf, 'get_booster'):
            booster = self.clf.get_booster()
        else:
            booster = self.clf.booster()
        feature_importance = sorted(booster.get_fscore().items(),
                                    key=operator.itemgetter(1), reverse=True)
        # Naming the columns here also works when no feature has a score.
        impt = pd.DataFrame(feature_importance, columns=['feature', 'importance'])
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh",
                                  color=sns.color_palette("deep", 3))

    @staticmethod
    def _is_numeric_or_bool(series):
        """Return True for any integer, float or boolean dtype."""
        return (pd.api.types.is_numeric_dtype(series.dtype)
                or pd.api.types.is_bool_dtype(series.dtype))

    def preprocess(self, df, train=True):
        """One-hot encode, drop uninformative columns and coerce dtypes.

        With ``train=True`` the columns to remove (more than 60% missing, or
        zero standard deviation) are determined and remembered in
        ``self.cols_to_remove``; with ``train=False`` the remembered list is
        applied. Columns that are not numeric or boolean afterwards are
        dropped. Returns the processed DataFrame.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(df[col]).rename(columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1).drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                # arbitrary cutoff, if more than 60% missing then drop
                if df[col].isnull().mean() > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                # The target must survive even when it happens to be constant.
                if col == self.target_column:
                    continue
                dtype = df[col].dtype
                if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
                    continue
                if df[col].std() == 0:
                    print('will drop', col)
                    if col not in self.cols_to_remove:
                        self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # A test frame may lack some of the training columns; drop what exists.
        df = df.drop([c for c in self.cols_to_remove if c in df.columns], axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col in self.cols_to_remove or col not in df.columns:
                continue
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)

        # drop those marked for dropping (skipping any that are already gone)
        df = df.drop([c for c in self.drop_columns if c in df.columns], axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if self._is_numeric_or_bool(df[col]):
                continue
            if self.verbose:
                print('dropping because not int, float, or bool:', col)
            df = df.drop([col], axis=1)

        return df

    def _to_int(self, num):
        """Return ``int(num)``, or None when the value cannot be converted."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the value cannot be converted."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        headers = [self.id_column, self.target_column]
        if include_actual:
            headers.append('actual')
        # Positional access: row i of the output belongs to row i of test_df,
        # whatever the frame's index labels are.
        ids = self.test_df[self.id_column].values
        actuals = self.test_df[self.target_column].values if include_actual else None
        # csv.writer needs a text-mode file opened with newline=''.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                to_write = [ids[idx], value]
                if include_actual:
                    to_write.append(actuals[idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        """Serialise this whole object to ``filename`` with joblib."""
        joblib.dump(self, filename)
tree_method= 'exact', learning_rate=0.1, n_estimators=100, nthread=4, scale_pos_weight=1, reg_alpha=0.001, seed=27), param_grid = param_test1, scoring=None,n_jobs=4,iid=False, cv=5) gsearch1.fit(train_X,train_Y) print(gsearch1.best_params_, gsearch1.best_score_) #fit model all_set = xgb.DMatrix(feature_df,label=label_df) XGBmodel=xgb.train(XGBmodel.get_params(),all_set,num_boost_round=10000) #generate result test_data = pd.read_csv("test.csv") test_drID = test_data.drop(["id"],axis=1) test_drID["penalty"] = le.transform(test_drID["penalty"]) test_drID = test_drID.fillna(0) test_set = min_max_scaler.transform(test_drID) test_set = varSel.transform(test_set) test_set = poly.fit_transform(test_set) test_set = xgb.DMatrix(test_set) xgbResult = XGBmodel.predict(test_set) for i in range(0,len(xgbResult)): if xgbResult[i]<0: xgbResult[i]=0.2 xgbResult = xgbResult * 1.25
def cross_validation(dtrain, ytrain, predictors):
    """Tune an XGBRegressor parameter by parameter and return the tuned model.

    Search order: learning_rate, max_depth, min_child_weight, gamma,
    colsample_bytree. Every search is a 5-fold GridSearchCV scored with
    negative mean squared error; the winning value is applied to the model
    before the next search.

    dtrain     -- DataFrame holding the training features
    ytrain     -- training targets
    predictors -- column names of ``dtrain`` to train on

    The result is pickled to a timestamped file under ``../models/``, handed
    to ``cv_score`` and returned. ``modelfit`` and ``cv_score`` come from
    elsewhere in the file.
    """
    # After each parameter is tuned, the number of boosting rounds is
    # re-determined (via modelfit).
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
        learning_rate=0.5,
        max_depth=20,
        n_estimators=100,
        min_child_weight=1,
        gamma=0,
        objective='reg:linear',
        nthread=4,
    )

    def search_and_apply(param_name, values):
        """Grid-search ``param_name``, set the best value, return best_params_."""
        gsearch = GridSearchCV(estimator=xgb_model,
                               param_grid={param_name: values},
                               scoring='neg_mean_squared_error',
                               n_jobs=4,
                               iid=False,
                               cv=5)
        gsearch.fit(dtrain.values, ytrain)
        xgb_model.set_params(**{param_name: gsearch.best_params_[param_name]})
        return gsearch.best_params_

    def current_rounds():
        """Return the model's current n_estimators."""
        return xgb_model.get_params()['n_estimators']

    modelfit(xgb_model, dtrain, ytrain)

    print('tunning learning rate...')
    print(search_and_apply('learning_rate', [0.01, 0.015, 0.025, 0.05, 0.1]))

    print('tunning max_depth...')
    print(current_rounds())
    print(search_and_apply('max_depth', [3, 5, 7, 9]))
    # choose best num_round
    modelfit(xgb_model, dtrain, ytrain)
    print(current_rounds())

    print('tunning min_child_weight...')
    search_and_apply('min_child_weight', [1, 3, 5, 7])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(current_rounds())

    print('tunning gamma...')
    search_and_apply('gamma', [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(current_rounds())

    # The colsample_bylevel search ([0.6, 0.8, 1]) is disabled; the refit and
    # the report that followed it are still executed.
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bylevel:%f' % current_rounds())

    print('tunning colsample_bytree...')
    search_and_apply('colsample_bytree', [0.6, 0.7, 0.8, 1])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tunning colsample_bytree:%f' % current_rounds())

    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    model_path = '../models/autogridsearch_xgb_' + cur_time + '.model'
    # Context manager so the pickle file is flushed and closed deterministically.
    with open(model_path, 'wb') as model_file:
        pickle.dump(xgb_model, model_file)
    cv_score(xgb_model, dtrain.values, ytrain)
    return xgb_model
class Xgb:
    """Preprocess a DataFrame and train one or several XGBoost models on it.

    With ``sample_fraction < 1`` the training frame is split into random,
    non-overlapping slices and one model is trained per slice; every trained
    state is saved to ``<prefix>_<idx>.pkl`` and the predictions of the saved
    models are averaged.
    """

    def __init__(self, df, target_column='', id_column='', target_type='binary',
                 categorical_columns=None, drop_columns=None, numeric_columns=None,
                 num_training_rounds=500, verbose=1, sample_fraction=1.0,
                 n_samples=1, early_stopping_rounds=None, prefix='xgb_model',
                 scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear', 'binary' or 'multiclass'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): upper bound on boosting rounds
        - verbose (bool): verbosity of printouts
        - sample_fraction (float): fraction of rows per training sample
        - n_samples (int): number of samples (models) to train
        - early_stopping_rounds (int or None): forwarded to xgb.cv
        - prefix (string): file name prefix of the saved models
        - scoring (string or None): metric overriding the per-type default

        Invalid input is reported with a printed message; the instance is
        then left only partially initialised.
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        if sample_fraction > 1:
            sample_fraction = 1.0
        # Disjoint slices cannot cover more than the whole frame.
        if sample_fraction * n_samples > 1:
            n_samples = int(round(1.0 / sample_fraction))

        # check if data is dataframe (before its length is used)
        if not isinstance(df, pd.DataFrame):
            print('please provide pandas dataframe')
            return
        # if sample_fraction results in a sample smaller than one row
        if len(df) > 0 and round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0 / len(df)

        self.df = df
        self.early_stopping_rounds = early_stopping_rounds
        if not target_column:
            print('please provide target column name')
            return

        self.target_column = target_column
        self.id_column = id_column
        self.target_type = target_type
        # Copies, so neither a shared default nor the caller's list is aliased.
        self.categorical_columns = list(categorical_columns or [])
        self.numeric_columns = list(numeric_columns or [])
        self.drop_columns = list(drop_columns or [])
        self.verbose = verbose
        self.sample_fraction = sample_fraction
        self.n_samples = n_samples
        self.num_training_rounds = num_training_rounds
        self.prefix = prefix

        # init the classifier:
        if self.target_type == 'binary':
            self.scoring = 'auc'
            self.clf = XGBClassifier(learning_rate=0.1,
                                     n_estimators=num_training_rounds,
                                     subsample=0.8,
                                     colsample_bytree=0.8,
                                     objective='binary:logistic',
                                     scale_pos_weight=1,
                                     seed=123)
        elif self.target_type == 'multiclass':
            self.scoring = 'merror'
            self.clf = XGBClassifier(learning_rate=0.1,
                                     n_estimators=num_training_rounds,
                                     subsample=0.8,
                                     colsample_bytree=0.8,
                                     objective='multi:softmax',
                                     scale_pos_weight=1,
                                     seed=123)
        elif self.target_type == 'linear':
            self.scoring = 'rmse'
            self.clf = XGBRegressor(n_estimators=num_training_rounds,
                                    objective='reg:linear')
        # if preferred scoring metric is stated:
        if scoring:
            self.scoring = scoring

    def _run_cv(self, xgb_param, xgtrain, labels):
        """Run 5-fold xgb.cv, trying both spellings of the verbosity keyword.

        ``num_class`` is supplied up front for 'multiclass'. Only TypeError
        (unknown keyword argument) triggers the next attempt. The round cap
        is always ``num_training_rounds`` so that one sample's early stop
        does not limit the samples trained after it.
        """
        params = dict(xgb_param)
        if self.target_type == 'multiclass':
            params['num_class'] = len(labels.unique())
        common = dict(num_boost_round=self.num_training_rounds,
                      nfold=5,
                      metrics=[self.scoring],
                      early_stopping_rounds=self.early_stopping_rounds)
        for verbosity in ({'show_progress': self.verbose},
                          {'verbose_eval': self.verbose}):
            try:
                return xgb.cv(params, xgtrain, **dict(common, **verbosity))
            except TypeError:
                continue
        return xgb.cv(params, xgtrain, **common)

    def train(self):
        """Preprocess ``self.df`` and train (and save) one model per sample."""
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)
        print('#### training ####')
        self.predictors = [
            x for x in self.df.columns
            if x not in [self.target_column, self.id_column]
        ]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
            if self.verbose:
                print(df_list)

        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            labels = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)
            cvresult = self._run_cv(xgb_param, xgtrain, labels)
            # One row per boosting round that the cross-validation kept.
            self.clf.set_params(n_estimators=cvresult.shape[0])
            print('fitting model')
            self.clf.fit(features, labels, eval_metric=self.scoring)

            # Predict training set:
            train_df_predictions = self.clf.predict(features)
            if self.target_type in ('binary', 'multiclass'):
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    labels.values, train_df_predictions))
                if self.target_type == 'binary':
                    train_df_predprob = self.clf.predict_proba(features)[:, 1]
                    print("AUC Score (Train): %f" %
                          metrics.roc_auc_score(labels, train_df_predprob))
            elif self.target_type == 'linear':
                mse = metrics.mean_squared_error(labels.values,
                                                 train_df_predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))

            # Persist the state after this sample so predict() can reload it.
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def _predict_with(self, clf):
        """Predict ``self.test_df`` with ``clf`` according to the target type."""
        features = self.test_df[self.predictors]
        if self.target_type == 'binary':
            return clf.predict_proba(features)[:, 1]
        if self.target_type == 'linear':
            return clf.predict(features)
        # NOTE(review): other target types (e.g. 'multiclass') produce no
        # prediction here -- confirm whether that is intended.
        return None

    def predict(self,
                test_df,
                return_multi_outputs=False,
                return_mean_std=False):
        """Preprocess ``test_df`` like the training data and predict on it.

        With ``n_samples == 1`` the in-memory model is used; otherwise each
        saved ``<prefix>_<idx>.pkl`` is loaded (missing files are skipped)
        and the per-model outputs are averaged.

        Returns ``self.multi_outputs`` (list of per-model outputs) when
        ``return_multi_outputs`` is set, ``(mean, std)`` across the models
        when ``return_mean_std`` is set, and the (mean) output otherwise.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        has_id = self.id_column in test_df
        has_target = self.target_column in test_df.columns
        if has_id:
            ids = test_df[self.id_column]
        if has_target:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        # Preprocessing may have dropped these columns; put them back.
        if has_id:
            self.test_df[self.id_column] = ids
        if has_target:
            self.test_df[self.target_column] = targets
        # Predictors that do not exist in the test frame are filled with NaN.
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx in range(self.n_samples):
            if self.n_samples == 1:
                output = self._predict_with(self.clf)
                continue
            filename = self.prefix + '_' + str(idx) + '.pkl'
            try:
                xgb_load = self.load(filename)
            except IOError:
                print('no file found, skipping')
                continue
            output = self._predict_with(xgb_load.clf)
            output_list.append(list(output))

        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            self.multi_outputs = None if output is None else [list(output)]
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list

        if return_multi_outputs:
            return self.multi_outputs
        if return_mean_std:
            # Spread across the outputs actually produced; a single model
            # therefore yields zeros.
            spread_source = self.multi_outputs if self.multi_outputs else output_list
            return (self.output, np.std(spread_source, axis=0))
        return self.output

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` highest f-scores."""
        # `booster()` was renamed to `get_booster()` in later xgboost releases.
        if hasattr(self.clf, 'get_booster'):
            booster = self.clf.get_booster()
        else:
            booster = self.clf.booster()
        feature_importance = sorted(booster.get_fscore().items(),
                                    key=operator.itemgetter(1),
                                    reverse=True)
        # Naming the columns here also works when no feature has a score.
        impt = pd.DataFrame(feature_importance,
                            columns=['feature', 'importance'])
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature",
                                  "importance",
                                  kind="barh",
                                  color=sns.color_palette("deep", 3))

    @staticmethod
    def _is_numeric_or_bool(series):
        """Return True for any integer, float or boolean dtype."""
        return (pd.api.types.is_numeric_dtype(series.dtype)
                or pd.api.types.is_bool_dtype(series.dtype))

    def preprocess(self, df, train=True):
        """One-hot encode, drop uninformative columns and coerce dtypes.

        With ``train=True`` the columns to remove (more than 60% missing, or
        zero standard deviation) are determined and remembered in
        ``self.cols_to_remove``; with ``train=False`` the remembered list is
        applied. Columns that are not numeric or boolean afterwards are
        dropped. Returns the processed DataFrame.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(
                df[col]).rename(columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1).drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                # arbitrary cutoff, if more than 60% missing then drop
                if df[col].isnull().mean() > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                # Compare by value: `is not` on strings tests identity and
                # does not reliably protect the target column.
                if col == self.target_column:
                    continue
                dtype = df[col].dtype
                if (not pd.api.types.is_numeric_dtype(dtype)
                        or pd.api.types.is_bool_dtype(dtype)):
                    continue
                if df[col].std() == 0:
                    print('will drop', col)
                    if col not in self.cols_to_remove:
                        self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # A test frame may lack some of the training columns; drop what exists.
        df = df.drop([c for c in self.cols_to_remove if c in df.columns],
                     axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            # Columns removed above (or absent from this frame) are skipped.
            if col not in df.columns:
                continue
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)

        # drop those marked for dropping (skipping any that are already gone)
        df = df.drop([c for c in self.drop_columns if c in df.columns],
                     axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if self._is_numeric_or_bool(df[col]):
                continue
            if self.verbose:
                print('dropping because not int, float, or bool:', col)
            df = df.drop([col], axis=1)

        return df

    def _to_int(self, num):
        """Return ``int(num)``, or None when the value cannot be converted."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when the value cannot be converted."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)

        The samples are disjoint row slices of ``df``; the last slice may be
        shorter. At most as many samples as there are slices are returned.
        """
        print('constructing random samples')
        num_rows = len(df)
        # At least one row per slice, otherwise the split would never finish.
        len_sample = max(1, int(round(fraction * num_rows)))

        # create list of slice index lists
        indices = list(range(num_rows))
        if self.verbose:
            print('INDICES', indices)
        random.shuffle(indices)
        slice_list = [
            indices[start:start + len_sample]
            for start in range(0, num_rows, len_sample)
        ]

        # get slices
        if n_samples is None:
            n_samples = len(slice_list)
        # The index lists hold row positions, hence iloc rather than loc.
        return [df.iloc[rows, :] for rows in slice_list[:n_samples]]

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them

        A failure while writing the data rows is reported with a printed
        message instead of an exception.
        """
        headers = [self.id_column, self.target_column]
        if include_actual:
            headers.append('actual')
        # csv.writer needs a text-mode file opened with newline=''.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)
            try:
                # Positional access: row i of the output belongs to row i of
                # test_df, whatever the frame's index labels are.
                ids = self.test_df[self.id_column].values
                actuals = (self.test_df[self.target_column].values
                           if include_actual else None)
                for idx, value in enumerate(self.output):
                    to_write = [ids[idx], value]
                    if include_actual:
                        to_write.append(actuals[idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except Exception as err:
                print('write_csv failed')
                print(repr(err))

    def save(self, filename='xgb.pkl'):
        """Serialise this whole object to ``filename`` with joblib."""
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        """Return the object stored in ``model_file``.

        NOTE: joblib.load unpickles the file, which can execute arbitrary
        code -- only load files from a trusted source.
        """
        loaded = joblib.load(model_file)
        return loaded
class Xgb:
    """Convenience wrapper around XGBoost's scikit-learn estimators.

    The wrapper preprocesses a pandas DataFrame (one-hot encoding, removal
    of sparse / constant / non-numeric columns), picks the number of
    boosting rounds with ``xgb.cv``, fits the estimator and optionally
    trains one model per random sub-sample of the data.

    Attributes set by ``__init__`` (only when the inputs are valid):
        df, target_column, id_column, target_type, categorical_columns,
        numeric_columns, drop_columns, verbose, sample_fraction, n_samples,
        num_training_rounds, early_stopping_rounds, prefix, scoring, clf.
    Attributes set later:
        predictors, cols_to_remove (``train``/``preprocess``);
        test_df, output, multi_outputs (``predict``).
    """

    def __init__(self, df, target_column='', id_column='',
                 target_type='binary', categorical_columns=None,
                 drop_columns=None, numeric_columns=None,
                 num_training_rounds=500, verbose=1, sample_fraction=1.0,
                 n_samples=1, early_stopping_rounds=None,
                 prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear', 'binary' or 'multiclass'
        - categorical_columns (list): list of column names of categorical
          data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): upper bound on boosting rounds
        - verbose (bool): verbosity of printouts
        - sample_fraction (float): fraction of rows per random sub-sample;
          1.0 trains a single model on all rows
        - n_samples (int): number of sub-samples / models
        - early_stopping_rounds (int or None): forwarded to ``xgb.cv``
        - prefix (string): file name prefix of the pickled models
        - scoring (string or None): overrides the default metric
          ('auc', 'merror' or 'rmse' depending on target_type)

        Invalid input (no DataFrame, no target column) is reported with a
        printed message and leaves the instance without an estimator.
        No estimator is created for an unrecognised target_type either.
        """
        # Clamp the fraction first so the n_samples cap below works on the
        # value that is actually going to be used.
        sample_fraction = float(sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        elif sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = int(round(1.0 / sample_fraction))

        # Validate before calling len(df), which fails on non-DataFrames.
        if not isinstance(df, pd.DataFrame):
            print('please provide pandas dataframe')
            return
        self.df = df
        self.early_stopping_rounds = early_stopping_rounds
        if not target_column:
            print('please provide target column name')
            return

        # A sample must contain at least one row (guard the empty frame
        # against a division by zero).
        if len(df) > 0 and round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0 / len(df)

        self.target_column = target_column
        self.id_column = id_column
        self.target_type = target_type
        # Copy the lists so instances never share (or mutate) a default.
        self.categorical_columns = list(categorical_columns or [])
        self.numeric_columns = list(numeric_columns or [])
        self.drop_columns = list(drop_columns or [])
        self.verbose = verbose
        self.sample_fraction = sample_fraction
        self.n_samples = n_samples
        self.num_training_rounds = num_training_rounds
        self.prefix = prefix

        # init the classifier:
        if self.target_type == 'binary':
            self.scoring = 'auc'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='binary:logistic',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'multiclass':
            self.scoring = 'merror'
            self.clf = XGBClassifier(
                learning_rate=0.1,
                n_estimators=num_training_rounds,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='multi:softmax',
                scale_pos_weight=1,
                seed=123)
        elif self.target_type == 'linear':
            self.scoring = 'rmse'
            self.clf = XGBRegressor(
                n_estimators=num_training_rounds,
                objective='reg:linear')

        # if preferred scoring metric is stated:
        if scoring:
            self.scoring = scoring

    def train(self):
        """Preprocess ``self.df``, tune the round count by CV and fit.

        One model is fitted per sub-sample; after each fit the whole
        wrapper is pickled to ``<prefix>_<idx>.pkl`` so ``predict`` can
        reload the individual models.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)
        print('#### training ####')
        excluded = (self.target_column, self.id_column)
        self.predictors = [c for c in self.df.columns if c not in excluded]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)

        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples)
                  + ', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            labels = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)

            cvresult = self._cross_validate(xgb_param, xgtrain, labels)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(features, labels, eval_metric=self.scoring)

            # Predict training set:
            predictions = self.clf.predict(features)
            if self.target_type in ('binary', 'multiclass'):
                print("Accuracy : %.4g"
                      % metrics.accuracy_score(labels.values, predictions))
                if self.target_type == 'binary':
                    # probabilities are only needed for the AUC
                    proba = self.clf.predict_proba(features)[:, 1]
                    print("AUC Score (Train): %f"
                          % metrics.roc_auc_score(labels, proba))
            elif self.target_type == 'linear':
                mse = metrics.mean_squared_error(labels.values, predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))

            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def _cross_validate(self, xgb_param, xgtrain, labels):
        """Run ``xgb.cv`` and return its result table.

        Tries the legacy ``show_progress`` keyword, then ``verbose_eval``,
        and finally retries with ``num_class`` set (needed by multi-class
        objectives). ``xgb_param`` is updated in place in that last case.
        """
        cv_kwargs = dict(
            # Always start from the configured budget: the estimator's own
            # n_estimators is overwritten after every fit and would shrink
            # the search for every following sample.
            num_boost_round=self.num_training_rounds,
            nfold=5,
            metrics=[self.scoring],
            early_stopping_rounds=self.early_stopping_rounds)
        try:
            return xgb.cv(xgb_param, xgtrain, show_progress=self.verbose,
                          **cv_kwargs)
        except Exception:
            pass  # keyword not supported, or num_class missing: fall through
        try:
            return xgb.cv(xgb_param, xgtrain, verbose_eval=self.verbose,
                          **cv_kwargs)
        except Exception:
            xgb_param['num_class'] = len(labels.unique())
            return xgb.cv(xgb_param, xgtrain, **cv_kwargs)

    def predict(self, test_df, return_multi_outputs=False,
                return_mean_std=False):
        """Predict for ``test_df`` with the trained model(s).

        - return_multi_outputs: return the list of per-model outputs
        - return_mean_std: return ``(mean, std)`` over the models
        Otherwise the (averaged) output is returned. Binary models yield
        the probability of the positive class, all other target types the
        estimator's ``predict`` output. Results are also stored in
        ``self.output`` / ``self.multi_outputs``.
        """
        print('### predicting ###')
        print('## preprocessing test set')
        has_id = self.id_column in test_df
        has_target = self.target_column in test_df.columns
        if has_id:
            ids = test_df[self.id_column]
        if has_target:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        # preprocessing may drop these columns; write_csv needs them back
        if has_id:
            self.test_df[self.id_column] = ids
        if has_target:
            self.test_df[self.target_column] = targets
        # predictors unseen in the test set are passed as missing values
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        features = self.test_df[self.predictors]
        output_list = []
        output = None
        for idx in range(self.n_samples):
            if self.n_samples == 1:
                output = self._model_output(self, features)
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    model = self.load(filename)
                    output = self._model_output(model, features)
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')

        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            self.multi_outputs = (
                [list(output)] if output is not None else None)
        else:
            self.output = (
                np.mean(output_list, axis=0) if output_list else None)
            self.multi_outputs = output_list

        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            # Spread over the outputs that were actually produced (a
            # single model gives zeros rather than NaN).
            spread = (np.std(self.multi_outputs, axis=0)
                      if self.multi_outputs else None)
            return (self.output, spread)
        else:
            return self.output

    def _model_output(self, model, features):
        """Return the output of ``model.clf`` for ``features``."""
        if self.target_type == 'binary':
            return model.clf.predict_proba(features)[:, 1]
        return model.clf.predict(features)

    def feature_importance(self, num_print=10, display=True):
        """Print (and optionally plot) the ``num_print`` top f-scores."""
        # get_booster() replaced booster() in newer xgboost releases
        get_booster = getattr(self.clf, 'get_booster', None)
        booster = get_booster() if callable(get_booster) else self.clf.booster()
        scores = sorted(booster.get_fscore().items(),
                        key=operator.itemgetter(1), reverse=True)
        impt = pd.DataFrame(scores, columns=['feature', 'importance'])
        print(impt[:num_print])
        if display and not impt.empty:
            impt[:num_print].plot("feature", "importance", kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        """Return a purely numeric copy of ``df`` ready for XGBoost.

        Steps: one-hot encode ``categorical_columns``; when ``train`` is
        true, decide which columns to remove (more than 60% missing, or
        numeric with zero standard deviation; target and id columns are
        never selected) and remember them in ``self.cols_to_remove``;
        remove those columns; convert ``numeric_columns`` with
        ``pd.to_numeric(errors='coerce')``; drop ``drop_columns``; drop
        every remaining non-numeric, non-boolean column.

        With ``train=False`` the removal list from the last training call
        is reused; columns absent from ``df`` are skipped.
        """
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            dummies = pd.get_dummies(df[col]).rename(
                columns=lambda value, prefix=col: prefix + '_' + str(value))
            df = pd.concat([df, dummies], axis=1).drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            protected = (self.target_column, self.id_column)
            self.cols_to_remove = []

            # drop columns that are too sparse to be informative
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                if col in protected:
                    continue
                # arbitrary cutoff, if more than 60% missing then drop
                if df[col].isnull().mean() > 0.6:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if col in protected or col in self.cols_to_remove:
                    continue
                dtype = df[col].dtype
                is_number = (pd.api.types.is_numeric_dtype(dtype)
                             and not pd.api.types.is_bool_dtype(dtype))
                if is_number and df[col].std() == 0:
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        # errors='ignore': a test frame need not contain every column
        df = df.drop(self.cols_to_remove, axis=1, errors='ignore')
        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in df.columns:
                continue  # already removed above or absent from this frame
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)

        df = df.drop(self.drop_columns, axis=1, errors='ignore')

        # drop all those that are object type
        print('## dropping non-numerical columns')
        # is_numeric_dtype is true for every int/float width and for bool,
        # so uint8 dummy columns from older pandas are kept as well
        non_numeric = [col for col in df.columns
                       if not pd.api.types.is_numeric_dtype(df[col].dtype)]
        if self.verbose:
            for col in non_numeric:
                print('dropping because not int, float, or bool:', col)
        return df.drop(non_numeric, axis=1)

    def _to_int(self, num):
        """Return ``int(num)``, or None when it cannot be converted."""
        try:
            return int(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(self, num):
        """Return ``float(num)``, or None when it cannot be converted."""
        try:
            return float(num)
        except (TypeError, ValueError, OverflowError):
            return None

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample
          (default = 0.2)

        The rows are partitioned at random into disjoint chunks of
        ``round(fraction * len(df))`` rows (at least one row; the last
        chunk may be shorter) and the first ``n_samples`` chunks are
        returned as a list of DataFrames.
        """
        print('constructing random samples')
        num_rows = len(df)
        # at least one row per sample, otherwise no chunk could be built
        len_sample = max(1, int(round(fraction * num_rows)))

        # shuffle the row positions once, then cut them into chunks
        positions = list(range(num_rows))
        random.shuffle(positions)
        slice_list = [positions[start:start + len_sample]
                      for start in range(0, num_rows, len_sample)]
        if n_samples is not None:
            slice_list = slice_list[:max(0, int(n_samples))]

        # positional lookup, so any index type works
        return [df.iloc[rows, :] for rows in slice_list]

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we
          want to print them

        The header row is always written; a failure while writing the
        data rows is reported with a printed message instead of raised.
        """
        headers = [self.id_column, self.target_column]
        if include_actual:
            headers.append('actual')
        # text mode with newline='' is what the csv module expects
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)
            try:
                # positional access: the test index need not be 0..n-1
                ids = self.test_df[self.id_column].values
                actual = (self.test_df[self.target_column].values
                          if include_actual else None)
                for pos, value in enumerate(self.output):
                    row = [ids[pos], value]
                    if include_actual:
                        row.append(actual[pos])
                    writer.writerow(row)
                print('results written to ' + filename)
            except (AttributeError, KeyError, IndexError, TypeError) as err:
                print('write_csv failed:', err)

    def save(self, filename='xgb.pkl'):
        """Pickle this wrapper (including the estimator) with joblib."""
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        """Return the wrapper stored in ``model_file``.

        NOTE: joblib uses pickle, only load files from a trusted source.
        """
        return joblib.load(model_file)
min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:gamma', nthread=4, scale_pos_weight=1, seed=1024) #####parameter 1max_depth xgb_param = xgb1.get_xgb_params() cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='rmse', early_stopping_rounds=50) xgb1.set_params(n_estimators=cvresult.shape[0]) param_test1 = { 'max_depth':[3,4,5,6,7], 'min_child_weight':[3,4,5,6,7] } gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=1000,