def run_xgb(**args):
    print("building xgb model:")
    xgb_model = XGBRegressor()
    xgb_model.fit(args["training_data"], args["training_label"])
    output = xgb_model.predict(args["test_data"])
    pickle.dump(xgb_model, open("xgb_testmodel.p", "wb"))
    output = list(map(lambda e: round(e), output))
    print(output)
    pickle.dump(output, open("xgb_output.p", "wb"))
    return output
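# A minimal sketch (not part of the original snippet) of how the artifacts pickled by
# run_xgb() above could be loaded back later; the file names match the ones used in
# run_xgb, everything else is illustrative. xgboost must be installed for unpickling.
import pickle

with open("xgb_testmodel.p", "rb") as f:
    restored_model = pickle.load(f)
with open("xgb_output.p", "rb") as f:
    saved_predictions = pickle.load(f)
# restored_model.predict(...) can now be called on new data with the same feature layout.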
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
            cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH,
                                            "submission_{}.csv".format(cross_validation_index))
        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)
        model.fit(X_train[train_index], Y_train[train_index],
                  eval_set=[(X_train[valid_index], Y_train[valid_index])],
                  eval_metric=lambda y_predicted, y_true: (
                      "actual_mae",
                      mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()
    print("All done!")
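# ensemble_predictions() is called above but not shown in this excerpt. A minimal sketch of
# one plausible implementation, assuming it simply averages the per-fold submission files
# written in the loop; the folder and column constants match run(), the averaging itself is
# an assumption.
import glob
import os

import numpy as np
import pandas as pd


def ensemble_predictions():
    submission_paths = sorted(glob.glob(os.path.join(SUBMISSION_FOLDER_PATH, "submission_*.csv")))
    submissions = [pd.read_csv(path) for path in submission_paths]
    ensembled = submissions[0].copy()
    # average the predicted label column across all cross-validation folds
    ensembled[LABEL_COLUMN_NAME] = np.mean(
        [submission[LABEL_COLUMN_NAME].values for submission in submissions], axis=0)
    ensembled.to_csv(os.path.join(SUBMISSION_FOLDER_PATH, "submission_ensembled.csv"), index=False)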
X_test_scaled = X_test_scaled_new
trials = Trials()
algo = partial(tpe.suggest, n_startup_jobs=10)
best = fmin(lasso_f, lasso_space, algo=tpe.suggest, max_evals=2, trials=trials)
best_nodes = parse_lasso_nodes(trials, lasso_space_nodes)
save_inter_params(trials, lasso_space_nodes, best_nodes, "tmdb_box_office_prediction")
rsg = train_lasso_model(best_nodes, X_train_scaled, Y_train)
Y_pred = rsg.predict(X_test_scaled)
data = {"id": data_test["id"], "revenue": Y_pred}
output = pd.DataFrame(data=data)
output.to_csv("lasso_predicton.csv", index=False)
"""
rfc_model = XGBRegressor(random_state=42).fit(X_train_scaled, Y_train)
perm = PermutationImportance(rfc_model, random_state=42).fit(X_train_scaled, Y_train)
feature_importances1 = perm.feature_importances_  # the permutation weight of each feature
feature_importances_std = perm.feature_importances_std_
feature_importances2 = np.where(feature_importances1 > 0)  # record the column indices of the features kept
X_train_scaled_new = X_train_scaled[X_train_scaled.columns[feature_importances2]]
X_test_scaled_new = X_test_scaled[X_test_scaled.columns[feature_importances2]]
X_train_scaled = X_train_scaled_new
X_test_scaled = X_test_scaled_new
trials = Trials()
algo = partial(tpe.suggest, n_startup_jobs=10)
best = fmin(xgb_f, xgb_space, algo=tpe.suggest, max_evals=1, trials=trials)  # roughly 1,012,000,000 parameter combinations in total
from xgboost.sklearn import XGBRegressor, XGBClassifier

# regression example
data = np.array([
    [5, 20, 1.1],
    [7, 30, 1.3],
    [21, 55, 1.7],
    [30, 60, 1.8],
    [26, 40, 1.6],
])
xgb = XGBRegressor(n_estimators=n_estimators, learning_rate=LR, max_depth=MAX_DEPTH,
                   min_child_weight=min_child_weight, base_score=base_score, gamma=GAMMA)
xgb.fit(data[:, :-1], data[:, -1])
print("xgboost:", xgb.predict(data[0, :-1].reshape(1, -1)))

my_xgb_tree = XGBoostModel(target='regression', n_estimators=n_estimators, lr=LR,
                           max_depth=MAX_DEPTH, min_child_weight=min_child_weight,
                           reg_lambda=1, reg_alpha=0, base_score=base_score, gamma=GAMMA)
my_xgb_tree.fit(data)
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
X_test.head()
y = np.log1p(train.SalePrice)

xTrain, xTest, yTrain, yTest = train_test_split(X_train, y, test_size=0.2, random_state=0)
#d_Train = xgb.DMatrix(xTrain, label=yTrain)
#d_Test = xgb.DMatrix(xTest, label=yTest)

mod1 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
mod1.fit(xTrain, yTrain, early_stopping_rounds=100, eval_set=[(xTest, yTest)], verbose=True)
predictions = mod1.predict(xTest)
print("MSE = " + str(mean_squared_error(predictions, yTest)))
print(test.head())
print(predictions)
print(predictions.dtype)
predictions2 = np.expm1(mod1.predict(X_test))  # invert the log1p transform applied to SalePrice
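# Why expm1 rather than exp above: the target was transformed with np.log1p (log(1 + y)),
# so the exact inverse is np.expm1 (exp(y) - 1). A tiny standalone check:
import numpy as np

sale_prices = np.array([129900.0, 181500.0, 223500.0])
round_trip = np.expm1(np.log1p(sale_prices))
assert np.allclose(round_trip, sale_prices)  # np.exp() alone would be off by 1 everywhere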
def predict(X, Y, test_X, best_algo): # PCA pred_test_Ys = [] if 'xgboost' in best_algo or 'lasso' in best_algo: pca = PCA(n_components=40) pca.fit(X) print("pca explained variance ratio: {}...".format( sum(pca.explained_variance_ratio_))) pca_X = pca.transform(X) pca_test_X = pca.transform(test_X) if 'lasso' in best_algo: # lasso model = Lasso(alpha=1.0) model.fit(pca_X, Y) pred_test_Y = model.predict(pca_test_X) pred_test_Ys.append(pred_test_Y) if 'xgboost' in best_algo: # Xgboost model = XGBRegressor( learning_rate=0.01, # 默认0.3 n_estimators=500, # 树的个数 max_depth=3, # min_child_weight=1, # gamma=0, # subsample=0.8, # colsample_bytree=0.8, # scale_pos_weight=1 ) model.fit(pca_X, Y) pred_test_Y = model.predict(pca_test_X) pred_test_Ys.append(pred_test_Y) if 'stepwise' in best_algo: # stepwise forward selection (by p value) all_feature_indices = set(range(X.shape[1])) selected_feature_indices = [0, 1, 2, 3, 4, 5] def mp_get_pvalue(ind): """get p value for newly added feature, which is the last feature""" model = sm.OLS(Y, X[:, list(selected_feature_indices) + [ind]]).fit() pvalue = model.pvalues[-1] return pvalue def get_pvalue(Y, X): """get p value for newly added feature, which is the last feature""" model = sm.OLS(Y, X).fit() pvalue = model.pvalues[-1] return pvalue while len(selected_feature_indices) < MAX_N_FEATURE_SELECT: unselected_feature_indices = all_feature_indices - set( selected_feature_indices) unselected_feature_indice1_pvalue0 = 100 # some random large p-value selected_feature_index = 0 # some random index # multi-processing (doesn't seem to speed up, moreover, costs a lot more time) # import time # pool = multiprocessing.Pool(4) # unselected_feature_indices_list = list(unselected_feature_indices) # start_time = time.time() # unselected_feature_pvalues = pool.map(mp_get_pvalue, unselected_feature_indices_list) # print("takes {}...".format(time.time() - start_time)) # selected_feature_index = unselected_feature_indices_list[int(np.argmin(unselected_feature_pvalues))] # selected_feature_indices += [selected_feature_index] # construct array of pvalues unselected_feature_indices_list = list(unselected_feature_indices) unselected_feature_pvalues = [ get_pvalue(Y, X[:, list(selected_feature_indices + [ind])]) for ind in tqdm(unselected_feature_indices_list) ] selected_feature_index = unselected_feature_indices_list[int( np.argmin(unselected_feature_pvalues))] selected_feature_indices += [selected_feature_index] model = sm.OLS(Y, X[:, selected_feature_indices]).fit() pred_test_Y = model.predict(test_X[:, selected_feature_indices]) pred_test_Ys.append(pred_test_Y) if 'knn' in best_algo: model = KNeighborsRegressor(n_neighbors=5) model.fit(X, Y) pred_train_Y = model.predict(X) factor = np.mean( [Y[i] / pred_train_Y[i] for i in range(len(pred_train_Y))]) pred_test_Y = factor * model.predict(test_X) pred_test_Ys.append(pred_test_Y) if 'ensemble' in best_algo: n_algo = len(pred_test_Ys) pred_test_Y = np.sum(np.array(pred_test_Ys), axis=0) / n_algo return pred_test_Y
df_train['DaysFromJan2016'] = df_train['DaysFromJan2016'].dt.days
df_train = df_train.drop('Date', axis=1)
df_train.head()

X_train, X_test, y_train, y_test = train_test_split(df_train.drop('Price', axis=1),
                                                    df_train['Price'],
                                                    test_size=0.3, random_state=100)

"""<h1 id="Model-Building">Model Building<a class="anchor-link" href="#Model-Building">¶</a></h1><hr/>
<p><img src="https://slideplayer.com/slide/15204316/92/images/1/What+is+Regression+Analysis.jpg"/></p>

##1. Xgboost

#### Evaluation on the held-out test split
"""
from xgboost.sklearn import XGBRegressor

model = XGBRegressor(objective="reg:linear", random_state=42)
t0 = time.time()
model.fit(X_train, y_train)
print("fitting time:", round(time.time() - t0, 3), "s")
y_pred = model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
mse

"""#### Evaluation on the training split"""
y_pred = model.predict(X_train)
mse = metrics.mean_squared_error(y_train, y_pred)
mse
num_rounds = cv.shape[0] - 1
print('Best rounds: ', num_rounds)
params = {
    'n_estimators': num_rounds,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'min_child_weight': 1.1,
    'max_depth': 4,
    'silent': 1,
}
model = XGBRegressor(**params)

print('Starting Cross Validation...')
score = cross_val_score(model, train_x1, train_y1, cv=5)
print('Score: ', score)
print('Mean CV scores: ', np.mean(score))

print('Training...')
model.fit(train_x1, train_y1)

print('Predicting...')

########################################################################
# lasso
########################################################################
from sklearn.linear_model import Lasso, ElasticNet

best_alpha = 0.0015
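# Note on the cross_val_score call above: with no `scoring` argument a regressor is scored
# with R^2. If an RMSE-style number is wanted instead, one option (a sketch, not from the
# original script) is to request negated MSE and convert:
from sklearn.model_selection import cross_val_score
import numpy as np

neg_mse_scores = cross_val_score(model, train_x1, train_y1, cv=5,
                                 scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-neg_mse_scores)  # flip the sign, then take the square root
print('Mean CV RMSE: ', np.mean(rmse_scores))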
# ## INTERJECTION: Change models?
# At this point I thought it would be wise to try a different modelling technique. Even with the best
# will in the world, I was getting issues with the ever-reliable random forest. Namely, there was huge
# noise in my cross-validation score, which made choosing the right data-engineering steps a bit of a
# nightmare. Also, RF regression is tediously slow. Enter XGBoost.
#
# I won't explain how XGBoost works, as there is literature online which can explain it better than I
# ever will, but suffice to say it is similar to RF in that it combines a lot of trees; unlike RF,
# though, it doesn't build them in a random manner.

# In[ ]:

from xgboost.sklearn import XGBRegressor

# In[ ]:

xgb_test = XGBRegressor(learning_rate=0.05, n_estimators=500, max_depth=3, colsample_bytree=0.4)
cv_score = cross_val_score(xgb_test, train_med.drop(['SalePrice', 'Id'], axis=1),
                           train_med['SalePrice'], cv=5, n_jobs=-1)

# In[ ]:

print('CV Score is: ' + str(np.mean(cv_score)))

# This is quite a bit stronger than RF; submitting yields 0.13031, which puts us strongly in the
# top 50%. A while to go yet, but we are moving in the right direction. Let's use XGBoost as our
# regression method from now on.

# ## 4. Categoric to Numeric
# This is interesting. Some of the fields describing the quality of the property are 'secretly'
# ordinal. Case in point: the field entitled BsmtCond, which has different quality ratings. Perhaps
# turning these into their numeric correspondents will improve performance, as we will be able to
# mine out better trends (a small sketch of such a mapping follows below).
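# In[ ]:

# Minimal sketch of the ordinal mapping idea described above (the exact rating values and the
# set of columns are assumptions, not taken from the original notebook).
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}

for col in ['BsmtCond', 'BsmtQual', 'ExterQual', 'ExterCond']:
    if col in train_med.columns:
        train_med[col] = train_med[col].map(quality_map).fillna(0)  # NaN -> 0 for "no basement" etc.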
def cross_validation(dtrain,ytrain,predictors): #每次调整完一个参数,重新确定新的num_rounds dtrain = dtrain[predictors] xgb_model = XGBRegressor( learning_rate= 0.5, max_depth = 20, n_estimators = 100, min_child_weight = 1, gamma = 0, objective='reg:linear', nthread=4, ) modelfit(xgb_model,dtrain,ytrain) print('tunning learning rate...') params = {'learning_rate':[0.01,0.015,0.025,0.05,0.1]} gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring = 'neg_mean_squared_error',n_jobs = 4,iid=False,cv=5) gsearch.fit(dtrain.values,ytrain) xgb_model.set_params(learning_rate = gsearch.best_params_['learning_rate']) print(gsearch.best_params_) print('tunning max_depth...') params = { 'max_depth':[3,5,7,9]} print(xgb_model.get_params()['n_estimators']) gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5) gsearch.fit(dtrain.values,ytrain) xgb_model.set_params(max_depth = gsearch.best_params_['max_depth']) print(gsearch.best_params_) #choose best num_round modelfit(xgb_model,dtrain,ytrain) print(xgb_model.get_params()['n_estimators']) print('tunning min_child_weight...') param_child_weight = {'min_child_weight':[1,3,5,7]} gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_child_weight, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5) gsearch.fit(dtrain.values,ytrain) xgb_model.set_params(min_child_weight = gsearch.best_params_['min_child_weight']) print(xgb_model.get_params()) modelfit(xgb_model,dtrain.values,ytrain) print(xgb_model.get_params()['n_estimators']) print('tunning gamma...') param_gamma = {'gamma':[0.05,0.1,0.3,0.5,0.7,0.9,1]} gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_gamma, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5) gsearch.fit(dtrain.values,ytrain) xgb_model.set_params(gamma = gsearch.best_params_['gamma']) print(xgb_model.get_params()) modelfit(xgb_model,dtrain.values,ytrain) print(xgb_model.get_params()['n_estimators']) #print('tunning colsample_bylevel') #param_colsample_bylevel = {'colsample_bylevel':[0.6,0.8,1]} #gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bylevel, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5) #gsearch.fit(dtrain.values,ytrain) #xgb_model.set_params(colsample_bylevel = gsearch.best_params_['colsample_bylevel']) #tunning colsample_bytree print(xgb_model.get_params()) modelfit(xgb_model,dtrain.values,ytrain) print('num_rounds after tunning colsample_bylevel:%f'%xgb_model.get_params()['n_estimators']) print('tunning colsample_bytree...') param_colsample_bytree = {'colsample_bytree':[0.6,0.7,0.8,1]} gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bytree, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5) gsearch.fit(dtrain.values,ytrain) xgb_model.set_params(colsample_bytree = gsearch.best_params_['colsample_bytree']) print(xgb_model.get_params()) modelfit(xgb_model,dtrain.values,ytrain) print('num_rounds after tunning colsample_bytree:%f'%xgb_model.get_params()['n_estimators']) # save and return model cur_time = time.strftime("%Y-%m-%d-%H-%M",time.localtime()) pickle.dump(xgb_model,open('../models/autogridsearch_xgb_'+cur_time+'.model','wb')) cv_score(xgb_model,dtrain.values,ytrain) return xgb_model
class Xgb: def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None): """ input params: - df (DataFrame): dataframe of training data - target_column (string): name of target column - id_column (string): name of id column - target_type (string): 'linear' or 'binary' - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding - drop_columns (list): list of columns to drop - numeric_columns (list): list of columns to convert to numeric - verbose (bool): verbosity of printouts """ # checks for sampling sample_fraction = float(sample_fraction) if sample_fraction > 1: sample_fraction = 1.0 if sample_fraction * n_samples > 1: n_samples = round(1.0/sample_fraction) if sample_fraction <= 0: print('sample_fraction 0 or negative, switching to 0.1') sample_fraction = 0.1 # if sample_fraction is results in sample smaller than 1 if round(sample_fraction * len(df)) == 0: sample_fraction = 1.0/len(df) # check if data is dataframe if type(df) == pd.core.frame.DataFrame: self.df = df self.early_stopping_rounds = early_stopping_rounds if target_column: self.target_column = target_column self.id_column = id_column self.target_type = target_type self.categorical_columns = categorical_columns self.numeric_columns = numeric_columns self.drop_columns = drop_columns self.verbose = verbose self.sample_fraction = sample_fraction self.n_samples = n_samples self.num_training_rounds = num_training_rounds self.prefix = prefix # init the classifier: if self.target_type == 'binary': self.scoring = 'auc' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic', scale_pos_weight = 1, seed = 123) elif self.target_type == 'multiclass': self.scoring = 'merror' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'multi:softmax', scale_pos_weight = 1, seed = 123) elif self.target_type == 'linear': self.scoring = 'rmse' self.clf = XGBRegressor( n_estimators = num_training_rounds, objective = 'reg:linear' ) # if preferred scoring metric is stated: if scoring: self.scoring = scoring else: print('please provide target column name') else: print('please provide pandas dataframe') def train(self): print('#### preprocessing ####') self.df = self.preprocess(self.df) print('#### training ####') self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]] xgb_param = self.clf.get_xgb_params() # if subsampling if self.sample_fraction == 1.0: df_list = [self.df] else: df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples) print(df_list) for idx, current_df in enumerate(df_list): print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction)) xgtrain = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan) try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose) except: try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], 
early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose) except: xgb_param['num_class'] = len(current_df[self.target_column].unique()) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds) self.clf.set_params(n_estimators=cvresult.shape[0]) self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring) #Predict training set: train_df_predictions = self.clf.predict(current_df[self.predictors]) if self.target_type == 'binary' or self.target_type == 'multiclass': train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1] print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions)) if self.target_type == 'binary': print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob)) elif self.target_type == 'linear': print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)) print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))) filename = self.prefix + '_' + str(idx) + '.pkl' self.save(filename) def predict(self, test_df, return_multi_outputs=False, return_mean_std=False): print('### predicting ###') print('## preprocessing test set') if self.id_column in test_df: ids = test_df[self.id_column] if self.target_column in test_df.columns: targets = test_df[self.target_column] self.test_df = self.preprocess(test_df, train=False) if self.id_column in test_df: self.test_df[self.id_column] = ids if self.target_column in test_df.columns: self.test_df[self.target_column] = targets for col in self.predictors: if col not in self.test_df.columns: self.test_df[col] = np.nan # prediction print('## predicting from test set') output_list = [] output = None for idx, ns in enumerate(range(self.n_samples)): if self.n_samples == 1: xgb = self if self.target_type == 'binary': output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': output = xgb.clf.predict(self.test_df[self.predictors]) else: try: filename = self.prefix + '_' + str(idx) + '.pkl' xgb = self.load(filename) if self.target_type == 'binary': output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': output = xgb.clf.predict(self.test_df[self.predictors]) output_list.append(list(output)) except IOError: print('no file found, skipping') # average the outputs if n_samples is more than one if self.n_samples == 1: self.output = output try: self.multi_outputs = [list(output)] except: self.multi_outputs = None else: self.output = np.mean(output_list, axis=0) self.multi_outputs = output_list if return_multi_outputs: return self.multi_outputs elif return_mean_std: return (self.output, np.std(output_list, axis=0)) else: return self.output def feature_importance(self, num_print=10, display=True): feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True) impt = pd.DataFrame(feature_importance) impt.columns = ['feature', 'importance'] print(impt[:num_print]) if display: impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3)) def preprocess(self, df, train=True): # one hot encoding of categorical variables print('## one hot encoding of categorical variables') 
for col in self.categorical_columns: if self.verbose: print('one hot encoding: ', col) df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1) df = df.drop([col], axis=1) # if training, determine columns to be removed if train: # drop columns that are too sparse to be informative self.cols_to_remove = [] print('## dropping columns below sparsity threshold') for col in df.columns: nan_cnt = 0 for x in df[col]: try: if np.isnan(x): nan_cnt += 1 except: pass if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop if self.verbose: print('will drop', col) self.cols_to_remove.append(col) # drop columns that have no standard deviation (not informative) print('## dropping columns with no variation') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64': if df[col].std() == 0: print('will drop', col) self.cols_to_remove.append(col) if self.verbose and self.cols_to_remove: print('dropping the following columns:', self.cols_to_remove) df = df.drop(self.cols_to_remove, axis=1) if self.verbose: print('## DataFrame shape is now:', df.shape) # convert to numerical where possible #print('## converting numerical data to numeric dtype') #df = df.convert_objects(convert_numeric=True) # convert columns specified to be int and float for col in self.numeric_columns: if self.verbose: print('converting', col) df[col] = pd.to_numeric(df[col], errors='coerce') if self.verbose: print(df[col].dtype) df = df.drop(self.drop_columns, axis=1) # drop all those that are object type print('## dropping non-numerical columns') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool': pass else: if self.verbose: print('dropping because not int, float, or bool:', col) df = df.drop([col], axis=1) return df def _to_int(self, num): try: return int(num) except: return def _to_float(self, num): try: return float(num) except: return def random_sample(self, df, fraction=0.2, n_samples=None): """ splits into random samples - n_samples: how many samples you want returned (default = All) - fraction : what fraction of data to include in the sample (default = 0.2) """ print('constructing random samples') num_rows = len(df) len_sample = round(fraction * num_rows) # create list of slice index lists indices = range(0,num_rows) slice_list = [] tmp_idx_list = [] while len(indices) > 0: while len(tmp_idx_list) < len_sample and len(indices) > 0: idx = indices.pop(random.randrange(len(indices))) tmp_idx_list.append(idx) slice_list.append(tmp_idx_list) tmp_idx_list = [] # get slices sample_list = [] for s in range(n_samples): try: sample_list.append(df.loc[slice_list[s],:]) except: pass return sample_list def write_csv(self, filename, include_actual=False): """ write results to csv - include actual: if actual values are known for test set, and we want to print them """ with open(filename, 'wb') as csvfile: writer = csv.writer(csvfile) headers = [self.id_column, self.target_column] if include_actual: headers.append('actual') writer.writerow(headers) try: for idx, value in enumerate(self.output): test_id = self.test_df[self.id_column][idx] test_output = self.output[idx] to_write = [test_id, test_output] if include_actual: to_write.append(self.test_df[self.target_column][idx]) writer.writerow(to_write) print('results written to ' + filename) except: print('write_csv failed') def save(self, filename='xgb.pkl'): joblib.dump(self, filename) def load(self, model_file='xgb.pkl'): xgb = 
joblib.load(model_file) return xgb
    'colsample_bytree': np.arange(0.1, 1.0, 0.1)
}
# param_grid = {
#     'max_depth': [7, 8],
#     'min_child_weight': [4, 5]
# }
#gsearch1 = GridSearchCV(estimator = XGBClassifier(
#    learning_rate =0.1, n_estimators=140, max_depth=9,
#    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
#    objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
#    param_grid=param_grid, cv=10)

# Note: as in the original code, this estimator keeps a classifier-style objective
# ('binary:logistic') and eval_metric ('error') on an XGBRegressor.
gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.2,
                                               objective='binary:logistic',
                                               booster='gbtree',
                                               eta=0.2,
                                               max_depth=4,  # 4 3
                                               colsample_bytree=0.7,  # 0.8
                                               subsample=0.7,
                                               min_child_weight=1,  # 2 3
                                               silent=0,
                                               eval_metric='error'),
                        param_grid=param_grid, cv=10)
gsearch1.fit(np.array(x_train), np.array(y_train))
print(gsearch1.best_params_, gsearch1.best_score_)
print(best)
best = {'alpha': 2.0, 'colsample_bytree': 0.9, 'eta': 0.04, 'gamma': 0.0, 'lambda': 1.8,
        'max_depth': 9, 'min_child_weight': 4.0, 'n_estimators': 968.0, 'subsample': 0.55}

# launch prediction with these parameters
from xgboost.sklearn import XGBRegressor

clf = XGBRegressor(
    learning_rate=float(best['eta']),
    max_depth=int(best['max_depth']),
    min_child_weight=float(best['min_child_weight']),
    subsample=float(best['subsample']),
    gamma=float(best['gamma']),
    colsample_bytree=float(best['colsample_bytree']),
    n_estimators=int(best['n_estimators']),
    reg_alpha=float(best['alpha']),
    reg_lambda=float(best['lambda']),
    objective='reg:linear',
    eval_metric='mae',
    nthread=-1,
    booster='gbtree',
    tree_method='exact',
    silent=1
)

# test
clf.fit(X, y)
y_hat = clf.predict(X_test)
dscores = metrics_regression(y_test, y_hat, X.shape[1])
tf = t.since('test')
print('\nBayesian tuning - test: bias = %.3f mae = %.3f r2 = %.3f (time: %s)'
      % (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))
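# metrics_regression() and format_duration() are used above but not defined in this excerpt.
# A rough sketch of what the metrics helper might compute, matching the keys it is indexed
# with ('bias', 'mae', 'r2'); the exact definition, and what n_features is used for (possibly
# an adjusted R^2 elsewhere), are assumptions.
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score


def metrics_regression(y_true, y_pred, n_features):
    return {
        'bias': float(np.mean(np.asarray(y_pred) - np.asarray(y_true))),  # average signed error
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }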
def run(self, train_data_path): """Takes argument 'train_data_path'. train_data_path: Training data path. Performs models selection process on the specified order. A no. of reqred models can added to this method body and corss validated These can be saved as it is or ensembling can be applied. """ #Loading training data dtrain = pd.read_csv(train_data_path) #gets predictors predictor_vars = self.get_predictors(dtrain) #Model I xgboost = XGBRegressor(learning_rate=0.06, n_estimators=1000, max_depth=2, min_child_weight=2, gamma=0, subsample=0.4, colsample_bytree=0.2, objective='reg:linear', nthread=-1, scale_pos_weight=1, seed=27, reg_alpha=77) #Model II xgboost2 = XGBRegressor(learning_rate=0.04, n_estimators=1500, max_depth=2, min_child_weight=0, gamma=0, subsample=0.7, colsample_bytree=0.2, objective='reg:linear', nthread=-1, scale_pos_weight=1, seed=99, reg_alpha=1.7) #Model III xgboost3 = XGBRegressor(learning_rate=0.02, n_estimators=1200, max_depth=3, min_child_weight=2, gamma=0, subsample=0.65, colsample_bytree=0.2, objective='reg:linear', nthread=-1, scale_pos_weight=1, seed=585, reg_alpha=5000) #Model IV lightgbm = LGBMRegressor(objective='regression', num_leaves=4, min_data_in_leaf=5, learning_rate=0.02, n_estimators=3000, max_bin=320, bagging_fraction=0.85, bagging_freq=10, bagging_seed=9, feature_fraction=0.2, feature_fraction_seed=9, data_random_seed=9, reg_alpha=0.55, reg_lambda=0.3, verbose=-1) #Model V lightgbm2 = LGBMRegressor(objective='regression', num_leaves=4, min_data_in_leaf=3, learning_rate=0.01, n_estimators=4000, max_bin=295, bagging_fraction=0.5, bagging_freq=10, bagging_seed=24, feature_fraction=0.2, feature_fraction_seed=24, data_random_seed=24, reg_alpha=10, reg_lambda=0.7, verbose=-1) #Ensembling all the five models ens_model = EnsembleRegressor( [xgboost, xgboost2, xgboost3, lightgbm, lightgbm2]) #Performs cross validation on the ensembled model. self.cross_validate(cv=5, model=ens_model, X=dtrain[predictor_vars], y=dtrain[self.target_var], n_jobs=1) #CV Score is: 0.92528287952747 all predictors #Saving the final model. self.finalize_and_save(ens_model, self.output_file_path, dtrain[predictor_vars], dtrain[self.target_var])
print(" Score (Train): %f" % mean_squared_error(train_Y.values, dtrain_predictions)) #Predict on testing data: dtest_predictions = alg.predict(test_X) print("Score (Test): %f" % mean_squared_error(test_Y.values, dtest_predictions)) XGBmodel = XGBRegressor(booster='gbtree', objective= 'reg:linear', eval_metric='rmse', gamma = 0.1, min_child_weight= 1.1, max_depth= 5, subsample= 0.7, colsample_bytree= 0.7, tree_method= 'exact', learning_rate=0.1, n_estimators=300, nthread=4, scale_pos_weight=1, seed=27 ) modelfit(XGBmodel) #adjust parameters param_test1 = { 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
class Xgb: def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None): """ input params: - df (DataFrame): dataframe of training data - target_column (string): name of target column - id_column (string): name of id column - target_type (string): 'linear' or 'binary' - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding - drop_columns (list): list of columns to drop - numeric_columns (list): list of columns to convert to numeric - verbose (bool): verbosity of printouts """ if type(df) == pd.core.frame.DataFrame: self.df = df self.early_stopping_rounds = early_stopping_rounds if target_column: self.target_column = target_column self.id_column = id_column self.target_type = target_type self.categorical_columns = categorical_columns self.numeric_columns = numeric_columns self.drop_columns = drop_columns self.verbose = verbose self.num_training_rounds = num_training_rounds # init the classifier if self.target_type == 'binary': self.scoring = 'auc' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic', scale_pos_weight = 1, seed = 123) elif self.target_type == 'linear': self.scoring = 'rmse' self.clf = XGBRegressor( n_estimators = num_training_rounds, objective = 'reg:linear' ) else: print('please provide target column name') else: print('please provide pandas dataframe') def train(self): print('#### preprocessing ####') self.df = self.preprocess(self.df) print('#### training ####') self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]] xgb_param = self.clf.get_xgb_params() xgtrain = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan) try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose) except: try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose) except: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds) self.clf.set_params(n_estimators=cvresult.shape[0]) self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring) #Predict training set: train_df_predictions = self.clf.predict(self.df[self.predictors]) if self.target_type == 'binary': train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1] print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions)) print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob)) elif self.target_type == 'linear': print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)) print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))) def predict(self, test_df): print('### predicting ###') print('## preprocessing test set') if self.id_column in test_df: ids = test_df[self.id_column] if self.target_column in 
test_df.columns: targets = test_df[self.target_column] self.test_df = self.preprocess(test_df, train=False) if self.id_column in test_df: self.test_df[self.id_column] = ids if self.target_column in test_df.columns: self.test_df[self.target_column] = targets for col in self.predictors: if col not in self.test_df.columns: self.test_df[col] = np.nan if self.target_type == 'binary': self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': self.output = self.clf.predict(self.test_df[self.predictors]) return self.output def feature_importance(self, num_print=10, display=True): feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True) impt = pd.DataFrame(feature_importance) impt.columns = ['feature', 'importance'] print(impt[:num_print]) if display: impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3)) def preprocess(self, df, train=True): # one hot encoding of categorical variables print('## one hot encoding of categorical variables') for col in self.categorical_columns: if self.verbose: print('one hot encoding: ', col) df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1) df = df.drop([col], axis=1) # if training, determine columns to be removed if train: # drop columns that are too sparse to be informative self.cols_to_remove = [] print('## dropping columns below sparsity threshold') for col in df.columns: nan_cnt = 0 for x in df[col]: try: if np.isnan(x): nan_cnt += 1 except: pass if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop if self.verbose: print('will drop', col) self.cols_to_remove.append(col) # drop columns that have no standard deviation (not informative) print('## dropping columns with no variation') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64': if df[col].std() == 0: print('will drop', col) self.cols_to_remove.append(col) if self.verbose and self.cols_to_remove: print('dropping the following columns:', self.cols_to_remove) df = df.drop(self.cols_to_remove, axis=1) if self.verbose: print('## DataFrame shape is now:', df.shape) # convert to numerical where possible #print('## converting numerical data to numeric dtype') #df = df.convert_objects(convert_numeric=True) # convert columns specified to be int and float for col in self.numeric_columns: if col not in self.cols_to_remove: if self.verbose: print('converting', col) df[col] = pd.to_numeric(df[col], errors='coerce') if self.verbose: print(df[col].dtype) # drop those marked for dropping df = df.drop(self.drop_columns, axis=1) # drop all those that are object type print('## dropping non-numerical columns') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool': pass else: if self.verbose: print('dropping because not int, float, or bool:', col) df = df.drop([col], axis=1) return df def _to_int(self, num): try: return int(num) except: return def _to_float(self, num): try: return float(num) except: return def write_csv(self, filename, include_actual=False): """ write results to csv - include actual: if actual values are known for test set, and we want to print them """ with open(filename, 'wb') as csvfile: writer = csv.writer(csvfile) headers = [self.id_column, self.target_column] if include_actual: headers.append('actual') writer.writerow(headers) for idx, value in enumerate(self.output): test_id = 
self.test_df[self.id_column][idx] test_output = self.output[idx] to_write = [test_id, test_output] if include_actual: to_write.append(self.test_df[self.target_column][idx]) writer.writerow(to_write) def save(self, filename='xgb.pkl'): joblib.dump(self, filename)
class XGBoostModel:
    def __init__(self):
        self.model = None

    def train(self, X_train, X_test, y_train, y_test):
        '''
        Trains the model on the provided training split.
        The fitted model is saved under generated/gxboost_model.pickle and
        evaluate() prints the RMSE and MAE scores on the test split.
        '''
        print('Training is starting...')
        eval_set = [(X_train, y_train), (X_test, y_test)]
        self.model = XGBRegressor(max_depth=7, objective='reg:squarederror', gamma=0,
                                  learning_rate=0.03, subsample=1, colsample_bytree=0.9,
                                  min_child_weight=10)
        self.model.fit(X_train, y_train, eval_set=eval_set, eval_metric="rmse",
                       early_stopping_rounds=500)
        predictions = self.predict(X_test)
        with open('generated/gxboost_model.pickle', 'wb') as file:
            pickle.dump(self.model, file)
        self.evaluate(y_test, X_test)

    def predict(self, X_test):
        predictions = self.model.predict(X_test)
        return predictions

    def grid_search(self, X_train, X_test, y_train, y_test):
        grid_param = {
            'max_depth': [n for n in range(2, 10)],
            'gamma': np.arange(0, 0.5, 0.1),
            'learning_rate': [0.0001, 0.001, 0.01, 0.1],
            'subsample': np.arange(0.5, 0.9, 0.1),
            'colsample_bytree': np.arange(0.5, 0.9, 0.1),
            'min_child_weight': [1, 3, 5, 7]
        }
        model = XGBRegressor(max_depth=7, objective='reg:squarederror', gamma=0,
                             learning_rate=0.03, subsample=1, colsample_bytree=0.9,
                             min_child_weight=10)
        gd_sr = GridSearchCV(estimator=model, param_grid=grid_param,
                             scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
        gd_sr.fit(X_train, y_train)
        best_parameters = gd_sr.best_params_
        print(best_parameters)

    def evaluate(self, y_test, X_test):
        print('#' * 15 + ' Model Evaluation ' + '#' * 15)
        print()
        predictions = self.predict(X_test)
        predictions = MLProcessing.invert_scaling(predictions)
        y_test = MLProcessing.invert_scaling(np.array(y_test))
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        mae = mean_absolute_error(y_test, predictions)
        print('RMSE: {} - MAE: {}'.format(rmse, mae))
        print()
        print('#' * 48)
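# Minimal usage sketch for the class above. `features` and `target` are hypothetical
# placeholders for the project's prepared data; MLProcessing.invert_scaling must match
# whatever scaling was applied to the target.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
xgb_model = XGBoostModel()
xgb_model.train(X_train, X_test, y_train, y_test)   # fits, pickles the model, prints RMSE/MAE
new_predictions = xgb_model.predict(X_test)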
def fit(self, inputs_train, labels_train, fit_options={}): xgb_reg = XGBRegressor(random_state=self.options['seed']) print('Starting with low learning rate and tuning: \ max_depth, min_child_weight, n_estimators') params = { "learning_rate": [0.1], # np.arange(0.05,0.45,0.05), #eta # np.arange(2,14,2), "max_depth": self.options['max_depth'], # np.arange(1,7,6), "min_child_weight": self.options['min_child_weight'], # np.arange(10,80,10), "n_estimators": self.options['n_estimators'], "colsample_bytree": [0.8], "subsample": [0.8], "gamma": [0], } GSCV = GridSearchCV( xgb_reg, # , #np.arange(0.05,0.45,0.05), #eta), params, cv=self.options['cv'], scoring=self.options['scoring'], n_jobs=self.options['n_jobs'], verbose=self.options['verbose'], # verbose, return_train_score=True) GSCV.fit(inputs_train, labels_train) print('best_params_:', GSCV.best_params_) # , print('best_score_:', GSCV.best_score_) print('Tuning: gamma') params = { "learning_rate": [0.1], # np.arange(0.05,0.45,0.05), #eta "max_depth": [GSCV.best_params_['max_depth']], "min_child_weight": [GSCV.best_params_['min_child_weight']], "n_estimators": [GSCV.best_params_['n_estimators']], "colsample_bytree": [0.8], "subsample": [0.8], # np.arange(0.05,0.45,0.05), "gamma": self.options['gamma'], } GSCV = GridSearchCV( xgb_reg, # , #np.arange(0.05,0.45,0.05), #eta), params, cv=self.options['cv'], scoring=self.options['scoring'], n_jobs=self.options['n_jobs'], verbose=self.options['verbose'], # verbose, return_train_score=True) GSCV.fit(inputs_train, labels_train) print('best_params_:', GSCV.best_params_) # , print('best_score_:', GSCV.best_score_) print('Tuning: colsample_bytree, subsample') params = { "learning_rate": [0.1], # np.arange(0.05,0.45,0.05), #eta "max_depth": [GSCV.best_params_['max_depth']], "min_child_weight": [GSCV.best_params_['min_child_weight']], "n_estimators": [GSCV.best_params_['n_estimators']], "gamma": [GSCV.best_params_['gamma']], # np.arange(0.60, 0.95, 0.05), "colsample_bytree": self.options['colsample_bytree'], # np.arange(0.60, 0.95, 0.05), "subsample": self.options['subsample'], } GSCV = GridSearchCV( xgb_reg, # , #np.arange(0.05,0.45,0.05), #eta), params, cv=self.options['cv'], scoring=self.options['scoring'], n_jobs=self.options['n_jobs'], verbose=self.options['verbose'], # verbose, return_train_score=True) GSCV.fit(inputs_train, labels_train) print('best_params_:', GSCV.best_params_) # , print('best_score_:', GSCV.best_score_) print('Tuning: reg_alpha, reg_lambda') params = { "learning_rate": [0.1], # np.arange(0.05,0.45,0.05), #eta "max_depth": [GSCV.best_params_['max_depth']], "min_child_weight": [GSCV.best_params_['min_child_weight']], "n_estimators": [GSCV.best_params_['n_estimators']], "gamma": [GSCV.best_params_['gamma']], "colsample_bytree": [GSCV.best_params_['colsample_bytree']], "subsample": [GSCV.best_params_['subsample']], # ,[1e-5, 1e-2, 0.1, 1, 10], #alpha "reg_alpha": self.options['reg_alpha'], # [1e-5, 1e-2, 0.1, 1, 10],#lambda "reg_lambda": self.options['reg_lambda'], } GSCV = GridSearchCV( xgb_reg, # , #np.arange(0.05,0.45,0.05), #eta), params, cv=self.options['cv'], scoring=self.options['scoring'], n_jobs=self.options['n_jobs'], verbose=self.options['verbose'], # verbose, return_train_score=True) GSCV.fit(inputs_train, labels_train) print('best_params_:', GSCV.best_params_) # , print('best_score_:', GSCV.best_score_) print('Tuning: learning_rate') params = { # np.arange(0.025,0.150,0.025), #np.arange(0.05,0.45,0.05), #eta "learning_rate": self.options['learning_rate'], "max_depth": 
[GSCV.best_params_['max_depth']], "min_child_weight": [GSCV.best_params_['min_child_weight']], "n_estimators": [GSCV.best_params_['n_estimators']], "gamma": [GSCV.best_params_['gamma']], "colsample_bytree": [GSCV.best_params_['colsample_bytree']], "subsample": [GSCV.best_params_['subsample']], "reg_alpha": [GSCV.best_params_['reg_alpha']], # alpha "reg_lambda": [GSCV.best_params_['reg_lambda']] # lambda } GSCV = GridSearchCV( xgb_reg, # , #np.arange(0.05,0.45,0.05), #eta), params, cv=self.options['cv'], scoring=self.options['scoring'], n_jobs=self.options['n_jobs'], verbose=self.options['verbose'], # verbose, return_train_score=True) GSCV.fit(inputs_train, labels_train) print('best_params_:', GSCV.best_params_) # , print('best_score_:', GSCV.best_score_) print('Final model') # Regression regressor = XGBRegressor(random_state=self.options['seed']) # seed) regressor.set_params(**GSCV.best_params_) trained_regressor = regressor.fit(inputs_train, labels_train) self.regressor = trained_regressor self.feature_importances_ = self.regressor.feature_importances_
# Fit the algorithm on the data
print('start training')
alg.fit(train_data, train_target)
print('start predicting')
dtrain_predictions = alg.predict(test_data)
daochu(dtrain_predictions)  # export the predictions ("daochu" = export)
print('Feature Importance')
get_xgb_feat_importances(alg)

# Print Feature Importance:
# feat_imp = pd.Series(get_xgb_feat_importances(alg)(1)).sort_values(ascending=False)
# feat_imp.plot(kind='bar', title='Feature Importances')
# plt.ylabel('Feature Importance Score')
# plt.show()
# get_xgb_feat_importances(alg)

xgb1 = XGBRegressor(learning_rate=0.01,
                    n_estimators=1000,
                    max_depth=6,
                    min_child_weight=6,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    reg_alpha=0.005,
                    objective='reg:linear',
                    nthread=14,
                    scale_pos_weight=3,
                    seed=2016)
modelfit(xgb1, train_data, train_target, test_data)
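# get_xgb_feat_importances() is called above but not shown; one plausible implementation
# (a sketch, not the original helper) that turns the booster's f-scores into a sorted
# DataFrame. get_booster() is the current sklearn-wrapper accessor; very old xgboost
# versions exposed booster() instead.
import pandas as pd


def get_xgb_feat_importances(model):
    fscores = model.get_booster().get_fscore()
    importances = pd.DataFrame(sorted(fscores.items(), key=lambda kv: kv[1], reverse=True),
                               columns=['feature', 'importance'])
    print(importances)
    return importances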
# Load the train and test data sets
dataSet = train_data.iloc[:, :-3]
labelSet = train_data.iloc[:, -3:]
y_label = []
for i in range(test_features.shape[1]):
    train_text, test_text, train_labels, test_labels = train_test_split(
        dataSet.iloc[:, i], labelSet, test_size=0.33, random_state=23333)
    train_text = np.mat(train_text).T
    test_text = np.mat(test_text).T

    # ==========================================================================
    # Model selection
    # ==========================================================================
    model_lscv = LassoCV()
    model_xgb = XGBRegressor()
    model_rfg = RandomForestRegressor()
    model_gb = GradientBoostingRegressor()

    preds1_test1, preds1_test2 = model_select(model_lscv)
    preds2_test1, preds2_test2 = model_select(model_xgb)
    preds3_test1, preds3_test2 = model_select(model_rfg)
    preds4_test1, preds4_test2 = model_select(model_gb)

    pred1 = (preds1_test1 + preds2_test1 + preds3_test1 + preds4_test1) / 4
    pred2 = (preds1_test2 + preds2_test2 + preds3_test2 + preds4_test2) / 4

    print('----- feature %s -----' % i)
    print('Fit score1 + score2: The pearsonr of test set is {}'.format(
        pearsonr(list(test_labels.iloc[:, -1]), list(pred1))[0]))
    print('Only fit score: the pearsonr of test set is {}'.format(
        pearsonr(list(test_labels.iloc[:, -1]), list(pred2))[0]))
# 1
#from sklearn.ensemble import GradientBoostingRegressor
#model = GradientBoostingRegressor()

# 2
#from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier(n_estimators = 200, max_depth = 50)

import xgboost as xgb
from xgboost.sklearn import XGBRegressor

model = XGBRegressor(learning_rate=0.1,
                     n_estimators=310,
                     max_depth=5,
                     min_child_weight=2,
                     gamma=0.2,
                     subsample=0.85,
                     colsample_bytree=0.65,
                     objective='reg:linear',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27,
                     reg_alpha=84)
model.fit(x_train, y_train)
#model.fit(x_data, y_data)
y_test = model.predict(x_test)

#####################################################################
# Uncomment the two lines below to roughly check the accuracy of
# your model on the validation data
#y_pred = forest.predict(x_valid)
def train(X, Y): """search params and try different algs in ALGOS_TO_RUN; save a recipe, which contains the best algo (xgboost, ensemble, lasso, etc.) for the city""" # K-fold crossvalidation kfold = KFold(n_splits=K_FOLD_SPLITS) train_Ys, valid_Ys, train_metrics, valid_metrics, train_ensemble, valid_ensemble = {}, {}, {}, {}, {}, {} for algo in ALGOS_TO_RUN: train_Ys[algo] = [] valid_Ys[algo] = [] train_metrics[algo] = [] valid_metrics[algo] = [] train_ensemble = [] valid_ensemble = [] for kf_id, (train_indices, valid_indices) in enumerate(kfold.split(X)): if kf_id >= K_FOLDS_TO_RUN: break train_X = X[train_indices] val_X = X[valid_indices] train_Y = Y[train_indices] val_Y = Y[valid_indices] train_ensemble1 = [] valid_ensemble1 = [] # PCA if 'xgboost' in ALGOS_TO_RUN or 'lasso' in ALGOS_TO_RUN: pca = PCA(n_components=40) pca.fit(train_X) print("pca explained variance ratio: {}...".format( sum(pca.explained_variance_ratio_))) pca_train_X = pca.transform(train_X) pca_val_X = pca.transform(val_X) if 'lasso' in ALGOS_TO_RUN: # lasso model = Lasso(alpha=1.0) model.fit(pca_train_X, train_Y) pred_train_Y = model.predict(pca_train_X) pred_valid_Y = model.predict(pca_val_X) fold_train_mae = np.mean(abs(train_Y - pred_train_Y)) fold_val_mae = np.mean(abs(val_Y - pred_valid_Y)) train_Ys['lasso'].append(train_Y) valid_Ys['lasso'].append(val_Y) train_metrics['lasso'].append(fold_train_mae) valid_metrics['lasso'].append(fold_val_mae) train_ensemble1.append(train_Y - pred_train_Y) valid_ensemble1.append(val_Y - pred_valid_Y) if 'xgboost' in ALGOS_TO_RUN: # Xgboost model = XGBRegressor( learning_rate=0.01, # 默认0.3 n_estimators=500, # 树的个数 max_depth=3, # min_child_weight=1, # gamma=0, # subsample=0.8, # colsample_bytree=0.8, # scale_pos_weight=1 ) model.fit(pca_train_X, train_Y) pred_train_Y = model.predict(pca_train_X) pred_valid_Y = model.predict(pca_val_X) fold_train_mae = np.mean(abs(train_Y - pred_train_Y)) fold_val_mae = np.mean(abs(val_Y - pred_valid_Y)) train_Ys['xgboost'].append(train_Y) valid_Ys['xgboost'].append(val_Y) train_metrics['xgboost'].append(fold_train_mae) valid_metrics['xgboost'].append(fold_val_mae) train_ensemble1.append(train_Y - pred_train_Y) valid_ensemble1.append(val_Y - pred_valid_Y) if 'stepwise' in ALGOS_TO_RUN: # stepwise forward selection (by p value) all_feature_indices = set(range(train_X.shape[1])) selected_feature_indices = [0, 1, 2, 3, 4, 5] def mp_get_pvalue(ind): """get p value for newly added feature, which is the last feature""" model = sm.OLS( train_Y, train_X[:, list(selected_feature_indices) + [ind]]).fit() pvalue = model.pvalues[-1] return pvalue def get_pvalue(Y, X): """get p value for newly added feature, which is the last feature""" model = sm.OLS(Y, X).fit() pvalue = model.pvalues[-1] return pvalue while len(selected_feature_indices) < MAX_N_FEATURE_SELECT: unselected_feature_indices = all_feature_indices - set( selected_feature_indices) unselected_feature_indice1_pvalue0 = 100 # some random large p-value selected_feature_index = 0 # some random index # multi-processing (doesn't seem to speed up, moreover, costs a lot more time) # import time # pool = multiprocessing.Pool(4) # unselected_feature_indices_list = list(unselected_feature_indices) # start_time = time.time() # unselected_feature_pvalues = pool.map(mp_get_pvalue, unselected_feature_indices_list) # print("takes {}...".format(time.time() - start_time)) # selected_feature_index = unselected_feature_indices_list[int(np.argmin(unselected_feature_pvalues))] # selected_feature_indices += 
[selected_feature_index] # construct array of pvalues unselected_feature_indices_list = list( unselected_feature_indices) unselected_feature_pvalues = [ get_pvalue( train_Y, train_X[:, list(selected_feature_indices + [ind])]) for ind in tqdm(unselected_feature_indices_list) ] selected_feature_index = unselected_feature_indices_list[int( np.argmin(unselected_feature_pvalues))] selected_feature_indices += [selected_feature_index] model = sm.OLS(train_Y, train_X[:, selected_feature_indices]).fit() pred_train_Y = model.predict(train_X[:, selected_feature_indices]) pred_valid_Y = model.predict(val_X[:, selected_feature_indices]) # print("avg Y: {}, train mae is: {}, val mae is: {}".format(np.mean(Y), # np.mean(abs(train_Y - pred_train_Y)), # np.mean(abs(val_Y - pred_valid_Y)))) fold_train_mae = np.mean(abs(train_Y - pred_train_Y)) fold_val_mae = np.mean(abs(val_Y - pred_valid_Y)) train_Ys['stepwise'].append(train_Y) valid_Ys['stepwise'].append(val_Y) train_metrics['stepwise'].append(fold_train_mae) valid_metrics['stepwise'].append(fold_val_mae) train_ensemble1.append(train_Y - pred_train_Y) valid_ensemble1.append(val_Y - pred_valid_Y) if 'knn' in ALGOS_TO_RUN: model = KNeighborsRegressor(n_neighbors=5) model.fit(train_X, train_Y) pred_train_Y = model.predict(train_X) factor = np.mean([ train_Y[i] / pred_train_Y[i] for i in range(len(pred_train_Y)) ]) pred_train_Y = factor * pred_train_Y pred_valid_Y = factor * model.predict(val_X) fold_train_mae = np.mean(abs(train_Y - pred_train_Y)) fold_val_mae = np.mean(abs(val_Y - pred_valid_Y)) train_Ys['knn'].append(train_Y) valid_Ys['knn'].append(val_Y) train_metrics['knn'].append(fold_train_mae) valid_metrics['knn'].append(fold_val_mae) train_ensemble1.append(train_Y - pred_train_Y) valid_ensemble1.append(val_Y - pred_valid_Y) train_ensemble.append( np.mean( abs( np.sum(np.array(train_ensemble1), axis=0) / len(ALGOS_TO_RUN)))) valid_ensemble.append( np.mean( abs( np.sum(np.array(valid_ensemble1), axis=0) / len(ALGOS_TO_RUN)))) for k in train_metrics: print("{}: avg train_Y: {}, avg val_Y: {}, train mae: {}, val mae: {}". format(k, np.mean(train_Ys[k]), np.mean(valid_Ys[k]), np.mean(train_metrics[k]), np.mean(valid_metrics[k]))) print("ensemble: train mae: {}, val mae: {}".format( np.mean(train_ensemble), np.mean(valid_ensemble))) algos = [k for k in train_metrics ] + ['ensemble-{}'.format('-'.join(train_metrics.keys()))] algos_scores = [np.mean(valid_metrics[k]) for k in train_metrics] + [np.mean(valid_ensemble)] return algos[int(np.argmin(algos_scores))]
def __init__(self, d):
    self.linear_reg = linear_model.Ridge()
    self.xgb_reg = XGBRegressor(max_depth=7)
    self.d = d
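# The rest of the class that owns the __init__ above is not shown in this excerpt. Purely as
# an illustration of how a Ridge/XGBRegressor pair like the one initialised there is often
# combined, a standalone sketch (the helper name and the 50/50 blend are assumptions):
from sklearn import linear_model
from xgboost.sklearn import XGBRegressor


def fit_and_blend(X_train, y_train, X_test, blend_weight=0.5):
    linear_reg = linear_model.Ridge()
    xgb_reg = XGBRegressor(max_depth=7)
    linear_reg.fit(X_train, y_train)
    xgb_reg.fit(X_train, y_train)
    # simple convex blend of the two model outputs
    return blend_weight * linear_reg.predict(X_test) + (1 - blend_weight) * xgb_reg.predict(X_test)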
def get_best_model(self): rgr = None params = None if self.model_name_ == "xgboost": rgr = XGBRegressor(colsample_bytree=0.2, gamma=0.0, learning_rate=0.5, max_depth=6, min_child_weight=1.5, n_estimators=2000, reg_alpha=0.9, reg_lambda=0.6, subsample=0.2, seed=42, silent=1) #rgr = XGBRegressor(learning_rate=0.1, n_estimators=140, max_depth=6, min_child_weight=1, gamma=0, subsample=0.8, # colsample_bytree=0.8, objective="reg:squarederror", scale_pos_weight=1, seed=10) params = {} elif self.model_name_ == "lightgbm": rgr = LGBMRegressor(boosting_type='gbdt', objective='regression', n_estimators=300, metric={'l2', 'l1'}, num_leaves=31, learning_rate=0.05, feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5, verbose=0) params = {} elif self.model_name_ == "svr": rgr = SVR( kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=1e-3, C=1.0, #epsilon=0.1, epsilon=1.0, shrinking=True, cache_size=200, verbose=False, max_iter=-1) params = {} X_train, X_train_tmp, y_train, y_train_tmp = None, None, None, None for i in range(1, 5): file_name = "train_{0}.csv".format(i) X_train_tmp, y_train_tmp = HandlerFeatures( file_name).get_train_features() if X_train is None: X_train = X_train_tmp[:] y_train = y_train_tmp[:] else: X_train = np.append(X_train, X_train_tmp, axis=0) y_train = np.append(y_train, y_train_tmp, axis=0) grid = GridSearchCV(estimator=rgr, param_grid=params, cv=5, scoring=None, iid=False, n_jobs=-1) grid.fit(X_train, y_train) self.best_model_ = grid.best_estimator_
xgb_params = {
    'max_depth': 100,
    'random_state': 10,
    'n_estimators': 1500,
    'learning_rate': 0.1,
    'silent': False,
    'booster': 'gbtree',
    'min_child_weight': 57,
    'gamma': 1.45,
    'alpha': 0.0,
    'subsample': 0.67,
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.5,
    'metric': 'rmse'
}

model = CrossValRegressor(XGBRegressor(**xgb_params), n_split=10)
_, _ = model.fit(X_train.values, np.log1p(Y_train.values),
                 X_val.values, np.log1p(Y_val.values),
                 eval_metric=RMSE)
model.save_models('test_regressionxgbcv.pkl')
del model

with open('test_regressionxgbcv.pkl', 'rb') as f:
    model = pickle.load(f)
preds = model.predict(X_val.values)
print(RMSE(np.log1p(Y_val.values), preds))

model_single = XGBRegressor(**xgb_params)
model_single.fit(X_train, np.log1p(Y_train),
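# CrossValRegressor is a project-specific wrapper that is not defined in this snippet.
# The class below is only a sketch of what such a wrapper might look like, matching the
# interface used above (fit returns out-of-fold predictions and per-fold scores, predict
# averages the fold models, save_models pickles the whole object). It is an assumption,
# not the original implementation.
import pickle
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

class CrossValRegressor:
    def __init__(self, estimator, n_split=5):
        self.estimator = estimator
        self.n_split = n_split
        self.models_ = []

    def fit(self, X_train, y_train, X_val, y_val, eval_metric=None):
        oof = np.zeros(len(y_train))
        scores = []
        splitter = KFold(n_splits=self.n_split, shuffle=True, random_state=0)
        for tr_idx, te_idx in splitter.split(X_train):
            model = clone(self.estimator)
            model.fit(X_train[tr_idx], y_train[tr_idx])
            oof[te_idx] = model.predict(X_train[te_idx])
            if eval_metric is not None:
                scores.append(eval_metric(y_val, model.predict(X_val)))
            self.models_.append(model)
        return oof, scores

    def predict(self, X):
        # average the predictions of the per-fold models
        return np.mean([m.predict(X) for m in self.models_], axis=0)

    def save_models(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)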
from xgboost.sklearn import XGBRegressor

params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'max_depth': 12,
    'min_child_weight': 1,
    'reg_alpha': 1,
    'gamma': 0
}
regressor = XGBRegressor(learning_rate=params['learning_rate'],
                         n_estimators=params['n_estimators'],
                         booster='gbtree',
                         objective='reg:linear',
                         n_jobs=-1,
                         subsample=params['subsample'],
                         colsample_bytree=params['colsample_bytree'],
                         random_state=0,
                         max_depth=params['max_depth'],
                         gamma=params['gamma'],
                         min_child_weight=params['min_child_weight'],
                         reg_alpha=params['reg_alpha'])
regressor.fit(X_train, y_train,
              verbose=True,
              early_stopping_rounds=10,
              eval_set=eval_set)

# use the trained model to predict on the test data
test_df['prediction'] = regressor.predict(test_df[train_feature].values)
test_df.iloc[:5]

df2 = pd.merge(df2, test_df[['date', 'prediction']], on=['date'], how='left')
df2.iloc[-5:]

# calculate the final speed
df2['imputationa1'] = df2['speed'].isnull()
df2['speed'] = df2['speed'].fillna(value=df2['prediction'])
def run_find(x_train, y_train, i, x_predict):
    # Find a suitable number of estimators before tuning the other parameters
    clf = XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,     # [default 0.3] lowering it reduces overfitting; typical values are 0.01-0.2
        gamma=0,               # a node is split only if the split reduces the loss; gamma is the minimum
                               # required loss reduction, and larger values make the algorithm more conservative
        subsample=0.8,         # row subsampling ratio, 0.5-1; too small underfits, too large overfits
        colsample_bytree=0.8,  # fraction of features used when building each tree
        reg_alpha=1,           # [default 1] L1 regularization term on the weights
        reg_lambda=1,          # [default 1] L2 regularization term on the weights
        max_depth=10,          # [default 6] maximum tree depth, typically 3-10; also controls overfitting
        min_child_weight=1,    # [default 1] minimum sum of instance weights in a child; larger values keep the
                               # model from fitting overly local patterns, but too large a value underfits
    )
    nums, fscore = modelfit(clf, x_train, y_train, cv_folds=5,
                            early_stopping_rounds=50, feval=evalerror)
    print('test_estimators:', nums)
    clf.set_params(n_estimators=nums)

    # 1. Tune max_depth and min_child_weight first, the two most important parameters
    ## coarse search:
    param_test1 = {
        'max_depth': [i for i in range(3, 12, 2)],
        'min_child_weight': [i for i in range(1, 10, 2)]
    }
    best_params, best_score = find_params(param_test1, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)
    ## fine search:
    max_d = best_params['max_depth']
    min_cw = best_params['min_child_weight']
    param_test2 = {
        'max_depth': [max_d - 1, max_d, max_d + 1],
        'min_child_weight': [min_cw - 1, min_cw, min_cw + 1]
    }
    best_params, best_score = find_params(param_test2, clf, x_train, y_train)
    clf.set_params(max_depth=best_params['max_depth'],
                   min_child_weight=best_params['min_child_weight'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 2. Tune gamma
    ## coarse search:
    param_test3 = {'gamma': [i / 10.0 for i in range(0, 10, 2)]}
    best_params, best_score = find_params(param_test3, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)
    ## fine search:
    b_gamma = best_params['gamma']
    param_test4 = {'gamma': [b_gamma, b_gamma + 0.1, b_gamma + 0.2]}
    best_params, best_score = find_params(param_test4, clf, x_train, y_train)
    clf.set_params(gamma=best_params['gamma'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 3. Tune subsample and colsample_bytree
    ## coarse search
    param_test5 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    best_params, best_score = find_params(param_test5, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)
    ## fine search
    b_subsample = best_params['subsample']
    b_colsample_bytree = best_params['colsample_bytree']
    param_test6 = {
        'subsample': [b_subsample - 0.05, b_subsample, b_subsample + 0.05],
        'colsample_bytree': [b_colsample_bytree - 0.05, b_colsample_bytree,
                             b_colsample_bytree + 0.05]
    }
    best_params, best_score = find_params(param_test6, clf, x_train, y_train)
    clf.set_params(subsample=best_params['subsample'],
                   colsample_bytree=best_params['colsample_bytree'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 4. Tune reg_alpha and reg_lambda
    ## coarse search
    param_test7 = {
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 2],
        'reg_lambda': [1e-5, 1e-2, 0.1, 1, 2]
    }
    best_params, best_score = find_params(param_test7, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)
    ## fine search
    b_alp = best_params['reg_alpha']
    b_lam = best_params['reg_lambda']
    param_test8 = {
        'reg_alpha': [b_alp, 2 * b_alp, 3 * b_alp],
        'reg_lambda': [b_lam, 2 * b_lam, 3 * b_lam]
    }
    best_params, best_score = find_params(param_test8, clf, x_train, y_train)
    clf.set_params(reg_alpha=best_params['reg_alpha'],
                   reg_lambda=best_params['reg_lambda'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 5. Lower learning_rate and increase the number of boosting rounds
    clf.set_params(learning_rate=0.01)
    nums, fscore = modelfit(clf, x_train, y_train, cv_folds=5,
                            early_stopping_rounds=50, feval=evalerror)
    clf.set_params(n_estimators=nums)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_predict)
    return y_predict, fscore
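# find_params and modelfit are helpers defined elsewhere in the project. The function below
# is only a sketch of what find_params might look like, assuming it wraps GridSearchCV and
# returns the best parameter dict together with its cross-validated score; the scoring
# choice is an assumption made for illustration.
from sklearn.model_selection import GridSearchCV

def find_params(param_grid, estimator, x_train, y_train):
    grid = GridSearchCV(estimator=estimator,
                        param_grid=param_grid,
                        scoring='neg_mean_absolute_error',
                        cv=5,
                        n_jobs=-1)
    grid.fit(x_train, y_train)
    return grid.best_params_, grid.best_score_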
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from xgboost.sklearn import XGBRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

data = pd.read_csv('train.csv')
data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

# select XGBRegressor
my_model = XGBRegressor(n_estimators=750, learning_rate=0.02)

# make pipeline
my_pipeline = make_pipeline(SimpleImputer(), my_model)

scores = cross_val_score(my_pipeline, X, y,
                         scoring='neg_mean_absolute_error', cv=5, n_jobs=2)
print('Mean Absolute Error %.2f' % (-1 * scores.mean()))
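# The snippet above keeps only numeric columns. As a sketch (not part of the original),
# the dropped object columns could instead be imputed and one-hot encoded inside the same
# pipeline with a ColumnTransformer, so cross-validation can show whether they help.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X_full = data.drop(['SalePrice'], axis=1)
numeric_cols = X_full.select_dtypes(exclude=['object']).columns
categorical_cols = X_full.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer([
    ('num', SimpleImputer(), numeric_cols),
    ('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
                          OneHotEncoder(handle_unknown='ignore')), categorical_cols),
])
pipeline_full = make_pipeline(preprocessor,
                              XGBRegressor(n_estimators=750, learning_rate=0.02))
scores_full = cross_val_score(pipeline_full, X_full, y,
                              scoring='neg_mean_absolute_error', cv=5, n_jobs=2)
print('Mean Absolute Error (with categoricals) %.2f' % (-1 * scores_full.mean()))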
Y_pred = rfr.predict(X_test)
print("Accuracy:", rfr.score(X_test, Y_test))
rootsqerr = sqrt(mean_squared_error(Y_test, Y_pred))
print("RMSE:", rootsqerr)

####################################
# XGBoostRegressor
####################################
print("XGBoost Regressor...")
xgb1 = XGBRegressor()
parameters = {
    'nthread': [1, 2, 3, 4],
    'objective': ['reg:linear'],
    'learning_rate': [.03, 0.05, .07],
    'max_depth': [5, 6, 7],
    'min_child_weight': [4],
    'silent': [1],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [100, 200, 500]
}
xgb_grid = GridSearchCV(xgb1, parameters, cv=5, n_jobs=5, verbose=True)
xgb_grid.fit(X_train, Y_train)
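# A short follow-up sketch (assumed, not from the original): once the grid search has
# finished, read back the best configuration and score it on the same held-out test set
# used for the random-forest baseline above.
print("Best CV score:", xgb_grid.best_score_)
print("Best parameters:", xgb_grid.best_params_)
best_xgb = xgb_grid.best_estimator_
Y_pred_xgb = best_xgb.predict(X_test)
print("XGB test R^2:", best_xgb.score(X_test, Y_test))
print("XGB test RMSE:", sqrt(mean_squared_error(Y_test, Y_pred_xgb)))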
    dense1_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer('fc1').output)
    model.summary()
    return model, dense1_layer_model


# CNN model
model, dense1_layer_model = get_model()

# CNN-extracted features + gradient-boosted regressor
dense1_train = dense1_layer_model.predict(X_tr)
nn_train = pd.DataFrame(dense1_train,
                        columns=['nn_%d' % column for column in range(10)])
dense1_test = dense1_layer_model.predict(X_te)
nn_test = pd.DataFrame(dense1_test,
                       columns=['nn_%d' % column for column in range(10)])
clf = XGBRegressor().fit(nn_train, y_train)
y_pre = clf.predict(nn_test)
print('CNN features + XGB:')
print(mean_squared_error(y_pre, y_test))

# original features
# =================================================================
def get_original_feature(train, test, seq_len):
    predictor = [column for column in train.columns
                 if column not in ['year', 'lunar_year', 'lunar_xun', 'djz',
                                   'date', 'count1', 'virtual_date']]
    train = train[predictor]
    test = test[predictor]
    train = train[seq_len:]
    test = test[seq_len:]
labels = labels[marks]
# x_test = loadtxt('train_feature.txt', delimiter=' ')
seed = 10
test_size = 0.3
max_idx = list()
max_score = 0
for i in range(1, 7):
    sub_idxs = list(combinations(idx, i))
    for sub_idx in sub_idxs:
        # print(list(sub_idx), raw_x.shape)
        x = raw_x[:, list(sub_idx)]
        # print(len(x))
        x_train, x_test, y_train, y_test = train_test_split(
            x, labels, test_size=test_size, random_state=seed)
        # model = XGBClassifier(learning_rate=0.01,
        #                       # seed=seed,
        #                       max_depth=10,
        #                       silent=1)
        model = XGBRegressor(max_depth=10)
        model.fit(x_train, y_train)
        y_pre = model.predict(x_test)
        predictions = [round(value) for value in y_pre]
        accuracy = accuracy_score(y_test, predictions)
        if accuracy > max_score:
            max_score = accuracy
            max_idx = np.copy(sub_idx)
print(max_score, max_idx)
warnings.filterwarnings('ignore')

# 1. Data
dataset = load_diabetes()
x = dataset.data
y = dataset.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=104)

kfold = KFold(n_splits=5, shuffle=True)  # (shuffle=False: sequential splits)

st = datetime.datetime.now()

parameters = [
    {'n_estimators': [100, 200, 300], 'learning_rate': [0.1, 0.3, 0.001, 0.01],
     'max_depth': [4, 5, 6]},
    {'n_estimators': [90, 100, 110], 'learning_rate': [0.1, 0.001, 0.01],
     'max_depth': [4, 5, 6], 'colsample_bytree': [0.6, 0.9, 1]},
    {'n_estimators': [100, 110], 'learning_rate': [0.1, 0.5, 0.001],
     'max_depth': [4, 5, 6], 'colsample_bytree': [0.6, 0.9, 1],
     'colsample_bylevel': [0.6, 0.7, 0.9]}
]

# 2. Build the model
model = RandomizedSearchCV(XGBRegressor(eval_metric='rmse'),  # rmse: regression metric (mlogloss is for classification)
                           parameters, cv=kfold)
score = cross_val_score(model, x_train, y_train, cv=kfold)
print(score)

et = datetime.datetime.now()
print(et - st)
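# Sketch of an alternative (assumed, not in the original): fit the RandomizedSearchCV
# object directly instead of nesting it inside cross_val_score, then read back the best
# configuration and score it on the held-out split created above.
model.fit(x_train, y_train)
print("best parameters:", model.best_params_)
print("best CV score  :", model.best_score_)
print("test R^2       :", model.best_estimator_.score(x_test, y_test))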
def save_result(data, columns):
    """
    :param data: np.array
    :param columns: list
    """
    index = []
    for i in range(len(data)):
        index.append(i + 1461)
    data = np.column_stack((index, data))
    df = pd.DataFrame(data, columns=columns)
    df['Id'] = df['Id'].astype('int')
    df.to_csv("result.csv", index=False)


# RF Model
rf = XGBRegressor()
# rf = RandomForestRegressor()
for i in range(Config.max_step + 1):
    # features, labels = train.next_batch(Config.batch_size)
    rf.fit(train.df, train.labels)
    if i % 20 == 0:
        print("Step: %d" % i)
        _pre = rf.predict(test.df)
        N = len(_pre)
        print(np.c_[_pre, result['SalePrice']])
        # root-mean-square of the log10 price ratios
        print(np.sqrt(np.sum(np.log10(result['SalePrice'] / _pre) ** 2) / N))

result = rf.predict(test.df)
save_result(result, ['Id', 'SalePrice'])
error_df[error_df.pred.isnull()]
pd.Series(rf_age_valid_pred).isnull().sum()
statistics.mean(error_df.sqerr)
len(rf_age_valid_pred)
len(age_valid_Y)
pd.crosstab(pd.Series(rf_age_valid_pred).apply(lambda x: round(x)), age_valid_Y)

## XGB for age prediction
from xgboost.sklearn import XGBRegressor
xgb = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=100,
                   objective='reg:linear', subsample=0.5,
                   colsample_bytree=0.5, seed=321)
eval_set = [(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y),
            (mvt_valid_X.drop(['age', 'gender'], axis=1), age_valid_Y)]
xgb.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y,
        eval_set=eval_set, eval_metric='rmse',
        early_stopping_rounds=10, verbose=1)
xgb_age_valid_pred = xgb.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))

## ADAboost for age prediction
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(n_estimators=50, learning_rate=0.1, loss='linear',
                        random_state=321)
ada.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y.values)
ada_age_valid_pred = ada.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))
len(ada_age_valid_pred)
len(age_valid_Y)
    print("Test RMSE : %.4g" % (mean_squared_error(y_test.values, dtest_prediction) ** 0.5))

    # feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importance')
    # plt.ylabel('Feature Importance Score')
    # plt.show()
    plot_importance(alg)
    plt.show()
    importances = alg.feature_importances_
    return dtrain_prediction, dtest_prediction


# initialize the model
xgb0 = XGBRegressor(random_state=10, importance_type='gain')
start = time()
dtrain_prediction, dtest_prediction = modelfit(xgb0, X_train, y_train, X_val, y_val)
end = time()
print("the model fit time: %.4f" % (end - start))

train_out = pd.DataFrame(list(zip(y_train.values.flatten(), pd.Series(dtrain_prediction))),
                         index=y_train.index, columns=['y_true', 'y_pred'])
test_out = pd.DataFrame(list(zip(y_val.values.flatten(), pd.Series(dtest_prediction))),
                        index=y_val.index, columns=['y_true', 'y_pred'])

# step 4: feature selection
model = SelectFromModel(xgb0, prefit=True)
selection_X_train = model.transform(X_train)
selection_X_val = model.transform(X_val)
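# Sketch of the follow-up step (assumed, not from the original): retrain the regressor on
# the features kept by SelectFromModel and compare validation RMSE with the full-feature
# model fitted above.
from sklearn.metrics import mean_squared_error

selection_model = XGBRegressor(random_state=10, importance_type='gain')
selection_model.fit(selection_X_train, y_train)
selection_pred = selection_model.predict(selection_X_val)
print("n features kept: %d" % selection_X_train.shape[1])
print("selected-feature val RMSE: %.4g" % (mean_squared_error(y_val, selection_pred) ** 0.5))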
target_raw = boston.target
X_train, X_test, y_train, y_test = train_test_split(data_raw, target_raw,
                                                    test_size=0.1, random_state=33)
feature_names = boston.feature_names

# =============================================================================
# =============================================================================
# Build and train the model
# wrapped in the scikit-learn style API
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
xgb = XGBRegressor()
xgb.fit(
    X_train, y_train,
    early_stopping_rounds=10,  # stop early when the metric has not improved for n rounds
    eval_set=[(X_train, y_train), (X_test, y_test)],  # evaluation sets monitored during training
    eval_metric=['rmse']  # evaluation metric: 'error', 'logloss', 'rmse', 'auc', etc.
    # http://xgboost.readthedocs.io/en/latest/parameter.html
)

# =============================================================================
# =============================================================================
# Predict and evaluate
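# The snippet ends at the "Predict and evaluate" header; below is only a minimal sketch of
# the step it announces (assumed, not the original code): predict on the held-out split and
# report RMSE and R^2.
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = xgb.predict(X_test)
print("test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("test R^2 :", r2_score(y_test, y_pred))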
split_num = int(38070 * 0.95)
load1_train = load1.iloc[0:split_num, :]
load1_test = load1.iloc[split_num:38070, :]
load1_test2 = load1[37089 - 48:37089]

train_x, test_x, train_y, test_y = train_test_split(load1_train.drop('load1', axis=1),
                                                    load1_train['load1'],
                                                    train_size=0.9, random_state=133)
Dtrain = xgb.DMatrix(train_x, train_y)  # needed below by xgb.cv
Dtest = xgb.DMatrix(test_x, test_y)

####################################################### CV-based parameter tuning
xgb1 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:gamma',
    nthread=4,
    scale_pos_weight=1,
    seed=1024)

##### parameter 1: max_depth
xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param, Dtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics='rmse', early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])
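# Sketch of how the max_depth / min_child_weight step announced above might proceed
# (assumed, not the original code): grid search around the defaults with the estimator
# count fixed from the xgb.cv result.
from sklearn.model_selection import GridSearchCV

param_test1 = {'max_depth': range(3, 10, 2),
               'min_child_weight': range(1, 6, 2)}
gsearch1 = GridSearchCV(estimator=xgb1, param_grid=param_test1,
                        scoring='neg_mean_squared_error', cv=5, n_jobs=4)
gsearch1.fit(train_x, train_y)
print(gsearch1.best_params_, gsearch1.best_score_)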