import numpy as np
from sklearn import linear_model
from xgboost import XGBRegressor


class my_model:
    def __init__(self, d):
        self.linear_reg = linear_model.Ridge()
        self.xgb_reg = XGBRegressor(max_depth=7)
        self.d = d  # mapping from categorical strings in the last column to numeric codes

    def fit(self, X, y):
        # Fit the linear model on the first feature, then fit XGBoost on the residuals.
        self.linear_reg.fit(X[:, 0].reshape(-1, 1), y)
        self.l_reg_res = self.linear_reg.predict(X[:, 0].reshape(-1, 1))
        self.xgb_reg.fit(X[:, 1:], y - self.l_reg_res)
        # Return the original features augmented with both models' predictions.
        X_nn = np.hstack([
            X,
            self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1),
            self.l_reg_res.reshape(-1, 1)
        ])
        return X_nn

    def predict(self, X):
        # Encode a categorical last column, if present, using the supplied mapping.
        if isinstance(X[0, -1], str):
            for i in range(X.shape[0]):
                X[i, -1] = self.d[X[i, -1]]
        X = X.astype(np.float64, copy=False)
        X_nn_final = np.hstack([
            X,
            self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1),
            # Use the same column (index 0) the linear model was fitted on.
            self.linear_reg.predict(X[:, 0].reshape(-1, 1)).reshape(-1, 1)
        ])
        return X_nn_final
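# Usage sketch for my_model (hypothetical data, not from the original project):
# fit() expects an all-numeric feature matrix and returns the augmented features,
# while predict() additionally accepts a categorical last column encoded via d.
d = {'low': 0.0, 'high': 1.0}  # hypothetical category -> code mapping
rng = np.random.default_rng(0)
X_num = rng.normal(size=(200, 3))
y = 2.0 * X_num[:, 0] + rng.normal(scale=0.1, size=200)
m = my_model(d)
X_nn_train = m.fit(X_num, y)         # original features + xgb and ridge predictions
X_nn_test = m.predict(X_num.copy())  # same augmentation at inference time
print(X_nn_train.shape, X_nn_test.shape)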
def train_xg_boost(params):
    # Hyperopt objective: train on the global train split, score RMSE on the validation split.
    xg_model = XGBRegressor(n_estimators=int(params['n_estimators']),
                            learning_rate=params['eta'],
                            n_jobs=-1,
                            max_depth=int(params['max_depth']),
                            gamma=params['gamma'],
                            colsample_bytree=params['colsample_bytree'],
                            min_child_weight=params['min_child_weight'],
                            reg_alpha=params['xg_reg_alpha'],
                            subsample=params['subsample'],
                            reg_lambda=params['xg_reg_lambda'])
    xg_model.fit(X_train.values, y_train.values)

    training_values = xg_model.predict(X_train.values)
    print(training_values)
    training_rmse = math.sqrt(mean_squared_error(y_train, training_values))
    print("training_rmse", training_rmse)

    validation_values = xg_model.predict(X_validtn.values)
    validation_rmse = math.sqrt(mean_squared_error(y_validtn, validation_values))
    print("validation_rmse", validation_rmse)

    """test_submission = pd.DataFrame()
    test_submission["Score"] = xg_model.predict(combined_test_data)
    test_submission.to_excel('submission4.xlsx', index=False)"""

    return {'loss': validation_rmse, 'status': STATUS_OK, 'eval_time': time.time()}
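# Minimal hyperopt driver for train_xg_boost above. This is a sketch: the search-space
# bounds are illustrative assumptions, not values taken from the original tuning run,
# and it relies on the same global train/validation splits the objective uses.
from hyperopt import fmin, tpe, hp, Trials

space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'eta': hp.loguniform('eta', -5, -1),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'xg_reg_alpha': hp.loguniform('xg_reg_alpha', -5, 2),
    'xg_reg_lambda': hp.loguniform('xg_reg_lambda', -5, 2),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
}

trials = Trials()
best = fmin(fn=train_xg_boost, space=space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)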
def xgb_regression(X_train, y_train, X_val, y_val, X_test, y_test, args):
    if y_test.shape[-1] == 1:
        model = XGBRegressor(learning_rate=0.1,
                             max_depth=4,
                             min_child_weight=10,
                             gamma=1,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             reg_alpha=0.8,
                             objective='reg:linear',
                             n_estimators=2000,
                             tree_method='gpu_hist',
                             n_gpus=-1)
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='rmse',
                  early_stopping_rounds=300)
        y_pred = model.predict(X_test)
        y_test = y_test.astype('float')
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = MSE ** 0.5
        return RMSE
    else:
        # Multi-target case: fit one regressor per target column and average the RMSEs.
        RMSEs = []
        if len(y_train.shape) == 3:
            y_train = [x[0] for x in y_train]
            y_val = [x[0] for x in y_val]
            y_test = [x[0] for x in y_test]
        y_train = pd.DataFrame(y_train)
        y_val = pd.DataFrame(y_val)
        y_test = pd.DataFrame(y_test)
        for i in range(y_test.shape[1]):
            # Skip degenerate all-zero target columns.
            if float(max(y_val[i])) == 0 or float(max(y_train[i])) == 0 or float(max(y_test[i])) == 0:
                continue
            model = XGBRegressor(learning_rate=0.1,
                                 max_depth=4,
                                 min_child_weight=10,
                                 gamma=1,
                                 subsample=0.8,
                                 colsample_bytree=0.8,
                                 reg_alpha=0.8,
                                 objective='reg:linear',
                                 n_estimators=2000,
                                 tree_method='gpu_hist',
                                 n_gpus=-1)
            model.fit(X_train, [float(k) for k in y_train[i]],
                      eval_set=[(X_val, [float(k) for k in y_val[i]])],
                      eval_metric='rmse',
                      early_stopping_rounds=300)
            y_pred = model.predict(X_test)
            y_test = y_test.astype('float')
            MSE = mean_squared_error(y_test[i], y_pred)
            RMSE = MSE ** 0.5
            RMSEs.append(RMSE)
        return np.mean(RMSEs)
def main(): qresult = connect_db('solar.db', 'dip') smiles, compounds, gaps = get_data(qresult) mols = get_mols(smiles) fps_morgan, failed_mols = get_fingerprints(mols) refine_compounds(compounds, mols, gaps, failed_mols) compound_array = np.array(compounds) gaps_array = np.array(gaps) train_id, test_id, y_train, y_test = train_test_split(compound_array, gaps_array, test_size=0.20, random_state=0) train_fps = get_fp_from_id(compounds, fps_morgan, train_id) test_fps = get_fp_from_id(compounds, fps_morgan, test_id) xgb1 = XGBRegressor(n_estimators=2000, learning_rate=0.03, max_depth=7, colsample_bytree=0.6, nthread=8, scale_pos_weight=1, gamma=0, random_state=0, subsample=0.6, min_child_weight=3, early_stopping_rounds=10, reg_alpha=1) modelfit(xgb1, train_fps, y_train) #xgb1 = joblib.load('gbdt_dip_xgb.joblib') #joblib.dump(xgb1, 'gbdt_dip_xgb2.joblib') y_pred_cv = cvp(xgb1, train_fps, y_train, cv=4, n_jobs=8) y_train_pred = xgb1.predict(train_fps) y_pred_test = xgb1.predict(test_fps) train_df = pd.DataFrame() test_df = pd.DataFrame() train_df['id'] = pd.Series(train_id) train_df['dip_exp'] = pd.Series(y_train) train_df['dip_cv'] = pd.Series(y_pred_cv) train_df['dip_gbdt'] = pd.Series(y_train_pred) train_df['Group'] = 'Train' test_df['id'] = pd.Series(test_id) test_df['dip_exp'] = pd.Series(y_test) test_df['dip_cv'] = pd.Series(y_pred_test) test_df['dip_gbdt'] = pd.Series(y_pred_test) test_df['Group'] = 'Test' result_df = pd.concat([train_df, test_df]) result_df.to_csv('dip_xgb_train_test.csv') test_err = mean_squared_error(y_pred_test, y_test) print('Test error: {:4f}'.format(np.sqrt(test_err)))
class XGBRegressorMetaPrim(primitive): def __init__(self, random_state=0): super(XGBRegressorMetaPrim, self).__init__(name='XGBRegressorMeta') self.hyperparams = [] self.type = 'ensemble' self.description = "XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable." self.hyperparams_run = {'default': True} self.random_state = random_state self.model = XGBRegressor(random_state=self.random_state, n_jobs=5) self.accept_type = 'xgb' def can_accept(self, data): return self.can_accept_c(data, 'Regression') def is_needed(self, data): # data = handle_data(data) return True def fit(self, data): data = handle_data(data) self.model.fit(data['X'], data['Y']) def produce(self, data): output = handle_data(data) output['predictions'] = self.model.predict(output['X']) output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"]) final_output = {0: output} return final_output
def xgt_regressor(lr, max_d, estimators, X_train, X_test, y_train, y_test, obj): rmse = 10000 for i in lr: for j in max_d: for k in estimators: clf=XGBRegressor(learning_rate=i, n_estimators=k, max_depth=j, min_child_weight=1, gamma=1, subsample=0.5, colsample_bytree=0.8, objective=obj, nthread=4, scale_pos_weight=1, missing=np.nan) clf.fit(X_train,y_train) y_pred=clf.predict(X_test) a,b,c = pred_eval(y_pred,y_test) if a < rmse: b_lr = i b_d = j b_e = k rmse = a clf_b = clf return clf_b, (b_lr, b_d, b_e)
def xgboostmodel(self):
    df = pd.read_csv(datafile, encoding='utf-8', index_col=0)
    print(df.shape)
    traindata = df.iloc[:, :].values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)

    # Use the configured hyperparameters, falling back to defaults.
    if self.params is None:
        params = {'max_depth': 80, 'n_estimators': 512}
    else:
        params = self.params

    raw_model = XGBRegressor(learning_rate=0.01, silent=False, **params)
    raw_model.fit(x_train, y_train)
    raw_model.save_model(self.model_file)

    pred = raw_model.predict(x_test)
    self.true = y_test
    self.pred = pred
    self.show_save_figure(fig_path=self.fig_path, modelname=self.job_name, detal_idx=500)
    t_mean = self.cal_mean(self.true)
    p_mean = self.cal_mean(self.pred)
    self.save_result(self.result_path, true_mean=t_mean, pred_mean=p_mean)
def XGB_reg_evaluation(individual, evaluation_method='roll_win'):
    '''
    evaluation_method : can be roll_win, mse
    '''
    if evaluation_method == 'roll_win':
        trainNumber = individual[6]  # number of training points per rolling window
        param = {'eta': individual[0],
                 'silent': True,
                 'objective': "reg:linear",
                 'nthread': -1,
                 'min_child_weight': individual[1],
                 'max_depth': individual[2],
                 'subsample': individual[3],
                 'colsample_bylevel': individual[4],
                 'seed': 0}
        roll_win_mseValue = 0
        for i in range(N_validation):
            trainingX = trainX[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window), :]
            trainingY = trainY[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window)]
            testingX = trainX[(trainNum - (i + 1) * window):(trainNum - i * window), :]
            testingY = trainY[(trainNum - (i + 1) * window):(trainNum - i * window)]
            dtrain = xgb.DMatrix(data=trainingX, label=trainingY)
            bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=individual[5])
            testingX = xgb.DMatrix(testingX)
            roll_win_mseValue += sum((testingY - bst.predict(testingX)) ** 2) / window
        roll_win_mseValue /= N_validation
        return (roll_win_mseValue, )

    if evaluation_method == 'mse':
        # Cross-validation evaluation
        N_SPLITS = N_splits
        kf = KFold(n_splits=N_SPLITS)
        cv_mseValue = 0
        fc = XGBRegressor(learning_rate=individual[0],
                          n_estimators=individual[5],
                          silent=True,
                          objective="reg:linear",
                          nthread=-1,
                          gamma=0,
                          min_child_weight=individual[1],
                          max_depth=individual[2],
                          subsample=individual[3],
                          colsample_bylevel=individual[4],
                          seed=0)
        for train, test in kf.split(trainX):
            fc.fit(trainX[train, :], trainY[train])
            cv_mseValue += sum((trainY[test] - fc.predict(trainX[test, :])) ** 2) / len(test)
        cv_mseValue = cv_mseValue / N_SPLITS
        return (cv_mseValue, )

    print("There is no evaluation method for %s" % evaluation_method)
    raise Exception("evaluation_method is not valid")
def cbd_model(cbd_df,cbd_finalinput): ''' function that creates model from the cbd dataframe and returns the predicted number of crimes for the next three days ''' X_cbd=cbd_df[['year', 'month', 'day', 'tmax', 'tmin', 'consumer_price_index', 'gdp_millions_2007', 'seasonally_adjusted_unemployment', 'unadjusted_unemployment', 'Possession, cocaine ', 'Heroin, possession ', 'Heroin Price Canada', 'day_segment_1200pm-1159pm', 'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday']] y_cbd=cbd_df['number_of_crimes'] scaler = StandardScaler() scaler.fit(X_cbd) # Don't cheat - fit only on training data X_cbd = scaler.transform(X_cbd) cbd_input_scaled = scaler.transform(cbd_finalinput) xgb=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1) xgb.fit(X_cbd,y_cbd) predict_cbd=xgb.predict(cbd_input_scaled) return predict_cbd
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        xgb_base = XGBRegressor(objective='reg:linear', n_estimators=ntree,
                                random_state=1234, silent=0, booster='gbtree',
                                eval_metric='rmse')
        rmse_t_1, rmse_v_1 = [], []
        print('current ntree = %s' % ntree)
        for train, test in get_cv(y=y_train, n_splits=5, random_state=42):
            X_t, y_t = X_train[train], y_train[train]
            X_v, y_v = X_train[test], y_train[test]
            xgb_base.fit(X_t, y_t)
            y_t_pre = xgb_base.predict(X_t)
            y_v_pre = xgb_base.predict(X_v)
            rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
            rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
            rmse_t_1.append(rmse_t_each)
            rmse_v_1.append(rmse_v_each)
        rmse_t = np.mean(rmse_t_1)
        rmse_v = np.mean(rmse_v_1)
        rmse_t_total.append(rmse_t)
        rmse_v_total.append(rmse_v)
    return rmse_t_total, rmse_v_total
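# Sketch: plot the curves returned by get_ntree() to pick n_estimators.
# The matplotlib usage and labels are illustrative additions, not part of the original.
import matplotlib.pyplot as plt

rmse_t_total, rmse_v_total = get_ntree()
ntrees = list(range(10, 500, 10))
plt.plot(ntrees, rmse_t_total, label='train RMSE')
plt.plot(ntrees, rmse_v_total, label='validation RMSE')
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend()
plt.show()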
def xgboost_single_pred(self): x_train = self.x_train y_train = self.y_train x_test = self.x_test y_test = self.y_test self.y_pred_all_xgb = [] y_train = list(y_train) xgboost_clf = XGBRegressor(learning_rate=0.1, n_estimators=75) for i in range(len(x_test)): xgboost_clf.fit(x_train, y_train) x_test_one = x_test.iloc[i:i + 1] y_test_one = xgboost_clf.predict(x_test_one) self.y_pred_all_xgb.append(list(y_test_one)[0]) x_train = x_train.append(x_test_one) y_train.append(y_test[i]) xgboost_mse = mean_squared_error(self.y_test, self.y_pred_all_xgb) xgboost_rmse = np.sqrt(xgboost_mse) y_pred_all_xgb = pd.DataFrame(list(self.y_pred_all_xgb)) ratio_single_xgb = pd.DataFrame(list(self.y_test)) / y_pred_all_xgb return xgboost_rmse, y_pred_all_xgb, ratio_single_xgb
def subfeat_stacking(train1,train2,test,sub=0.75,repeat=20): predictors = [x for x in train1.columns if x not in ['ID', 'y']] y_train2 = np.zeros((train2.shape[0], repeat)) y_test = np.zeros((test.shape[0], repeat)) for i in range(repeat): import random random.seed(i) random.shuffle(predictors) predictors_sub = predictors[:int(len(predictors)*sub)] model = XGBRegressor(max_depth=4, learning_rate=0.0045, n_estimators=1250, silent=True, objective='reg:linear', nthread=-1, min_child_weight=1, max_delta_step=0, subsample=0.93, seed=27) model.fit(train1[predictors_sub], train1['y']) y_train2[:, i] = model.predict(train2[predictors_sub]) y_test[:, i] = model.predict(test[predictors_sub]) return y_train2,y_test
def model_intrv3(Y_train, X_train, Y_test, X_test, Targ):
    global reslts
    global metrs
    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from xgboost.sklearn import XGBRegressor

    model = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4,
                         random_state=0, subsample=0.9,
                         colsample_bytree=1.0).fit(X_train, Y_train)
    model.score(X_test, Y_test)
    pred_Yxgb = model.predict(X_test)
    mse = mean_squared_error(Y_test, pred_Yxgb)
    nRMSE = np.sqrt(mse) / Targ.mean()
    # nRMSE = np.sqrt(mse) / max(Targ)
    Yts_pd = pd.DataFrame({'Yts': Y_test, 'Ypd': pred_Yxgb})
    print(mse, nRMSE)
    metrs = {'mse': mse, 'nRMSE': nRMSE}
    reslts = {'Ypred': pred_Yxgb, 'Yts_pd': Yts_pd}
    return {'Yts_pd': Yts_pd, 'mse': mse, 'nRMSE': nRMSE}
def over_sample(train, test, feat): predictors = [x for x in train.columns if x not in ['ID', 'y']] groups = list(train[feat].unique()) result = None for name in groups: train_temp = pd.concat([train, train[train[feat] == name]]) test_temp = test[test[feat] == name] model = XGBRegressor(max_depth=4, learning_rate=0.0045, n_estimators=1250, silent=True, objective='reg:linear', nthread=-1, min_child_weight=1, max_delta_step=0, subsample=0.93, seed=27) model.fit(train_temp[predictors], train_temp['y']) pred = model.predict(test_temp[predictors]) if result is None: result = pd.DataFrame({'ID': test_temp['ID'].values, 'y': pred}) else: result = pd.concat([ result, pd.DataFrame({ 'ID': test_temp['ID'].values, 'y': pred }) ]) result.sort_values('ID', inplace=True) return result
class XGBWrapper_regr(object): """ A wrapper for xgboost model so that we will have a single api for various models. """ def __init__(self): self.model = XGBRegressor() def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None): self.model = self.model.set_params(**params) eval_set = [(X_train, y_train)] if X_valid is not None: eval_set.append((X_valid, y_valid)) if X_holdout is not None: eval_set.append((X_holdout, y_holdout)) self.model.fit(X=X_train, y=y_train, eval_set=eval_set, eval_metric='rmse', verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds']) scores = self.model.evals_result() self.best_score_ = {k: {m: m_v[-1] for m, m_v in v.items()} for k, v in scores.items()} # self.best_score_ = {k: {m: n if m != 'cappa' else -n for m, n in v.items()} for k, v in self.best_score_.items()} self.feature_importances_ = self.model.feature_importances_ def predict(self, X_test): return self.model.predict(X_test)
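# Usage sketch for XGBWrapper_regr on synthetic data. The parameter values are
# illustrative assumptions; note that fit() expects 'verbose' and
# 'early_stopping_rounds' keys in the params dict, and it assumes an xgboost
# version whose fit() accepts eval_metric/early_stopping_rounds, as the wrapper does.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=1000, n_features=20, noise=0.5, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

wrapper = XGBWrapper_regr()
xgb_params = {'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 6,
              'subsample': 0.8, 'colsample_bytree': 0.8,
              'verbose': 100, 'early_stopping_rounds': 50}
wrapper.fit(X_tr, y_tr, X_valid=X_va, y_valid=y_va, params=xgb_params)
preds = wrapper.predict(X_va)
print(wrapper.best_score_)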
def go(data_dict,feats_to_use, params={"seed":0,"silent":False,"n_jobs":-1}, parameter_tuning=False): ''' if with_gpu: xgb = XGBRegressor(seed=0, silent=False, tree_method='gpu_hist', n_gpus=-1) else: xgb = XGBRegressor(seed=0, silent=False, n_jobs=-1) ''' X_train=data_dict['X_train'][feats_to_use].copy() y_train=data_dict['y_train'].copy() X_test=data_dict['X_test'][feats_to_use].copy() X_val=data_dict['X_val'][feats_to_use].copy() y_val=data_dict['y_val'].copy() if parameter_tuning: fit_params={ "early_stopping_rounds":10, "eval_metric" : "rmse", "eval_set" : [(X_val,y_val)]} xgb=XGBRegressor() train_val_features=pd.concat([X_train,X_val]) train_val_labels=pd.concat([y_train,y_val]) test_fold = np.zeros(train_val_features.shape[0]) # initialize all index to 0 test_fold[:X_train.shape[0]] = -1 # set index of training set to -1, indicating not to use it in validation ps=PredefinedSplit(test_fold=test_fold) X_train=data_dict['X_train'][feats_to_use] y_train=data_dict['y_train'] X_test=data_dict['X_test'][feats_to_use] grid=GridSearchCV(xgb,params,fit_params=fit_params,scoring=RMSE , cv=ps, verbose=32, n_jobs=-1) start=time.time() grid.fit(train_val_features,train_val_labels) elapsed=time.time()-start print (elapsed) print ('best params:',grid.best_params_) print ('best score:',grid.best_score_) return grid.best_params_, grid.best_estimator_ else: xgb=XGBRegressor(**params) print (xgb) print ('start xgboost training') start=time.time() eval_set=[(X_val,y_val)] xgb.fit(X_train,y_train, eval_set=eval_set,eval_metric='rmse',early_stopping_rounds=30) elapsed=time.time()-start print (elapsed) data_dict['y_pred']=np.exp(xgb.predict(X_test))-1 #generate submission data_dict['X_test']['item_cnt_month']=data_dict['y_pred'] test=pd.read_csv('test.csv') submission=pd.merge(test,data_dict['X_test'], on=['shop_id','item_id'],how='left')[['ID','item_cnt_month']] return submission, xgb
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target):
    param = {'n_estimators': 10, 'learning_rate': 0.01}
    adj_params = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000],
                  'learning_rate': [0.01, 0.1, 1]}
    xgbt = XGBRegressor(**param)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # Tune with an RMSE-based scorer, since the metric reported below is RMSE.
    cscv = GridSearchCV(xgbt, adj_params, scoring='neg_root_mean_squared_error',
                        cv=cv, n_jobs=-1)
    cscv.fit(train_input, train_target)

    xgbt = XGBRegressor(**cscv.best_params_)
    xgbt.fit(train_input, train_target.ravel())
    predicted = xgbt.predict(test_input)
    xgbt_base_rmse = np.sqrt(metrics.mean_squared_error(test_target, predicted))
    print("xgbt_base_rmse: ", xgbt_base_rmse)
    return xgbt_base_rmse
def build_model(train, test, pred, label, seed=1080, is_shuffle=True):
    train_pred = np.zeros((train.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 10

    # K-fold split
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train[pred])

    # Native-API parameter set (kept for reference; the loop below trains via the sklearn wrapper).
    params = {'booster': 'gbtree',
              'objective': 'reg:gamma',
              'gamma': 0.1,
              'max_depth': 5,
              'lambda': 3,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'min_child_weight': 3,
              'silent': 1,
              'eta': 0.1,
              'seed': seed,
              'nthread': 8,
              'eval_metric': 'rmse'}

    # Train one model per fold and average the test predictions.
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        train_x, train_y = train[pred].iloc[train_idx], train[label].iloc[train_idx]
        valid_x, valid_y = train[pred].iloc[valid_idx], train[label].iloc[valid_idx]
        # Load the fold data into DMatrix form (used by the native xgb.train API).
        n_train = xgb.DMatrix(train_x, label=train_y)
        n_valid = xgb.DMatrix(valid_x, label=valid_y)

        xgbModel = XGBRegressor(max_depth=30,
                                learning_rate=0.1,
                                n_estimators=5,
                                objective='reg:logistic',
                                booster='gbtree',
                                gamma=0.1,
                                seed=seed)
        xgbModel.fit(train_x, train_y, verbose=True)
        train_pred[valid_idx] = xgbModel.predict(valid_x)
        test_pred += xgbModel.predict(test[pred]) / fold.n_splits

    test['label'] = test_pred
    return test[['loadingOrder', 'label']]
def alternative_minimization_xgboost(indexes_listeners_train, x_data_users, y_data_users, number_points_user, params, llambda, f): old_obj_val = 1e6 obj_val = -1e6 ridge_target = np.copy(y_data_users) ### Stopping criterion start_time = time.time() number_loops = -1 while (old_obj_val - obj_val) / old_obj_val > 1e-2: number_loops += 1 xgb_model = XGBRegressor(n_estimators=params['n_estimators'], learning_rate=params['learning_rate'], max_depth=params['max_depth'], subsample=params['subsample'], colsample_bytree=params['colsample_bytree']) xgb_model.fit(x_data_users, ridge_target) EB_residuals = [] aux = 0 for i in range(len(indexes_listeners_train)): y_data_user = y_data_users[indexes_listeners_train[i]] x_data_user = x_data_users[indexes_listeners_train[i]] EB_residual_user = np.mean( np.array(y_data_user) - xgb_model.predict(np.array(x_data_user))) / float( 1 + llambda / float(len(y_data_user))) EB_residuals.append(EB_residual_user) for idx in indexes_listeners_train[i]: ridge_target[idx] = y_data_users[idx] - EB_residual_user old_obj_val = obj_val obj_val = np.linalg.norm(ridge_target - xgb_model.predict( x_data_users))**2 + llambda * np.linalg.norm(EB_residuals)**2 #print 'Objval: '+str(obj_val), (old_obj_val- obj_val) / old_obj_val write_and_print('Number loops: ' + str(number_loops), f) write_and_print('Time alt-min: ' + str(time.time() - start_time), f) return xgb_model, EB_residuals
def cv_test(train, cv=5):
    t0 = time.time()
    target = 'y'
    predictors = [x for x in train.columns if x not in ['ID', 'y']]
    train_X = train[predictors]
    train_Y = train[target]
    mean_r2 = []
    kf = KFold(n_splits=cv, shuffle=True, random_state=520)
    for i, (train_index, test_index) in enumerate(kf.split(train_X)):
        x_train = train_X.iloc[train_index]
        x_test = train_X.iloc[test_index]
        y_train = train_Y.iloc[train_index]
        y_test = train_Y.iloc[test_index]

        lgb_model = LGBMRegressor(boosting_type='gbdt', num_leaves=10, max_depth=4,
                                  learning_rate=0.005, n_estimators=675, max_bin=25,
                                  subsample_for_bin=50000, min_split_gain=0,
                                  min_child_weight=5, min_child_samples=10,
                                  subsample=0.995, subsample_freq=1, colsample_bytree=1,
                                  reg_alpha=0, reg_lambda=0, seed=0, nthread=-1,
                                  silent=True)
        xgb_model = XGBRegressor(max_depth=4, learning_rate=0.0045, n_estimators=1250,
                                 silent=True, objective='reg:linear', nthread=-1,
                                 min_child_weight=1, max_delta_step=0, subsample=0.93,
                                 seed=27)
        xgb_model.fit(x_train, y_train)
        pred = xgb_model.predict(x_test)

        from sklearn.metrics import r2_score
        score = r2_score(y_test, pred)
        mean_r2.append(score)
        print('{0}: r2:{1}\n\n'.format(i + 1, score))
    print('mean r2: %s' % (np.array(mean_r2).mean()))
    print('Done in %.1fs!' % (time.time() - t0))
    return None
def run(): # Load data set X_train, Y_train, X_test, submission_file_content = load_data() Y_train = np.log(Y_train + 200) # Cross validation cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0) for cross_validation_index, (train_index, valid_index) in enumerate( cross_validation_iterator.split(X_train), start=1): print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM)) submission_file_path = os.path.join( SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index)) if os.path.isfile(submission_file_path): continue model = XGBRegressor(learning_rate=0.01, max_depth=12, n_estimators=N_ESTIMATORS, silent=False, objective="reg:linear", gamma=1, min_child_weight=1, subsample=0.8, colsample_bytree=0.5, reg_alpha=1, seed=cross_validation_index, nthread=-1) model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])], eval_metric=lambda y_predicted, y_true: ("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))), early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True) # Perform the testing procedure Y_test = model.predict(X_test) # Save submission to disk if not os.path.isdir(SUBMISSION_FOLDER_PATH): os.makedirs(SUBMISSION_FOLDER_PATH) submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200 submission_file_content.to_csv(submission_file_path, index=False) # Perform ensembling ensemble_predictions() print("All done!")
def XGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test, xgb_params_all):
    xgb_param_contrl = {'early_stopping_rounds': 100}
    xgb_params = xgb_params_all.copy()
    objective_type = xgb_params['objective_type']
    xgb_params.pop('objective_type')
    for k in xgb_param_contrl.keys():
        if k in xgb_params:
            xgb_param_contrl[k] = xgb_params[k]
            xgb_params.pop(k)

    if not self.config.retrain:
        # Reuse an existing model and continue training incrementally.
        model_load = self.load_model()
        if not model_load:
            print('Model {} not found; training from scratch'.format(self.modelName))
            if objective_type == 'regressor':
                clf = XGBRegressor(**xgb_params)
            else:
                clf = XGBClassifier(**xgb_params)
            clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)],
                    eval_metric='rmse',
                    early_stopping_rounds=xgb_param_contrl['early_stopping_rounds'])
        else:
            clf = model_load.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)],
                                 eval_metric='rmse',
                                 early_stopping_rounds=xgb_param_contrl['early_stopping_rounds'])
    else:
        if objective_type == 'regressor':
            clf = XGBRegressor(**xgb_params)
        else:
            clf = XGBClassifier(**xgb_params)
        clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)],
                eval_metric='rmse',
                early_stopping_rounds=xgb_param_contrl['early_stopping_rounds'])

    val_xgb_pre = clf.predict(X_valid, ntree_limit=clf.best_iteration)
    test_xgb_pre = clf.predict(X_test, ntree_limit=clf.best_iteration)

    metrics_name = self.config.metrics_name
    myMetrics = defindMetrics.MyMetrics(metrics_name)
    score_xgb = myMetrics.metricsFunc(val_xgb_pre, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_xgb_pre, test_xgb_pre, score_xgb
def model(df, alpha): X = df y = df.pop('y') X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42) upper = _random_search( X_train, y_train, XGBOOSTQUANTILE(quant_alpha=1 - alpha / 2, quant_delta=1, quant_thres=6, quant_var=3.2), { 'quant_delta': uniform(.01, 12), 'quant_thres': uniform(1, 12), 'quant_var': uniform(1, 12) }).predict(X_test) lower = _random_search( X_train, y_train, XGBOOSTQUANTILE(quant_alpha=alpha / 2, quant_delta=1, quant_thres=6, quant_var=3.2), { 'quant_delta': uniform(.01, 12), 'quant_thres': uniform(1, 12), 'quant_var': uniform(1, 12) }).predict(X_test) median = _random_search( X_train, y_train, XGBOOSTQUANTILE(quant_alpha=.5, quant_delta=1, quant_thres=6, quant_var=3.2), { 'quant_delta': uniform(.01, 12), 'quant_thres': uniform(1, 12), 'quant_var': uniform(1, 12) }).predict(X_test) xgbls = XGBRegressor() xgbls.fit(X_train, y_train) mean = xgbls.predict(X_test) return pd.concat([ X_test.reset_index(drop=True), y_test.reset_index(drop=True), pd.DataFrame(upper, columns=['upper_bound']), pd.DataFrame(lower, columns=['lower_bound']), pd.DataFrame(mean, columns=['mean']), pd.DataFrame(median, columns=['median']) ], axis=1)
def xgbt_mode(train_input, train_target, test_input, test_target):
    param = {'n_estimators': 1000,
             'learning_rate': 0.01,
             'objective': 'reg:squarederror'}
    xgbt = XGBRegressor(**param)
    xgbt.fit(train_input, train_target.ravel())
    xgbt_predicted = xgbt.predict(test_input)
    return xgbt_predicted
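# Quick evaluation sketch around xgbt_mode (synthetic data; assumes XGBRegressor is
# imported at module level as in the other snippets here).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
pred = xgbt_mode(X_tr, y_tr, X_te, y_te)
print('RMSE:', np.sqrt(mean_squared_error(y_te, pred)))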
def stacking(Data, Test, Target, FoldNum): BaseModel = [RandomForestRegressor(), AdaBoostRegressor(), GradientBoostingRegressor(), ExtraTreesRegressor(), SVR()] EnsembleModel = XGBRegressor() # 模型初始化 Scale = StandardScaler() Data = pd.DataFrame(Scale.fit_transform(Data)) Test = test_process(Test) # 数据标准化和归一化 BaseTrainFold = [] BaseTestFold = [] BaseTargetFold = [] KF = KFold(n_splits=FoldNum) for TrainIndex, TestIndex in KF.split(Data): BaseTrainFold.append(Data.iloc[TrainIndex].reset_index(drop=True)) BaseTestFold.append(Data.iloc[TestIndex].reset_index(drop=True)) BaseTargetFold.append(Target.iloc[TrainIndex].reset_index(drop=True)) # 针对BaseModel进行数据集的划分 EnsembleTrainFold = [] EnsembleTestFold = [] Mark = 0 for Model in BaseModel: Mark += 1 TrainFold = [] TestFold = [] for Num in range(FoldNum): Clf = Model Clf.fit(BaseTrainFold[Num], BaseTargetFold[Num]) TrainFold.append(pd.DataFrame(data={"data" + str(Mark): Clf.predict(BaseTestFold[Num])})) TestFold.append(pd.DataFrame(data={"data" + str(Mark): Clf.predict(Test)})) if Num == FoldNum - 1: TrainTemp = TrainFold[0] TestTemp = TestFold[0] for Index in range(1, FoldNum): TrainTemp = TrainTemp.append(TrainFold[Index]) TestTemp = TestTemp.append(TestFold[Index]) TrainTemp.reset_index(inplace=True, drop=True) TestTemp.reset_index(inplace=True, drop=True) EnsembleTrainFold.append(TrainTemp) EnsembleTestFold.append(TestTemp) EnsembleTrain = EnsembleTrainFold[0] EnsembleTest = EnsembleTestFold[0] for Index in range(1, len(EnsembleTrainFold)): EnsembleTrain = pd.merge(EnsembleTrain, EnsembleTrainFold[Index], left_index=True, right_index=True) EnsembleTest = pd.merge(EnsembleTest, EnsembleTestFold[Index], left_index=True, right_index=True) # 第一层模型进行数据拟合和处理 EnsembleModel.fit(EnsembleTrain, Target) EnsembleResult = EnsembleModel.predict(EnsembleTest) Result = 0 for Num in EnsembleResult: Result += Num Result = Result / len(EnsembleResult) # 对测试集结果求平均值进行输出 return Result
def run_xgb(**args):
    print("building xgb model:")
    xgb_model = XGBRegressor()
    xgb_model.fit(args["training_data"], args["training_label"])
    output = xgb_model.predict(args["test_data"])
    pickle.dump(xgb_model, open("xgb_testmodel.p", "wb"))
    output = list(map(lambda e: round(e), output))
    print(output)
    pickle.dump(output, open("xgb_output.p", "wb"))
    return output
def xgb(x_train, y_train, x_val, y_val):
    xgb = XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.01,
                       subsample=0.8, colsample_bytree=0.8, random_state=2000)
    xgb.fit(x_train, y_train)
    result = xgb.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
def xgboost_reg(train_df, target): if not os.path.isfile('Data/pickles/models/xgboost_model'): params = { 'n_estimators': [10, 20, 30, 40, 50, 100, 250, 500, 1000], 'max_depth': [1, 3, 5], 'learning_rate': [ 0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.08, 0.09, 0.1, 0.3, 0.5, 0.7, 1 ], 'reg_alpha': [0, 0.001, 0.1, 0.5, 1, 2, 5], 'reg_lambda': [0, 0.001, 0.1, 1, 2, 5], 'n_jobs': [3], 'early_stopping_rounds': [6] } model = XGBRegressor(objective='reg:linear') grid = GridSearchCV(estimator=model, param_grid=params, verbose=3, cv=3, scoring='neg_root_mean_squared_error') grid.fit(train_df, target) print(grid.best_params_) with open('Data/pickles/models/xgboost_model', 'wb') as file: boost_model = grid.best_estimator_ pickle.dump(boost_model, file) else: with open('Data/pickles/models/xgboost_model', 'rb') as file: model = pickle.load(file) train_split_model = XGBRegressor(objective='reg:linear', learning_rate=0.08, max_depth=3, n_estimators=500, n_jobs=3, reg_alpha=0.001, reg_lambda=1) x_train, x_test, y_train, y_test = train_test_split(train_df, target) train_split_model.fit(x_train, y_train) y_pred = train_split_model.predict(x_test) '''best params: {'learning_rate': 0.08, 'max_depth': 3, 'n_estimators': 500, 'n_jobs': 3, 'reg_alpha': 0.001, 'reg_lambda': 1}''' print('RMSE:{}'.format(np.sqrt(mean_squared_error(y_test, y_pred)))) return model
def predict_tags(tag):
    training_dataset = pd.read_csv(CSV_DIR + 'train_' + tag)
    training_dataset = pd.get_dummies(training_dataset)
    test_dataset = pd.read_csv(CSV_DIR + 'test_' + tag)
    test_dataset = pd.get_dummies(test_dataset)

    X_train = training_dataset.drop(['outcome'], axis=1)
    y_train = training_dataset['outcome']
    # Align the test columns with the training dummies so both frames share one feature set.
    X_test = test_dataset.reindex(columns=X_train.columns, fill_value=0)

    xgbcl = XGBRegressor()
    xgbcl.fit(X_train, y_train)
    return list(xgbcl.predict(X_test))
def run(): # Load data set X_train, Y_train, X_test, submission_file_content = load_data() Y_train = np.log(Y_train + 200) # Cross validation cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0) for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1): print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM)) submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index)) if os.path.isfile(submission_file_path): continue model = XGBRegressor( learning_rate=0.01, max_depth=12, n_estimators=N_ESTIMATORS, silent=False, objective="reg:linear", gamma=1, min_child_weight=1, subsample=0.8, colsample_bytree=0.5, reg_alpha=1, seed=cross_validation_index, nthread=-1) model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])], eval_metric=lambda y_predicted, y_true:("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))), early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True) # Perform the testing procedure Y_test = model.predict(X_test) # Save submission to disk if not os.path.isdir(SUBMISSION_FOLDER_PATH): os.makedirs(SUBMISSION_FOLDER_PATH) submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200 submission_file_content.to_csv(submission_file_path, index=False) # Perform ensembling ensemble_predictions() print("All done!")
class XGBoostRegressionModel(): def __init__(self, name): self.model = XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.001, random_state=0) def train(self, X, y, label, configs): X.reset_index() y.reset_index() distrs = [get_distribution(y)] index = ['Entire set'] tprs = [] aucs = [] mean_fpr = np.linspace(0, 1, 10) #CROSS VALIDATION CHANGE plt.figure(figsize=(10, 10)) outcome_df = pd.DataFrame() kf = KFold(n_splits=5) for train_index, test_index in kf.split(X): training_X, testing_X = X.iloc[train_index], X.iloc[test_index] training_y, testing_y = y.iloc[train_index], y.iloc[test_index] # Train, predict and Plot self.model.fit(training_X, training_y) #y_pred_rt = self.model.predict_proba(testing_X)[:, 1] y_pred_rt = self.model.predict(testing_X) mse = mean_squared_error(testing_y, y_pred_rt)**(0.5) performance_row = {"Mean Square Error": mse} outcome_df = outcome_df.append(performance_row, ignore_index=True) outcome_df.to_csv("Outcomes/" + label + "RegressionStudent.csv") distr_df = pd.DataFrame( distrs, index=index, columns=[f'Label {l}' for l in range(np.max(y) + 1)]) distr_df.to_csv(configs['model']['save_dir'] + "-K-Fold-Distributions.csv", index=True)
def train_xgb_model(best_nodes, X_train_scaled, Y_train):
    rsg = XGBRegressor(gamma=best_nodes["gamma"],
                       max_depth=best_nodes["max_depth"],
                       learning_rate=best_nodes["learning_rate"],
                       min_child_weight=best_nodes["min_child_weight"],
                       subsample=best_nodes["subsample"],
                       colsample_bytree=best_nodes["colsample_bytree"],
                       reg_alpha=best_nodes["reg_alpha"],
                       reg_lambda=best_nodes["reg_lambda"],
                       n_estimators=int(best_nodes["n_estimators"]),
                       random_state=42)
    rsg.fit(X_train_scaled, Y_train)
    Y_pred = rsg.predict(X_train_scaled)
    print("mse:", np.mean((Y_pred - Y_train) ** 2))
    print("rmse:", np.sqrt(np.mean((Y_pred - Y_train) ** 2)))
    return rsg
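# Example best_nodes dict for train_xgb_model. The values are illustrative
# assumptions (e.g. as returned by a hyperparameter search over these keys);
# n_estimators is deliberately a float to show why the function casts it with int().
best_nodes = {
    'gamma': 0.0, 'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 3,
    'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 1.0,
    'n_estimators': 500.0,
}
# rsg = train_xgb_model(best_nodes, X_train_scaled, Y_train)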
class Xgb: def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None): """ input params: - df (DataFrame): dataframe of training data - target_column (string): name of target column - id_column (string): name of id column - target_type (string): 'linear' or 'binary' - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding - drop_columns (list): list of columns to drop - numeric_columns (list): list of columns to convert to numeric - verbose (bool): verbosity of printouts """ # checks for sampling sample_fraction = float(sample_fraction) if sample_fraction > 1: sample_fraction = 1.0 if sample_fraction * n_samples > 1: n_samples = round(1.0/sample_fraction) if sample_fraction <= 0: print('sample_fraction 0 or negative, switching to 0.1') sample_fraction = 0.1 # if sample_fraction is results in sample smaller than 1 if round(sample_fraction * len(df)) == 0: sample_fraction = 1.0/len(df) # check if data is dataframe if type(df) == pd.core.frame.DataFrame: self.df = df self.early_stopping_rounds = early_stopping_rounds if target_column: self.target_column = target_column self.id_column = id_column self.target_type = target_type self.categorical_columns = categorical_columns self.numeric_columns = numeric_columns self.drop_columns = drop_columns self.verbose = verbose self.sample_fraction = sample_fraction self.n_samples = n_samples self.num_training_rounds = num_training_rounds self.prefix = prefix # init the classifier: if self.target_type == 'binary': self.scoring = 'auc' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic', scale_pos_weight = 1, seed = 123) elif self.target_type == 'multiclass': self.scoring = 'merror' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'multi:softmax', scale_pos_weight = 1, seed = 123) elif self.target_type == 'linear': self.scoring = 'rmse' self.clf = XGBRegressor( n_estimators = num_training_rounds, objective = 'reg:linear' ) # if preferred scoring metric is stated: if scoring: self.scoring = scoring else: print('please provide target column name') else: print('please provide pandas dataframe') def train(self): print('#### preprocessing ####') self.df = self.preprocess(self.df) print('#### training ####') self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]] xgb_param = self.clf.get_xgb_params() # if subsampling if self.sample_fraction == 1.0: df_list = [self.df] else: df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples) print(df_list) for idx, current_df in enumerate(df_list): print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction)) xgtrain = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan) try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose) except: try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], 
early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose) except: xgb_param['num_class'] = len(current_df[self.target_column].unique()) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds) self.clf.set_params(n_estimators=cvresult.shape[0]) self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring) #Predict training set: train_df_predictions = self.clf.predict(current_df[self.predictors]) if self.target_type == 'binary' or self.target_type == 'multiclass': train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1] print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions)) if self.target_type == 'binary': print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob)) elif self.target_type == 'linear': print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)) print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))) filename = self.prefix + '_' + str(idx) + '.pkl' self.save(filename) def predict(self, test_df, return_multi_outputs=False, return_mean_std=False): print('### predicting ###') print('## preprocessing test set') if self.id_column in test_df: ids = test_df[self.id_column] if self.target_column in test_df.columns: targets = test_df[self.target_column] self.test_df = self.preprocess(test_df, train=False) if self.id_column in test_df: self.test_df[self.id_column] = ids if self.target_column in test_df.columns: self.test_df[self.target_column] = targets for col in self.predictors: if col not in self.test_df.columns: self.test_df[col] = np.nan # prediction print('## predicting from test set') output_list = [] output = None for idx, ns in enumerate(range(self.n_samples)): if self.n_samples == 1: xgb = self if self.target_type == 'binary': output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': output = xgb.clf.predict(self.test_df[self.predictors]) else: try: filename = self.prefix + '_' + str(idx) + '.pkl' xgb = self.load(filename) if self.target_type == 'binary': output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': output = xgb.clf.predict(self.test_df[self.predictors]) output_list.append(list(output)) except IOError: print('no file found, skipping') # average the outputs if n_samples is more than one if self.n_samples == 1: self.output = output try: self.multi_outputs = [list(output)] except: self.multi_outputs = None else: self.output = np.mean(output_list, axis=0) self.multi_outputs = output_list if return_multi_outputs: return self.multi_outputs elif return_mean_std: return (self.output, np.std(output_list, axis=0)) else: return self.output def feature_importance(self, num_print=10, display=True): feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True) impt = pd.DataFrame(feature_importance) impt.columns = ['feature', 'importance'] print(impt[:num_print]) if display: impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3)) def preprocess(self, df, train=True): # one hot encoding of categorical variables print('## one hot encoding of categorical variables') 
for col in self.categorical_columns: if self.verbose: print('one hot encoding: ', col) df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1) df = df.drop([col], axis=1) # if training, determine columns to be removed if train: # drop columns that are too sparse to be informative self.cols_to_remove = [] print('## dropping columns below sparsity threshold') for col in df.columns: nan_cnt = 0 for x in df[col]: try: if np.isnan(x): nan_cnt += 1 except: pass if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop if self.verbose: print('will drop', col) self.cols_to_remove.append(col) # drop columns that have no standard deviation (not informative) print('## dropping columns with no variation') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64': if df[col].std() == 0: print('will drop', col) self.cols_to_remove.append(col) if self.verbose and self.cols_to_remove: print('dropping the following columns:', self.cols_to_remove) df = df.drop(self.cols_to_remove, axis=1) if self.verbose: print('## DataFrame shape is now:', df.shape) # convert to numerical where possible #print('## converting numerical data to numeric dtype') #df = df.convert_objects(convert_numeric=True) # convert columns specified to be int and float for col in self.numeric_columns: if self.verbose: print('converting', col) df[col] = pd.to_numeric(df[col], errors='coerce') if self.verbose: print(df[col].dtype) df = df.drop(self.drop_columns, axis=1) # drop all those that are object type print('## dropping non-numerical columns') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool': pass else: if self.verbose: print('dropping because not int, float, or bool:', col) df = df.drop([col], axis=1) return df def _to_int(self, num): try: return int(num) except: return def _to_float(self, num): try: return float(num) except: return def random_sample(self, df, fraction=0.2, n_samples=None): """ splits into random samples - n_samples: how many samples you want returned (default = All) - fraction : what fraction of data to include in the sample (default = 0.2) """ print('constructing random samples') num_rows = len(df) len_sample = round(fraction * num_rows) # create list of slice index lists indices = range(0,num_rows) slice_list = [] tmp_idx_list = [] while len(indices) > 0: while len(tmp_idx_list) < len_sample and len(indices) > 0: idx = indices.pop(random.randrange(len(indices))) tmp_idx_list.append(idx) slice_list.append(tmp_idx_list) tmp_idx_list = [] # get slices sample_list = [] for s in range(n_samples): try: sample_list.append(df.loc[slice_list[s],:]) except: pass return sample_list def write_csv(self, filename, include_actual=False): """ write results to csv - include actual: if actual values are known for test set, and we want to print them """ with open(filename, 'wb') as csvfile: writer = csv.writer(csvfile) headers = [self.id_column, self.target_column] if include_actual: headers.append('actual') writer.writerow(headers) try: for idx, value in enumerate(self.output): test_id = self.test_df[self.id_column][idx] test_output = self.output[idx] to_write = [test_id, test_output] if include_actual: to_write.append(self.test_df[self.target_column][idx]) writer.writerow(to_write) print('results written to ' + filename) except: print('write_csv failed') def save(self, filename='xgb.pkl'): joblib.dump(self, filename) def load(self, model_file='xgb.pkl'): xgb = 
joblib.load(model_file) return xgb
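# Usage sketch for the Xgb wrapper above (column names and file names are
# hypothetical; the call pattern follows the methods defined in the class).
# model = Xgb(train_df, target_column='loss', id_column='id', target_type='linear',
#             categorical_columns=['cat1', 'cat2'], numeric_columns=[],
#             num_training_rounds=500, early_stopping_rounds=50, prefix='xgb_model')
# model.train()                      # cross-validates, fits, and pickles each sample's model
# preds = model.predict(test_df)     # averaged predictions across samples
# model.write_csv('submission.csv')  # id/prediction pairs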
## RF validation diagnostics for age prediction
print(pd.Series(rf_age_valid_pred).isnull().sum())
print(statistics.mean(error_df.sqerr))
print(len(rf_age_valid_pred), len(age_valid_Y))
print(pd.crosstab(pd.Series(rf_age_valid_pred).apply(lambda x: round(x)), age_valid_Y))

## XGB for age prediction
from xgboost.sklearn import XGBRegressor
xgb = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=100,
                   objective='reg:linear', subsample=0.5, colsample_bytree=0.5,
                   seed=321)
eval_set = [(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y),
            (mvt_valid_X.drop(['age', 'gender'], axis=1), age_valid_Y)]
xgb.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y,
        eval_set=eval_set, eval_metric='rmse', early_stopping_rounds=10, verbose=1)
xgb_age_valid_pred = xgb.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))

## AdaBoost for age prediction
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(n_estimators=50, learning_rate=0.1, loss='linear',
                        random_state=321)
ada.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y.values)
ada_age_valid_pred = ada.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))
print(len(ada_age_valid_pred), len(age_valid_Y))

error_df = pd.DataFrame(pd.Series(ada_age_valid_pred), columns=['pred'])
error_df.reset_index(inplace=True)
act_df = pd.DataFrame(age_valid_Y)
act_df.reset_index(inplace=True)
class Xgb: def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None): """ input params: - df (DataFrame): dataframe of training data - target_column (string): name of target column - id_column (string): name of id column - target_type (string): 'linear' or 'binary' - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding - drop_columns (list): list of columns to drop - numeric_columns (list): list of columns to convert to numeric - verbose (bool): verbosity of printouts """ if type(df) == pd.core.frame.DataFrame: self.df = df self.early_stopping_rounds = early_stopping_rounds if target_column: self.target_column = target_column self.id_column = id_column self.target_type = target_type self.categorical_columns = categorical_columns self.numeric_columns = numeric_columns self.drop_columns = drop_columns self.verbose = verbose self.num_training_rounds = num_training_rounds # init the classifier if self.target_type == 'binary': self.scoring = 'auc' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic', scale_pos_weight = 1, seed = 123) elif self.target_type == 'linear': self.scoring = 'rmse' self.clf = XGBRegressor( n_estimators = num_training_rounds, objective = 'reg:linear' ) else: print('please provide target column name') else: print('please provide pandas dataframe') def train(self): print('#### preprocessing ####') self.df = self.preprocess(self.df) print('#### training ####') self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]] xgb_param = self.clf.get_xgb_params() xgtrain = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan) try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose) except: try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose) except: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds) self.clf.set_params(n_estimators=cvresult.shape[0]) self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring) #Predict training set: train_df_predictions = self.clf.predict(self.df[self.predictors]) if self.target_type == 'binary': train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1] print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions)) print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob)) elif self.target_type == 'linear': print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)) print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))) def predict(self, test_df): print('### predicting ###') print('## preprocessing test set') if self.id_column in test_df: ids = test_df[self.id_column] if self.target_column in 
test_df.columns: targets = test_df[self.target_column] self.test_df = self.preprocess(test_df, train=False) if self.id_column in test_df: self.test_df[self.id_column] = ids if self.target_column in test_df.columns: self.test_df[self.target_column] = targets for col in self.predictors: if col not in self.test_df.columns: self.test_df[col] = np.nan if self.target_type == 'binary': self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': self.output = self.clf.predict(self.test_df[self.predictors]) return self.output def feature_importance(self, num_print=10, display=True): feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True) impt = pd.DataFrame(feature_importance) impt.columns = ['feature', 'importance'] print(impt[:num_print]) if display: impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3)) def preprocess(self, df, train=True): # one hot encoding of categorical variables print('## one hot encoding of categorical variables') for col in self.categorical_columns: if self.verbose: print('one hot encoding: ', col) df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1) df = df.drop([col], axis=1) # if training, determine columns to be removed if train: # drop columns that are too sparse to be informative self.cols_to_remove = [] print('## dropping columns below sparsity threshold') for col in df.columns: nan_cnt = 0 for x in df[col]: try: if np.isnan(x): nan_cnt += 1 except: pass if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop if self.verbose: print('will drop', col) self.cols_to_remove.append(col) # drop columns that have no standard deviation (not informative) print('## dropping columns with no variation') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64': if df[col].std() == 0: print('will drop', col) self.cols_to_remove.append(col) if self.verbose and self.cols_to_remove: print('dropping the following columns:', self.cols_to_remove) df = df.drop(self.cols_to_remove, axis=1) if self.verbose: print('## DataFrame shape is now:', df.shape) # convert to numerical where possible #print('## converting numerical data to numeric dtype') #df = df.convert_objects(convert_numeric=True) # convert columns specified to be int and float for col in self.numeric_columns: if col not in self.cols_to_remove: if self.verbose: print('converting', col) df[col] = pd.to_numeric(df[col], errors='coerce') if self.verbose: print(df[col].dtype) # drop those marked for dropping df = df.drop(self.drop_columns, axis=1) # drop all those that are object type print('## dropping non-numerical columns') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool': pass else: if self.verbose: print('dropping because not int, float, or bool:', col) df = df.drop([col], axis=1) return df def _to_int(self, num): try: return int(num) except: return def _to_float(self, num): try: return float(num) except: return def write_csv(self, filename, include_actual=False): """ write results to csv - include actual: if actual values are known for test set, and we want to print them """ with open(filename, 'wb') as csvfile: writer = csv.writer(csvfile) headers = [self.id_column, self.target_column] if include_actual: headers.append('actual') writer.writerow(headers) for idx, value in enumerate(self.output): test_id = 
self.test_df[self.id_column][idx] test_output = self.output[idx] to_write = [test_id, test_output] if include_actual: to_write.append(self.test_df[self.target_column][idx]) writer.writerow(to_write) def save(self, filename='xgb.pkl'): joblib.dump(self, filename)
# Parameters of the algorithm
xgb1 = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bytree=0.7, gamma=0, learning_rate=0.5,
                    max_delta_step=0, max_depth=7, min_child_weight=4,
                    missing=None, n_estimators=1000, n_jobs=1, nthread=4,
                    objective='reg:gamma', random_state=0, reg_alpha=0.09,
                    reg_lambda=1, scale_pos_weight=1, seed=1048, silent=True,
                    subsample=0.86)

# Fit the algorithm on the data
xgb1.fit(train_x, train_y, eval_metric='rmse')

# Train vs. test
pre = xgb1.predict(test_x)
plt.figure(figsize=(16, 9))
plt.style.use('ggplot')
plt.plot(pre, label='predict_load1')
plt.plot(np.array(test_y), label='test_load1')
plt.title('Test_prediction MAE=%s' % str(np.sum(abs(pre - test_y)) / len(pre)))
plt.legend(loc='upper left')
plt.savefig("D:\\load Forecasting\\plot\\TraiVsTest.jpg")

# New data
pre1 = xgb1.predict(load1_test.drop('load1', axis=1))
plt.figure(figsize=(16, 9))
plt.style.use('ggplot')
plt.plot(pre1, label='predict_load1')
plt.plot(np.array(load1_test['load1']), label='real_load1')