def build_model(train_data, test, pred, label, seed=2099, is_shuffle=True):
    train_pred = np.zeros((train_data.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 5

    # K-fold split
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train_data[pred])

    # params
    # test_x = np.concatenate([test[pred].values, geohash_test], axis=1)

    # train one model per fold
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        train_x, train_y = train_data[pred].iloc[train_idx].values, train_data[label].iloc[train_idx]
        valid_x, valid_y = train_data[pred].iloc[valid_idx].values, train_data[label].iloc[valid_idx]
        # geohash_tr_x, geohash_val_x = geohash_train[train_idx], geohash_train[valid_idx]
        # train_x = np.concatenate([train_x, geohash_tr_x], axis=1)
        # valid_x = np.concatenate([valid_x, geohash_val_x], axis=1)

        # build the fold model
        clf = LGBMRegressor(
            learning_rate=0.5,
            n_estimators=6000,
            boosting_type='gbdt',
            objective='regression',
            num_leaves=156,
            subsample=0.8,
            n_jobs=-1,
            max_depth=6,
            reg_lambda=0,
            colsample_bytree=0.8,
            random_state=2019,  # 2019
            metric=['mse'])
        clf.fit(train_x, train_y,
                eval_set=[(valid_x, valid_y)],
                eval_metric=['mse'],
                categorical_feature='auto',
                early_stopping_rounds=100,
                verbose=100)

        # out-of-fold predictions plus the fold-averaged test prediction
        train_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)
        test_pred += clf.predict(test[pred], num_iteration=clf.best_iteration_) / fold.n_splits

    print('mean_squared_error:', mean_squared_error(train_data[label].values, train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']], clf
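# A minimal, self-contained usage sketch for build_model (illustrative only: the
# synthetic data and column names below are assumptions, not from the original
# source). It presumes the lightgbm 3.x sklearn API that the fit() calls in this
# file rely on, plus the numpy/pandas imports used throughout.
import numpy as np
import pandas as pd

def demo_build_model():
    rng = np.random.RandomState(0)
    cols = ['f0', 'f1', 'f2']
    demo_train = pd.DataFrame(rng.rand(200, 3), columns=cols)
    demo_train['label'] = 2 * demo_train['f0'] + rng.rand(200)
    demo_test = pd.DataFrame(rng.rand(50, 3), columns=cols)
    demo_test['loadingOrder'] = np.arange(50)  # id column that build_model returns
    sub, model = build_model(demo_train, demo_test, pred=cols, label='label')
    return sub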
class LGBMRegressorPrim(primitive):
    def __init__(self, random_state=0):
        super(LGBMRegressorPrim, self).__init__(name='LGBMRegressor')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "LightGBM is a gradient boosting framework that uses tree based learning algorithms."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LGBMRegressor()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name + "Pred"])
        final_output = {0: output}
        return final_output
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        lgb_base = LGBMRegressor(n_estimators=ntree,
                                 objective='regression',
                                 random_state=1234,
                                 n_jobs=2,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 max_depth=10,
                                 subsample=0.8)
        print('current ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
        rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
        rmse_t_total.append(rmse_t_each)
        rmse_v_total.append(rmse_v_each)
        myfile = open('D:\\workspace python\\statContest\\save\\' +
                      'lgbbase2_rmse_0412.txt', 'a', encoding='utf-8')
        print(rmse_t_each, ',', rmse_v_each, file=myfile)
        myfile.close()
    return rmse_t_total, rmse_v_total
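# A small follow-up sketch (not in the original source): plot the train/validation
# RMSE curves returned by get_ntree to eyeball where the validation error flattens
# out, which is the usual way to pick n_estimators from such a sweep. Assumes
# matplotlib is available.
import matplotlib.pyplot as plt

def plot_ntree_curves(rmse_t_total, rmse_v_total):
    ntrees = list(range(10, 500, 10))  # mirrors the loop in get_ntree
    plt.plot(ntrees, rmse_t_total, label='train RMSE')
    plt.plot(ntrees, rmse_v_total, label='valid RMSE')
    plt.xlabel('n_estimators')
    plt.ylabel('RMSE')
    plt.legend()
    plt.show()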
def build_onetrain(train_data, test, pred=features, label='label', seed=1099, est=6000, is_shuffle=True):
    # single model trained on the full training set (no CV)
    train_x, train_y = train_data[features].values, train_data[label].values
    clf = LGBMRegressor(
        learning_rate=0.01,
        boosting_type='gbdt',
        objective='regression',
        n_estimators=est,
        num_leaves=156,
        subsample=0.8,
        n_jobs=-1,
        max_depth=8,
        reg_lambda=0,
        colsample_bytree=0.8,
        random_state=2019,  # 2019
        metric=['mse'])
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y)],
            eval_metric=['mse'],
            categorical_feature='auto',
            verbose=100)
    # train_pred = clf.predict(train_x, num_iteration=clf.best_iteration_)
    test_pred = clf.predict(test[pred], num_iteration=clf.best_iteration_)
    # print('mean_squared_error:', mean_squared_error(train_y, train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']], clf
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    if verbose:
        print("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([
        np.mean(np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id))),
                axis=0).flatten() for id in df['id'].tolist()
    ])
    y = df['cancer'].values
    ids = df['id']

    trn_x, val_x, trn_y, val_y = train_test_split(
        x, y, random_state=42, stratify=y, test_size=0.20)
    '''
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'num_leaves': 21,
        'learning_rate': 0.001,
        'nthread': 24,
        'subsample': 0.80,
        'colsample_bytree': 0.80,
        'seed': 42,
        'verbose': verbose,
    }
    '''
    skf = StratifiedKFold(n_splits=5, random_state=2048, shuffle=True)
    result = []
    clfs = []
    oof_preds = []
    for train_index, test_index in skf.split(x, y):
        trn_x, val_x = x[train_index, :], x[test_index, :]
        trn_y, val_y = y[train_index], y[test_index]
        val_ids = pd.DataFrame(ids.iloc[test_index].values, columns=['id'])

        clf = LGBMRegressor(max_depth=50,
                            num_leaves=21,
                            n_estimators=5000,
                            min_child_weight=1,
                            learning_rate=0.001,
                            nthread=24,
                            subsample=0.80,
                            colsample_bytree=0.80,
                            seed=42)

        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=verbose,
                eval_metric='l2', early_stopping_rounds=300)

        val_preds = pd.DataFrame(clf.predict(val_x), columns=["cancer"])
        oof_preds.append(pd.concat([val_ids, val_preds], axis=1))
        clfs.append(clf)

    return clfs, oof_preds
def LGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test, lgb_param_all):
    lgb_param_contrl = {'early_stopping_rounds': 100, 'categorical_feature': 'auto'}
    lgb_param = lgb_param_all.copy()
    objective_type = lgb_param['objective_type']
    lgb_param.pop('objective_type')
    for k in ['early_stopping_rounds', 'categorical_feature']:
        if k in lgb_param:
            lgb_param_contrl[k] = lgb_param[k]
            lgb_param.pop(k)

    if not self.config.retrain:
        # incremental training: continue from an existing saved model if there is one
        model_load = self.load_model()
        if not model_load:
            print('Model {} not found; training from scratch'.format(self.modelName))
            if objective_type == 'regressor':
                clf = LGBMRegressor(**lgb_param)
            else:
                clf = LGBMClassifier(**lgb_param)
            clf.fit(X_train, labels_train,
                    eval_set=[(X_valid, labels_valid)],
                    eval_metric='rmse',
                    early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                    categorical_feature=lgb_param_contrl['categorical_feature'])
        else:
            clf = model_load.fit(X_train, labels_train,
                                 eval_set=[(X_valid, labels_valid)],
                                 eval_metric='rmse',
                                 early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                                 categorical_feature=lgb_param_contrl['categorical_feature'])
    else:
        if objective_type == 'regressor':
            clf = LGBMRegressor(**lgb_param)
        else:
            clf = LGBMClassifier(**lgb_param)
        clf.fit(X_train, labels_train,
                eval_set=[(X_valid, labels_valid)],
                eval_metric='rmse',
                early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                categorical_feature=lgb_param_contrl['categorical_feature'])

    val_lgb_pre = clf.predict(X_valid.values, num_iteration=clf.best_iteration_)
    test_lgb_pre = clf.predict(X_test.values, num_iteration=clf.best_iteration_)
    metrics_name = self.config.metrics_name
    myMetrics = defindMetrics.MyMetrics(metrics_name)
    score_lgb = myMetrics.metricsFunc(val_lgb_pre, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_lgb_pre, test_lgb_pre, score_lgb
def lightGBM_train_nocross(j, param, x_train, x_test, y_train, y_test):
    gbm = LGBMRegressor(**param, num_leaves=31, learning_rate=0.01,
                        objective='regression')
    gbm.fit(x_train, y_train)
    y_pred = gbm.predict(x_test)
    y_pred = DataFrame(y_pred)
    rmse_lightGBM.append(np.sqrt(mean_squared_error(y_pred, y_test)))
    r2_lightGBM.append(r2_score(y_test, y_pred))
    return rmse_lightGBM, r2_lightGBM, gbm
def lgb(x_train, y_train, x_val, y_val):
    model = LGBMRegressor(n_estimators=1000,
                          max_depth=10,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          learning_rate=0.01,
                          random_state=2020)
    model.fit(x_train, y_train)
    result = model.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
def get_model_result(self, params: dict) -> dict:
    X, y = self.X, self.Y
    X_test, y_test = self.X_test, self.Y_test
    # X, y = self.X.values, self.Y.values
    # X_test, y_test = self.X_test.values, self.Y_test.values
    if isinstance(self.estimator, lgb.Booster):
        params["metric"] = "auc"
        estimator = lgb.train(params, self.dataset_train)
        # Booster.predict needs raw feature matrices, not Dataset objects
        pred_train = pd.Series(estimator.predict(X), index=self.X.index)
        pred_test = pd.Series(estimator.predict(X_test), index=self.X_test.index)
    elif isinstance(self.estimator, LGBMRegressor):
        estimator = LGBMRegressor(**params)
        estimator.fit(X, y, eval_metric="auc")
        pred_train = pd.Series(estimator.predict(X), index=self.X.index)
        pred_test = pd.Series(estimator.predict(X_test), index=self.X_test.index)
    elif isinstance(self.estimator, LGBMClassifier):
        estimator = LGBMClassifier(**params)
        estimator.fit(X, y, eval_metric="auc")
        pred_train = pd.Series(estimator.predict_proba(X)[:, 1], index=self.X.index)
        pred_test = pd.Series(estimator.predict_proba(X_test)[:, 1], index=self.X_test.index)
    else:
        raise TypeError(
            "Input model should be a `lgb.Booster` or `LGBMClassifier`/`LGBMRegressor`!"
        )

    # blank out scores for rows outside the hit set
    pred_train.loc[~pred_train.index.isin(self.hit_indices)] = np.nan
    pred_test.loc[~pred_test.index.isin(self.hit_indices)] = np.nan

    # compute model evaluation metrics
    ks_train, ks_test = calc_ks(-pred_train, y), calc_ks(-pred_test, y_test)
    auc_train, auc_test = calc_auc(pred_train, y), calc_auc(pred_test, y_test)
    # return {'train': (ks_train, auc_train), 'test': (ks_test, auc_test)}
    return {"ks": (ks_train, ks_test), "auc": (auc_train, auc_test)}
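# calc_ks and calc_auc are used above but not defined in this section. A minimal
# sketch of what they plausibly compute (Kolmogorov-Smirnov statistic and
# ROC-AUC, skipping the NaN scores blanked out above); the real helpers may
# differ in signature and binning:
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

def calc_auc(score, y):
    mask = score.notna()
    return roc_auc_score(y[mask], score[mask])

def calc_ks(score, y):
    mask = score.notna()
    fpr, tpr, _ = roc_curve(y[mask], score[mask])
    return np.max(np.abs(tpr - fpr))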
def get_model(brand_string, train_brand, test_brand):
    brand1 = pd.read_csv(brand_string)
    brand1 = brand1.iloc[90:, :].reset_index(drop=True)
    X_brand1 = brand1.drop(['brand', 'cnt'], axis=1)
    y_train = brand1['cnt'].values
    X_train = pd.concat([X_brand1, train_brand], axis=1)
    X_test = test.drop(['cnt'], axis=1)
    X_test = pd.concat([X_test, test_brand], axis=1)
    model = LGBMRegressor().fit(X_train, y_train)
    brand1_pre = model.predict(X_test)
    return brand1_pre
def tune_params():
    rmse_t_total, rmse_v_total = [], []
    for max_depth in range(6, 11):
        for subsample in [0.6, 0.7, 0.8]:
            for colsample_bytree in [0.6, 0.7, 0.8]:
                for reg_alpha in [0.1, 1, 10]:
                    lgb_base = LGBMRegressor(n_estimators=150,
                                             objective='regression',
                                             random_state=1234,
                                             n_jobs=3,
                                             colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha,
                                             max_depth=max_depth,
                                             subsample=subsample)
                    _params = {
                        'max_depth': max_depth,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree,
                        'reg_alpha': reg_alpha,
                    }
                    lgb_base.fit(X_t, y_t)
                    y_t_pre = lgb_base.predict(X_t)
                    y_v_pre = lgb_base.predict(X_v)
                    rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
                    rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
                    rmse_t_total.append(rmse_t_each)
                    rmse_v_total.append(rmse_v_each)
                    print(_params)
                    myfile1 = open(
                        'D:\\workspace python\\statContest\\save\\' +
                        'lgbbase2_saveparams_rmse_0412.txt', 'a', encoding='utf-8')
                    print(_params['max_depth'], _params['subsample'],
                          _params['colsample_bytree'], _params['reg_alpha'],
                          file=myfile1)
                    myfile1.close()
                    print(rmse_t_each, rmse_v_each)
                    myfile = open('D:\\workspace python\\statContest\\save\\' +
                                  'lgbbase2_tunparms_rmse_0412.txt', 'a',
                                  encoding='utf-8')
                    print(rmse_t_each, ',', rmse_v_each, file=myfile)
                    myfile.close()
    return rmse_t_total, rmse_v_total
def predict(X_train, Y_train, X_test):
    print("Y_train is 1:", Y_train.count(1))
    print("Y_train is 0:", Y_train.count(0))
    clfs = [
        LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=80),
        XGBRegressor(learning_rate=0.0475, max_depth=4, n_estimators=300)]
    X = np.array(X_train, dtype='float32')
    y = np.array(Y_train, dtype='float32')
    X_predict = np.array(X_test, dtype='float32')
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)), dtype='float32')
    dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)), dtype='float32')

    '''5-fold stacking'''
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds)
    for j, clf in enumerate(clfs):
        '''train each base model in turn'''
        print("clf", j)
        dataset_blend_test_j = np.zeros((X_predict.shape[0], n_folds), dtype='float32')
        for i, (train, test) in enumerate(skf.split(X, y)):
            '''hold out fold i for prediction and train on the rest; the held-out
            predictions become the new feature values for fold i'''
            print("stacking Fold", i)
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
            # if j == 0:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.class_weight = dict(enumerate(class_weights))
            # else:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.scale_pos_weight = class_weights[1] / class_weights[0]
            #     print('scale_pos_weight:', clf.scale_pos_weight)
            clf.fit(X_train, y_train)
            y_submission = clf.predict(X_test)
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict(X_predict)
        '''for the test set, use the mean of the k fold models' predictions as the new feature'''
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        del dataset_blend_test_j
        # print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

    # second-level model on the stacked features
    # clf = LogisticRegression()
    # clf = GradientBoostingRegressor(learning_rate=0.02, max_depth=6)
    clf = LGBMRegressor()
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(y), y=y)
    clf.class_weight = dict(enumerate(class_weights))
    dataset_blend_train = np.append(dataset_blend_train, X, axis=1)
    dataset_blend_test = np.append(dataset_blend_test, X_predict, axis=1)
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict(dataset_blend_test)
    return y_submission
def train_lgb_model(best_nodes, X_train_scaled, Y_train):
    rsg = LGBMRegressor(
        learning_rate=best_nodes["learning_rate"],
        n_estimators=int(best_nodes["n_estimators"]),
        max_depth=best_nodes["max_depth"],
        # eval_metric=best_nodes["eval_metric"],
        num_leaves=best_nodes["num_leaves"],
        subsample=best_nodes["subsample"],
        colsample_bytree=best_nodes["colsample_bytree"],
        min_child_samples=best_nodes["min_child_samples"],
        min_child_weight=best_nodes["min_child_weight"])
    rsg.fit(X_train_scaled, Y_train)
    Y_pred = rsg.predict(X_train_scaled)
    print("mse:", np.mean((Y_pred - Y_train)**2))
    print("rmse:", np.sqrt(np.mean((Y_pred - Y_train)**2)))
    return rsg
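# Illustrative only: train_lgb_model expects a dict shaped like the one a
# hyperparameter search (e.g. hyperopt) might return. The keys mirror the
# lookups above; every value below is invented for the example.
example_best_nodes = {
    "learning_rate": 0.05,
    "n_estimators": 300.0,   # cast to int inside train_lgb_model
    "max_depth": 7,
    "num_leaves": 63,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_samples": 20,
    "min_child_weight": 1e-3,
}
# rsg = train_lgb_model(example_best_nodes, X_train_scaled, Y_train)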
def lightGBM_model_with_test(X, Y):
    model = LGBMRegressor(num_leaves=36, n_estimators=100, learning_rate=0.07, random_state=0)
    useful_feature = get_useful_features_byLightBGM(X, Y)
    X_U = X[useful_feature]
    x1, x2, y1, y2 = train_test_split(X_U, Y, test_size=0.2)
    # fit on log1p-transformed targets, invert with expm1 at prediction time
    y1_log = np.log1p(y1)
    model.fit(x1, y1_log, verbose=True)
    predict_log = model.predict(x2)
    predict = np.expm1(predict_log)
    error = error_fun(predict, y2)[1]
    del x1, x2, y1, y2
    return error
def fit(self):
    if self.First_change:
        # Box-Cox transform of the (shifted) labels
        act = boxcox(self.train_label + 0.1)[0]
        self.act_ = boxcox(self.train_label + 0.1)[1]
    else:
        act = self.train_label
    steps = self.steps
    actual = act
    n_samples = len(self.train_label)
    y_pred_train = np.zeros(n_samples, np.float32)
    n_estimators_list = self.n_estimators_list
    for i in range(1):
        num = np.random.randint(0, 5000)
        print("----training begin----")
        for step in range(steps):
            print(step)
            actual = actual - y_pred_train  # residuals for this stage
            if step > 0:
                # squash the residual labels, then Box-Cox transform them
                actual_ = sigmod(actual)
                actual_box = boxcox(actual_)[0]
                actual_box_val = boxcox(actual_)[1]
                self.box_value.append(actual_box_val)
                actual_used = actual_box
            else:
                actual_used = actual
            # build the stage model
            model = LGBMRegressor(n_estimators=n_estimators_list[step],
                                  max_depth=3,
                                  learning_rate=0.02,
                                  subsample=1,
                                  colsample_bytree=1)
            model.fit(self.train.values, actual_used)  # train the stage model
            y_pred_train_ = model.predict(self.train.values)  # stage predictions
            if step > 0:
                # invert the Box-Cox and squashing transforms on the stage output
                y_pred_train = (y_pred_train_ * actual_box_val + 1)**(1 / actual_box_val)
                y_pred_train = sigmod_trans(y_pred_train)
            else:
                y_pred_train = y_pred_train_
            self.model_list.append(model)  # store the stage model
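# `sigmod` and `sigmod_trans` are helpers this class assumes but does not define
# here. A plausible pair, sketched under the assumption that they are a logistic
# squash and its inverse (the real definitions may differ):
import numpy as np

def sigmod(x):
    # map residuals into (0, 1) so Box-Cox, which needs positive input, applies
    return 1.0 / (1.0 + np.exp(-x))

def sigmod_trans(p):
    # inverse logistic (logit), mapping back to the residual scale
    return np.log(p / (1.0 - p))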
def modelingLGBM(hold_out_train, hold_out_test):
    from sklearn.linear_model import LassoCV as LaCV
    from sklearn.ensemble import RandomForestRegressor as RFR
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import RANSACRegressor
    from sklearn.neural_network import MLPRegressor as MLP
    from xgboost.sklearn import XGBRegressor as XGBR
    from xgboost.sklearn import DMatrix
    from lightgbm.sklearn import LGBMRegressor as LGBM

    traindata = hold_out_train.copy()
    testdata = hold_out_test.copy()
    traindata = traindata.drop(['Store', 'Customers', 'Date', 'Open',
                                'PromoInterval', 'monthstr'], axis=1)
    testdata = testdata.drop(['Store', 'Customers', 'Date', 'Open',
                              'PromoInterval', 'monthstr'], axis=1)
    train_x = traindata.drop(['Sales'], axis=1)
    train_y = np.log1p(traindata['Sales'])
    test_x = testdata.drop(['Sales'], axis=1)

    # # normalization
    # min_max_scaler = MinMaxScaler()
    # train_x = min_max_scaler.fit_transform(train_x)
    # test_x = min_max_scaler.fit_transform(test_x)

    smallest_rmspe = 1000
    subsamples = np.arange(0.5, 0.6, 0.1)
    for subsample in subsamples:
        time1 = time.time()
        lgbmModel = LGBM(n_estimators=8000, subsample=subsample)
        print(lgbmModel)
        lgbmModel.fit(train_x, train_y)
        sales_predict = lgbmModel.predict(test_x)
        rmspe = RMSPE(testdata['Sales'], np.expm1(sales_predict))
        print(rmspe)
        time2 = time.time()
        print('elapsed time:', (time2 - time1))
        if smallest_rmspe > rmspe:
            smallest_rmspe = rmspe
            best_model = lgbmModel
    return best_model
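# RMSPE is referenced here and in the model-integration function further below
# but is not defined in this section. A standard definition (root mean square
# percentage error, as used in the Rossmann sales competition), sketched on the
# assumption that zero-sales rows should be excluded from the denominator:
import numpy as np

def RMSPE(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0  # avoid dividing by zero sales
    return np.sqrt(np.mean(((y_true[mask] - y_pred[mask]) / y_true[mask]) ** 2))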
class b_model:
    # class-level configuration shared by all instances
    params = {
        'learning_rate': 0.015,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'num_leaves': 12,
        'max_depth': 9,
        'max_bin': 130,
        'feature_fraction': 0.9,
        'reg_lambda': 50,
        'min_data': 25,
        'min_child_weight': 0.001,
        'verbose': -1,
    }
    # columns excluded from the feature set ("血糖" is the blood-sugar target)
    no_use = [
        "血糖", "blood_sugar", "id", "blood_sugar_log", '体检日期',
        'feature_5_less_25', 'feature_4_less_60', '性别'
    ]

    def __init__(self):
        # parameters fixed at construction time
        self.model = LGBMRegressor(learning_rate=0.015,
                                   objective="regression",
                                   metric='mse',
                                   num_leaves=12,
                                   max_depth=9,
                                   max_bin=130,
                                   feature_fraction=0.9,
                                   reg_lambda=50,
                                   min_data=25,
                                   min_child_weight=0.001,
                                   num_boost_round=3000,
                                   random_state=42)

    def __make_feature(self, train, test):
        # feature construction: encode gender ("性别") as 0/1
        if train.empty:
            test['性别'] = test['性别'].map({'男': 1, '女': 0, '??': 1})
            return test
        if test.empty:
            train['性别'] = train['性别'].map({'男': 1, '女': 0, '??': 1})
            return train
        else:
            train_id = train.id.values.copy()
            test_id = test.id.values.copy()
            data = pd.concat([train, test])
            data['性别'] = data['性别'].map({'男': 1, '女': 0, '??': 1})
            train_feat = data[data.id.isin(train_id)]
            test_feat = data[data.id.isin(test_id)]
            return train_feat, test_feat

    def fit(self, X, y=None):
        X.drop(X[X["年龄"] >= 84].index, inplace=True)  # drop rows with age >= 84
        fea_train = pd.read_csv("./feature/fea_train.csv")
        fea_train1 = pd.read_csv("./feature/fea_train_1.csv")
        fea_train2 = pd.read_csv("./feature/fea_train_2.csv")
        X = pd.merge(X, fea_train, how="left", on="id")
        X = pd.merge(X, fea_train1, how="left", on="id")
        X = pd.merge(X, fea_train2, how="left", on="id")
        X = self.__make_feature(train=X, test=pd.DataFrame())
        if y is None:
            y = X["血糖"].values
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        X_train, X_test, y_train, y_test = train_test_split(X[predictors], y,
                                                            test_size=0.1,
                                                            random_state=42)
        self.model.fit(X_train[predictors], y_train,
                       eval_metric="mse",
                       early_stopping_rounds=100,
                       verbose=100,
                       eval_set=[(X_test[predictors], y_test)])
        from sklearn.metrics import mean_squared_error
        print("offline error: {}".format(0.5 * mean_squared_error(
            y_test, self.model.predict(X_test[predictors]))))
        return self

    def predict(self, X):
        # predict on the test set
        fea_test = pd.read_csv("./feature/fea_test.csv")
        fea_test1 = pd.read_csv("./feature/fea_test_1.csv")
        fea_test2 = pd.read_csv("./feature/fea_test_2.csv")
        X = pd.merge(X, fea_test, how="left", on="id")
        X = pd.merge(X, fea_test1, how="left", on="id")
        X = pd.merge(X, fea_test2, how="left", on="id")
        X = self.__make_feature(test=X, train=pd.DataFrame())
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        test_pred = self.model.predict(X[predictors])
        print("max prediction: {}".format(test_pred.max()))
        return test_pred

    def get_params(self):
        return self.params
""" from lightgbm.sklearn import LGBMRegressor from xgboost.sklearn import XGBRegressor from sklearn import ensemble from sklearn.metrics import mean_squared_error,mean_absolute_error import pandas as pd data = pd.read_csv('original_train.csv') test = pd.read_csv('original_test.csv') nn_train = pd.read_csv('nn_train_7day.csv') nn_test = pd.read_csv('nn_test_7day.csv') nn_train = nn_train[['nn_4', 'nn_8', 'nn_14', 'nn_7', 'nn_18', 'nn_16', 'nn_22', 'nn_15', '']] y_train = data.loc[90:,'count1'].values y_test = test['count1'] model = LGBMRegressor().fit(nn_train,y_train) y_pre = model.predict(nn_test) print(mean_squared_error(y_pre,y_test)) print(mean_absolute_error(y_pre,y_test)) print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), nn_train.columns), reverse=True))
clf = LGBMRegressor(  # constructor opening reconstructed; this snippet began mid-call
    learning_rate=0.1,
    num_leaves=255,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=2020,
    metric='RMSE',
    n_jobs=24,
)
clf.fit(
    X_trn, Y_trn,
    eval_set=[(X_val, Y_val)],
    early_stopping_rounds=200,
    verbose=1000,
)
oof[val_idx] = clf.predict(X_val)
sub += clf.predict(X_test) / skf.n_splits

sub = pd.DataFrame({
    'queryid': test.query_id,
    'documentid': test.doc_id,
    'predict_label': sub,
})
oof = pd.DataFrame({
    'query_id': train.query_id,
    'doc_id': train.doc_id,
    'oof': oof,
    'label': train.label,
})
ks_auc = pd.DataFrame()
for feature_num in feature_num_range:
    # measure the LightGBM model's KS and AUC on the test set for each feature_num
    chosen_feature = feat_imp.index[:feature_num]  # top feature_num features by importance
    lgbm_model.set_params(n_estimators=500)
    lgbm_param_temp = lgbm_model.get_params()
    lgbm_train = lgb.Dataset(X.loc[:, chosen_feature], Y)
    cvresult = lgb.cv(lgbm_param_temp, lgbm_train,
                      num_boost_round=lgbm_param_temp['n_estimators'],
                      nfold=5, metrics='auc', early_stopping_rounds=100)
    best_n_estimators_temp = len(cvresult['auc-mean'])
    lgbm_model.set_params(n_estimators=best_n_estimators_temp)
    lgbm_model.fit(X.loc[:, chosen_feature], Y, eval_metric='auc')
    preds = lgbm_model.predict(P_test.loc[:, chosen_feature])
    ks_value, bad_percent, good_percent = pf.cal_ks(-preds, y_test, section_num=20)
    false_positive_rate, recall, thresholds = roc_curve(y_test, preds)
    roc_auc = auc(false_positive_rate, recall)
    ks_auc = pd.concat([ks_auc, pd.DataFrame([np.max(ks_value), roc_auc]).T])
ks_auc.columns = ['ks', 'auc']
ks_auc.index = feature_num_range
print(ks_auc)
'''final_feature_num can be set to the feature_num with the highest KS + AUC sum, or chosen manually'''
    unuseful_feature.append(i[1])
use_features = [aa for aa in features if aa not in unuseful_feature]
print('useful:', len(use_features))
print('useless:', len(unuseful_feature))
print('total:', len(features))
train_X_1 = train_[use_features]
x1, x2, y1, y2 = train_test_split(train_X_1, train_y_1, test_size=0.2)
model_1 = LGBMRegressor(learning_rate=0.07, num_leaves=41, n_estimators=110, random_state=0)
model_1.fit(x1, y1.values.ravel(), verbose=True)
val_1 = model_1.predict(x2)
'''
preds_1 = model_1.predict(test_X)
'''
print(error_(val_1, y2))
val_1_error = error_(val_1, y2)
del x1, x2, y1, y2
gc.collect()

##############################################################
# second target: diastolic blood pressure ("舒张压")
train_y_2 = train_['舒张压']
model_2 = LGBMRegressor(num_leaves=36, n_estimators=140, random_state=0,
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values('RMSLE')
    return results

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}
model = LGBMRegressor(random_state=random_state)
my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5)

model = LGBMRegressor(max_depth=10, n_estimators=100, random_state=random_state)
model.fit(train, y)
prediction = model.predict(test)
prediction
prediction = np.expm1(prediction)
prediction

submission = pd.read_csv('sample_submission.csv')
submission.head()
submission['price'] = prediction
submission.head()
submission_csv_path = 'submission_{}_RMSLE_{}.csv'.format('lgbm', '0.164399')
submission.to_csv(submission_csv_path, index=False)
print(submission_csv_path)
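# my_GridSearch is called above but only its tail is shown in this section. A
# minimal sketch consistent with that tail, assuming it wraps sklearn's
# GridSearchCV with neg_mean_squared_error scoring on log1p targets (so
# RMSLE = sqrt(-score)); the real helper may take different arguments:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    grid = GridSearchCV(model, param_grid=param_grid,
                        scoring='neg_mean_squared_error',
                        cv=5, verbose=verbose, n_jobs=n_jobs)
    grid.fit(train, y)
    results = pd.DataFrame(grid.cv_results_['params'])
    results['score'] = grid.cv_results_['mean_test_score']
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values('RMSLE')
    return results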
def reg_model(labelled_data, unlabelled_data):
    """
    Parameters: training dataframe, unknown dataframe
    Returns: results dataframe (Instance, Income)

    ffill on NaN from training data,
    replaces NaN in test data with ffill,
    cat-encodes non-numeric fields,
    scales values,
    80/20 splits data to help verify model,
    uses LightGBM
    """
    # print("throwing away rows to speed up model")
    # speed up testing by throwing away some data
    # clean_labelled = labelled_data.sample(frac=0.2)
    clean_labelled = labelled_data.copy()
    clean_unlabelled = unlabelled_data.copy()

    print("cleaning data...")
    # get rid of weird value
    clean_labelled.loc[:, "Work Experience in Current Job [years]"] = pandas.to_numeric(
        labelled_data["Work Experience in Current Job [years]"], errors="coerce")
    clean_unlabelled.loc[:, "Work Experience in Current Job [years]"] = pandas.to_numeric(
        unlabelled_data["Work Experience in Current Job [years]"], errors="coerce")
    print("mixed type issue fixed..")

    # fix additional income field (strip the " EUR" suffix, coerce to numeric)
    clean_labelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_labelled["Yearly Income in addition to Salary (e.g. Rental Income)"],
        ), dtype=float), errors="coerce")
    clean_unlabelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_unlabelled["Yearly Income in addition to Salary (e.g. Rental Income)"],
        ), dtype=float), errors="coerce")

    # dropping useless columns
    drop_columns(clean_unlabelled)
    drop_columns(clean_labelled)

    # removing NaN values
    clean_labelled.fillna(method="ffill", inplace=True)
    clean_unlabelled = clean_unlabelled[all_columns]
    clean_unlabelled.fillna(method="ffill", inplace=True)

    # input data for final predictions
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split, and separating targets
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("encoding categorical data...")
    # categorical encoding
    cat = CatBoostEncoder()
    train_data = cat.fit_transform(train_data, train_target)
    test_data = cat.transform(test_data)
    unknown_data = cat.transform(unknown_data)

    # separate additional income
    train_add_income = train_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    test_add_income = test_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    unknown_add_income = unknown_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    train_data = train_data[no_income_columns]
    test_data = test_data[no_income_columns]
    unknown_data = unknown_data[no_income_columns]
    train_target = train_target[
        "Total Yearly Income [EUR]"].values - train_add_income
    test_target = test_target["Total Yearly Income [EUR]"].values

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("fitting model...")
    # fit model
    reg = LGBMRegressor()
    # reg = TransformedTargetRegressor(
    #     regressor=mod,
    #     transformer=scaler
    # )
    reg.fit(train_data, train_target)

    print("predicting test data...")
    test_result = reg.predict(test_data)
    # add additional income back
    test_result = test_result + test_add_income

    print("analysing test results...")
    # validate
    error = mean_absolute_error(test_target, test_result)
    score = explained_variance_score(test_target, test_result)
    print("Mean absolute error of test data: ", error)
    print("Score: ", score)

    print("predicting unknown data...")
    # predict and format
    values = reg.predict(unknown_data)
    values = values + unknown_add_income
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Total Yearly Income [EUR]": values
    })
    print("Finished.")
    return results
    n_jobs=24,
)
clf.fit(
    X_trn, Y_trn,
    sample_weight=W_trn,
    eval_set=[(X_val, Y_val)],
    eval_metric='rmse',
    eval_sample_weight=[W_val],
    early_stopping_rounds=200,
    categorical_feature=category_feats,
    verbose=100,
)
oof[val_idx] = clf.predict(X_val)
sub += clf.predict(X_sub) / gkf.n_splits
feat_imp_df['imp'] += clf.feature_importances_ / gkf.n_splits

# In[ ]:

pred_sub = search_f1(df_train.label, oof, sub)

# In[ ]:

plt.figure(figsize=(15, 30))
feat_imp_df = feat_imp_df.sort_values('imp', ignore_index=True)
sns.barplot(x='imp', y='feat', data=feat_imp_df)
plt.savefig('imp.png')

# In[ ]:
    with open(os.path.join(folder, "clf_A.pkl"), 'wb') as file:
        pickle.dump(clf_A, file)
    with open(os.path.join(folder, "clf_B.pkl"), 'wb') as file:
        pickle.dump(clf_B, file)
    with open(os.path.join(folder, "vectorizers.pkl"), 'wb') as file:
        pickle.dump(vectorizers, file)
elif sys.argv[1] == "load":
    print("Loading")
    with open(os.path.join(folder, "clf_A.pkl"), 'rb') as file:
        clf_A = pickle.load(file)
    with open(os.path.join(folder, "clf_B.pkl"), 'rb') as file:
        clf_B = pickle.load(file)
    with open(os.path.join(folder, "vectorizers.pkl"), 'rb') as file:
        vectorizers = pickle.load(file)

print("Loading test")
test, *_ = process_data("../data/test/", train=False, vectorizers=vectorizers)
T = test[features].values
T = np.stack([np.concatenate(T[i]) for i in range(T.shape[0])])

print("Predicting")
pred_A = clf_A.predict(T)
pred_B = clf_B.predict(T)
pd.DataFrame(np.stack([pred_A, pred_B]).T,
             index=test.index,
             columns=["Alice", "Bob"]).to_csv("../submitions/answer-B.csv")
    objective='reg:linear',
    min_child_weight=6,
    n_estimators=1000,
    max_depth=7,
    colsample_bytree=0.6)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
accuracy = xgb.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, xgb_pred)
mean_squared_error(y_test, xgb_pred)
np.sqrt(mean_squared_error(y_test, xgb_pred))

lgb = LGBMRegressor(objective='regression')
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
accuracy = lgb.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, lgb_pred)
mean_squared_error(y_test, lgb_pred)
np.sqrt(mean_squared_error(y_test, lgb_pred))

from yellowbrick.regressor import ResidualsPlot
visualizer = ResidualsPlot(xgb)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer

rid_pred_t = rid.predict(X_train)
la_pred_t = la.predict(X_train)
plt.scatter(la_pred_t, y_train, c="blue", marker="s", label="Training data")
x_train = all_data[:2888]
x_test = all_data[2888:]

# validation / train split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  random_state=42)

########################################################################
print('start ML')
score = []
train_range = range(1, 1000, 10)
for i in train_range:
    print(i)
    lgr = LGBMRegressor(learning_rate=0.05,
                        n_estimators=i,
                        subsample=0.8,
                        subsample_freq=1,
                        colsample_bytree=0.8,
                        random_state=2019)
    lgr.fit(x_train, y_train)
    mse = mean_squared_error(y_val, lgr.predict(x_val))
    # print(mse)
    score.append(mse)

plt.plot(train_range, score)
result = pd.DataFrame(lgr.predict(x_test))
result.to_csv('sub_8-6.txt', index=False, header=0)
print("BEST PARAMETERS: " + str(best_params)) # Print best CV score scores = [-trial['result']['loss'] for trial in trials.trials] print("BEST CV SCORE: " + str(np.max(scores))) # Print execution time tdiff = trials.trials[-1]['book_time'] - trials.trials[0]['book_time'] print("ELAPSED TIME: " + str(tdiff.total_seconds() / 60)) # Set params est.set_params(**best_params) # Fit est.fit(X_train, y_train) y_pred = est.predict(X_test) # Predict score = r2_score(y_test, y_pred) print("R2 SCORE ON TEST DATA: {}".format(score)) #============================================================================== # Tree structure of hyperparameter space (Optional) #============================================================================== # You must change the evaluate function in order to extract learning rate # and n_estimators from choices. Please add the following code to the start of # evaluate function # # Choices # if 'choices' in params.keys(): # params['learning_rate'] = params['choices']['learning_rate'] # params['n_estimators'] = params['choices']['n_estimators']
for i in range(len(duration)):
    duration_hours.append(int(duration[i].split(sep="h")[0]))
    duration_mins.append(int(duration[i].split(sep="m")[0].split(sep="h")[-1]))

X["Duration_hours"] = duration_hours
X["Duration_mins"] = duration_mins
X.drop(["Duration"], axis=1, inplace=True)
X.drop(["Dep_Time"], axis=1, inplace=True)
X.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4},
          inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from lightgbm.sklearn import LGBMRegressor
reg = LGBMRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

filename = 'flightfare.pkl'
pickle.dump(reg, open(filename, 'wb'))
def modelIntegrated(hold_out_train, hold_out_test, test):
    traindata = hold_out_train.copy().drop(['Store', 'Customers', 'Date', 'Open',
                                            'PromoInterval', 'monthstr'], axis=1)
    train_x = traindata.drop(['Sales'], axis=1)
    train_y = np.log1p(traindata['Sales'])
    testdata = hold_out_test.copy()
    testdata = testdata.drop(['Store', 'Customers', 'Date', 'Open',
                              'PromoInterval', 'monthstr'], axis=1)
    ho_test_x = testdata.drop(['Sales'], axis=1)
    ho_test_y = testdata['Sales']
    finaltest_x = test.copy().drop(['Id', 'Store', 'Date', 'Open',
                                    'PromoInterval', 'monthstr'], axis=1)
    predictions = []
    RMSPES = []
    start = time.time()
    k = 5
    for i in range(0, k):
        lgbmModel = LGBM(n_estimators=8000, subsample=0.8, random_state=i)
        lgbmModel.fit(train_x, train_y)
        sales_predict = lgbmModel.predict(ho_test_x)
        final_predict = lgbmModel.predict(finaltest_x)
        test['sales_predict'] = np.expm1(final_predict)
        smallest_rmspe = RMSPE(testdata['Sales'], np.expm1(sales_predict))
        print(smallest_rmspe)
        hold_out_test['sales_predict'] = np.expm1(sales_predict)
        res = hold_out_test[['Store', 'Date', 'Sales', 'sales_predict']]
        # res2 = hold_out_test[['Store', 'Date', 'Sales', 'sales_predict']]
        # showFigure(res)
        res.loc[:, 'errorabs'] = abs((res['sales_predict'] - res['Sales']) / res['Sales'])
        res.loc[:, 'error'] = ((res['sales_predict'] - res['Sales']) / res['Sales'])
        res.sort_values(['errorabs'], ascending=False, inplace=True)
        # print(res[res['error'] >= 0].count())
        # print(res[res['error'] <= 0].count())
        # b_w = 0.900
        # for j in range(1, 101):
        #     predict = sales_predict * (0.900 + j / 1000)
        #     rmspe = RMSPE(testdata['Sales'], np.expm1(predict))
        #     if rmspe < smallest_rmspe:
        #         b_w = 0.900 + j / 1000
        #         smallest_rmspe = rmspe
        # res2.loc[:, 'sales_predict'] = np.expm1(predict)
        # print(smallest_rmspe)
        # print(b_w)
        # showFigure(res2)

        # fit a per-store correction weight w on the hold-out set
        stores = range(1, 1116)
        hold_out_test['w'] = 1
        for store in stores:
            s1 = pd.DataFrame(hold_out_test[hold_out_test['Store'] == store],
                              columns=['Store', 'Date', 'Sales', 'sales_predict'])
            s = []
            for j in range(1, 201):
                error = RMSPE(s1.Sales, s1.sales_predict * (0.800 + j / 1000))
                s.append(error)
            score = pd.Series(s, index=[(0.800 + j / 1000) for j in range(1, 201)])
            BS = score[score.values == score.values.min()]
            a = np.array(BS.index.values)
            hold_out_test.loc[hold_out_test['Store'] == store, 'w'] = a
            test.loc[test['Store'] == store, 'w'] = a
        res3 = hold_out_test[['Store', 'Date', 'Sales', 'sales_predict', 'w']]
        res3['sales_predict'] = hold_out_test['sales_predict'] * hold_out_test['w']
        RMSPES.append(RMSPE(res3['Sales'], res3['sales_predict']))
        finalres = test[['Id']]
        finalres['Sales'] = test['sales_predict'] * test['w']
        predictions.append(finalres['Sales'])
    print(RMSPES)

    # average the k weighted submissions
    finalres = test[['Id']]
    finalres['Sales'] = 0
    for i in range(0, k):
        finalres['Sales'] += predictions[i]
    finalres['Sales'] = finalres['Sales'] / k
    end = time.time()
    print((end - start))
    finalres.to_csv('../submissionResult/submissionResult_lightGBM_mean.csv', index=False)