def LGBR_optimization(train, bindingEnergy_train, test, bindingEnergy_test, cpus):
    """Pick the learning rate that gives the best test-set R2 for LightGBM.

    One ``LGBMRegressor`` (10000 trees, ``max_depth=15``, ``random_state=0``)
    is fitted per candidate learning rate and scored on both splits.

    Args:
        train: Training features. Must support ``rename(columns=...)`` and
            have string column labels (they are passed to ``re.sub``).
        bindingEnergy_train: Training targets.
        test: Test features, same layout as ``train``.
        bindingEnergy_test: Test targets.
        cpus: Forwarded to ``LGBMRegressor(n_jobs=...)``.

    Returns:
        ``(best_lr, best_train_R2, best_test_R2)`` where ``best_lr`` is the
        first candidate reaching the highest test R2 and the two scores are
        the train/test R2 obtained with it.
    """
    lr_list = [0.01, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

    def _sanitize(name):
        # Keep only letters, digits and underscores in a column label
        # (presumably because LightGBM rejects special characters in
        # feature names -- confirm against the installed version).
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train_light = train.rename(columns=_sanitize)
    test_light = test.rename(columns=_sanitize)

    R2_train = []
    R2_test = []
    for learning_rate in lr_list:
        lgb_reg = LGBMRegressor(
            n_estimators=10000,
            learning_rate=learning_rate,
            max_depth=15,
            random_state=0,
            n_jobs=cpus,
        )
        lgb_reg.fit(train_light, bindingEnergy_train)
        R2_train.append(lgb_reg.score(train_light, bindingEnergy_train))
        R2_test.append(lgb_reg.score(test_light, bindingEnergy_test))

    # NOTE(review): the learning rate is selected on the test split, so the
    # returned test R2 is an optimistic estimate.
    best_idx = R2_test.index(max(R2_test))
    return lr_list[best_idx], R2_train[best_idx], R2_test[best_idx]
def model_lightgbm_regressor(X_train, X_test, y_train, y_test):
    """Fit a default ``LGBMRegressor``, print its test score and save it.

    The model is written to ``model/model_<count>_lightgbm_regressor.joblib``
    where ``count`` is read from module scope.

    Args:
        X_train: Training features.
        X_test: Test features used for scoring.
        y_train: Training targets.
        y_test: Test targets used for scoring.

    Returns:
        None.
    """
    import os

    model_name = f'model_{count}_lightgbm_regressor'

    model = LGBMRegressor()
    model.fit(X_train, y_train)

    # ``score`` of a regressor is R2, even though the message says accuracy.
    score = model.score(X_test, y_test)
    print(f'{model_name} accuracy: {score}')

    # joblib.dump does not create missing directories.
    os.makedirs('model', exist_ok=True)
    joblib.dump(model, f'model/{model_name}.joblib')
def lgbm_regressor(x_trn: pd.DataFrame, y_trn: np.ndarray,
                   x_val: pd.DataFrame, y_val: np.ndarray) -> tuple:
    """Fit a fixed-configuration ``LGBMRegressor`` and score both splits.

    Args:
        x_trn: Training features.
        y_trn: Training targets.
        x_val: Validation features.
        y_val: Validation targets.

    Returns:
        ``(model, training_score, validation_score)`` where both scores come
        from ``model.score`` on the respective split.
    """
    # The inputs are only read (fit / score), so no defensive copies are made.
    #
    # NOTE(review): LightGBM documents ``n_estimators`` as an alias of
    # ``num_iterations``, so the two values below compete; confirm which one
    # the installed version honours before relying on the tree count.
    model = LGBMRegressor(
        boosting_type='gbdt',
        objective='regression',
        metric='mse',
        n_estimators=400,
        learning_rate=0.05,
        min_child_samples=3,
        num_iterations=700,
        n_jobs=-1,
        random_state=7,
    )
    model.fit(x_trn, y_trn)

    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)
    return model, training_score, validation_score
def LGBR(train, bindingEnergy_train, test, bindingEnergy_test, best_lr, cpus):
    """Fit LightGBM with a given learning rate and evaluate it on the test set.

    Args:
        train: Training features. Must support ``rename(columns=...)`` and
            have string column labels (they are passed to ``re.sub``).
        bindingEnergy_train: Training targets.
        test: Test features, same layout as ``train``.
        bindingEnergy_test: Test targets.
        best_lr: Learning rate for the regressor.
        cpus: Forwarded to ``LGBMRegressor(n_jobs=...)``.

    Returns:
        ``(predictions, R2_test, MSE, MAE)`` computed on the test split.
    """
    def _sanitize(name):
        # Keep only letters, digits and underscores in a column label.
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train_light = train.rename(columns=_sanitize)
    test_light = test.rename(columns=_sanitize)

    lgb_reg = LGBMRegressor(
        n_estimators=10000,
        learning_rate=best_lr,
        max_depth=15,
        random_state=0,
        n_jobs=cpus,
    )
    lgb_reg.fit(train_light, bindingEnergy_train)

    predictions = lgb_reg.predict(test_light)
    R2_test = lgb_reg.score(test_light, bindingEnergy_test)
    MSE = mean_squared_error(bindingEnergy_test, predictions)
    MAE = mean_absolute_error(bindingEnergy_test, predictions)
    return predictions, R2_test, MSE, MAE
def treinaML(df):
    """Train a default ``LGBMRegressor`` that predicts ``qtd`` from calendar columns.

    The frame is split 70/30 with a fixed seed; R2 is printed first for the
    training split and then for the test split.

    Args:
        df: Frame containing the feature columns listed below and the
            target column ``'qtd'``.

    Returns:
        The fitted ``LGBMRegressor``.
    """
    feature_columns = [
        'feriado', 'dia', 'mes', 'ano', 'diaAno', 'diaSemana', 'diaUtil',
        'segDia5', 'segDia10', 'diaDeProducao', 'seg', 'diaUtil5',
        'diaUtil10', 'inicioSemana', 'semanaAno', 'inicioMes',
    ]
    x = df[feature_columns]
    y = df['qtd']

    SEED = 5
    # Kept from the original: also seeds NumPy's global generator.
    np.random.seed(SEED)
    x_treino, x_teste, y_treino, y_teste = train_test_split(
        x, y, test_size=0.30, random_state=SEED)
    print("Treinaremos com %d elementos e testaremos com %d elementos"
          % (len(x_treino), len(x_teste)))

    lgb_model = LGBMRegressor()
    lgb_model.fit(x_treino, y_treino)

    # Builtin round() works whether the score is a Python float or a NumPy
    # scalar; ``.round`` only exists on the latter.
    print('R² = {}'.format(round(lgb_model.score(x_treino, y_treino), 3)))

    y_previsto = lgb_model.predict(x_teste)
    print('R² = %s' % round(metrics.r2_score(y_teste, y_previsto), 3))
    return lgb_model
le = preprocessing.LabelEncoder() X[cat_cols] = X[cat_cols].apply(lambda col: le.fit_transform(col.astype(str))) cat_cols = df_test_external.dtypes==object cat_cols = df_test_external.columns[cat_cols].tolist() le = preprocessing.LabelEncoder() df_test_external[cat_cols] = df_test_external[cat_cols].apply(lambda col: le.fit_transform(col.astype(str))) gbm=LGBMRegressor(objective='regression',learning_rate=0.05, n_estimators=300) # train,Y) print('Accuracy of gbm regression on training set: {:.2f}' .format(gbm.score(X, Y))) Y_pred_gbm=gbm.predict(df_test_external) gbmResult = {'Id':Test_T_ID_external, 'SalePrice':Y_pred_gbm} df_gbmResult = pd.DataFrame(gbmResult) df_gbmResult.head() df_gbmResult.shape df_gbmResult.to_csv('submissionLgbm_external.csv',index=False) """Describe the dataset and whether this data helps with prediction. The dataset is an extension of the Ames Housing dataset which was compiled by Dean De Cock. Reference: Source:
# Model evaluation: RMSE of the predictions made before this point.
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred)**0.5)

# Feature importances of the model fitted before this point.
print('Feature importances:', list(gbm.feature_importances_))

# Grid search over learning rate and number of trees.
estimator = LGBMRegressor(num_leaves=31)
param_grid = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)

# Score of the refitted best estimator on the test split.
gbm_score = gbm.score(X_test, y_test)
print('准确率:', gbm_score)

# Predicted vs. actual values.
# NOTE(review): y_pred was computed before the grid search, so these plots
# show the earlier model, not the tuned one -- confirm that is intended.
plt.figure()
plt.plot(range(len(y_pred)), y_pred, 'red', linewidth=2.5, label="predict data")
plt.plot(range(len(y_test)), y_test, 'green', label="test data")
# Without a legend the labels given above are never displayed.
plt.legend()

# Residuals (prediction minus truth).
plt.figure()
y = y_pred - y_test
plt.plot(y)
# Best estimator and score from the preceding grid search.
model = gridsearch.best_estimator_
score = gridsearch.best_score_
# The score is negated before printing -- presumably the search used a
# negated-error scoring; confirm against the GridSearchCV configuration.
rmse_scores = (-score)
print(best_params)
print(rmse_scores)
print(model)

from sklearn.metrics import mean_squared_error

# NOTE(review): LightGBM documents that bagging_fraction only takes effect
# when bagging_freq is non-zero; no bagging_freq is set here.
lgbm = LGBMRegressor(random_state=0, learning_rate=0.1, max_depth=4,
                     n_estimators=100, num_leaves=30, min_data_in_leaf=10,
                     max_bin=100, lambda_l1=0.001, lambda_l2=0.001,
                     feature_fraction=0.8, bagging_fraction=0.6)
lgbm.fit(x_pp_train, y_pp_train)

# ``score`` of a regressor is R2, even though the messages say accuracy.
print("Accuracy on training set: {:.3f}".format(lgbm.score(x_pp_train, y_pp_train)))
print("Accuracy on test set: {:.3f}".format(lgbm.score(x_pp_test, y_pp_test)))

y_pred = lgbm.predict(x_pp_test)
y_pred_train = lgbm.predict(x_pp_train)

# Take the square root explicitly: the ``squared=False`` keyword of
# mean_squared_error is not accepted by current scikit-learn releases.
print("RMSE on train :{:.3f}".format(np.sqrt(mean_squared_error(y_pp_train, y_pred_train))))
print("RMSE on test :{:.3f}".format(np.sqrt(mean_squared_error(y_pp_test, y_pred))))

# Feature importances as reported by LightGBM. With the default
# importance_type these are split counts, not mean decrease in impurity.
features = x_pp_train.columns
importances = lgbm.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(20,100))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
# Collapse the target columns to their first whitened principal component.
pca = PCA(n_components=1, whiten=True, random_state=60).fit(y_train)
y_train_pca = pca.transform(y_train)
y_test_pca = pca.transform(y_test)

# transform() returns one column; flatten it without hard-coding the
# number of rows.
y_train_pca = y_train_pca.ravel()
y_test_pca = y_test_pca.ravel()

# ``colsample_bylevel`` is not a LightGBM parameter, so it is not passed.
model = LGBMRegressor(n_estimators=1000, learning_rate=0.05, max_depth=5,
                      colsample_bytree=0.7)
model.fit(x_train, y_train_pca)

score = model.score(x_test, y_test_pca)
print("R2:", score)

# thresholds = np.sort(model.feature_importances_)  # ascending sort of the importances
# print(thresholds)
# models=[]
# res = np.array([])
# for thresh in thresholds:
#     selection = SelectFromModel(model, threshold=thresh, prefit=True)
#     select_x_train = selection.transform(x_train)
#     select_x_test = selection.transform(x_test)
#     model2 = LGBMRegressor(n_estimators=500, learning_rate=0.1, n_jobs=-1)
#, y_train_pca, verbose=False, eval_metric=['logloss','rmse'],
lgbm = LGBMRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1), y_train, verbose=True, eval_metric=["logloss", "rmse"], eval_set=[(x_train, y_train), (x_test, y_test)], early_stopping_rounds=20) #rmse,mae,logloss,error,auc y_pre = lgbm.predict(x_test) r2 = r2_score(y_test, y_pre) score = lgbm.score(x_test, y_test) print(__file__) print("r2") print(r2) print("score") print(score) #6)selectFromModel thresholds = np.sort(lgbm.feature_importances_) idx_max = -1 max = r2 for idx, thresh in enumerate(thresholds): #데이터 전처리
### 데이터 ### x, y = load_boston(return_X_y=True) print(x.shape) # (506, 13) print(y.shape) # (506, ) x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, random_state=66) ### 기본 모델 ### model = LGBMRegressor(n_estimators=300, learning_rate=0.1, n_jobs=-1), y_train) score = model.score(x_test, y_test) print('R2 :', score) #== Default R2 : 0.9313126937746082 ==# ### feature engineering ### thresholds = np.sort(model.feature_importances_) print(thresholds) models = [] res = np.array([]) for thresh in thresholds: selection = SelectFromModel(model, threshold=thresh, prefit=True) select_x_train = selection.transform(x_train)
from sklearn.model_selection import train_test_split, RandomizedSearchCV from sklearn.metrics import r2_score import matplotlib.pyplot as plt import pickle dataset = load_boston() x = y = x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=66) model = LGBMRegressor(), y_train) score = model.score(x_test, y_test) # print(score) thresholds = np.sort(model.feature_importances_) # print(thresholds) models = [] # 빈 모델 배열 생성 res = np.array([]) #빈 결과값 배열 생성 for thres in thresholds: selection = SelectFromModel(model, threshold=thres, prefit=True) #중요하지 않는 컬럼부터 하나씩 빼면서 트레이닝한다 #median selection_x_train = selection.transform(x_train) model2 = LGBMRegressor(n_estimators=1000) selection_x_test = selection.transform(x_test),
'n_estimators': range(100, 300, 50), 'eta': [0.1, 0.2], 'max_depth': range(3, 10, 1), 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0] } rand_search_xgb = RandomizedSearchCV(estimator=xgbreg, param_distributions=rand_param_xgb, verbose=1, n_jobs=-1, n_iter=200, cv=8), y_train) best_param = rand_search_xgb.best_params_ best_param xgbreg = XGBRegressor(subsample=0.5, n_estimators=150, max_depth=4, eta=0.1), y_train) xgbreg.score(X_test, y_test) #Light GBM REgressor lgbm = LGBMRegressor(), y_train) lgbm.score(X_test, y_test) lgbm.score(X_train, y_train) X_train.columns #Saving models joblib.dump(ranreg, '') joblib.dump(xgbreg, '') joblib.dump(lgbm, '')
#score_rfr = rfr.score(X_test, y_test), y_train) preds = gbr.predict(X_test) preds_test_gbr = gbr.predict(X_test_sub) mae_gbr = mean_absolute_error(y_test, preds) rmse_gbr = np.sqrt(mean_squared_error(y_test, preds)) score_gbr = gbr.score(X_test, y_test) cv_gbr = mean_cross_val(gbr, X_train1, y), y_train) preds = lgbm.predict(X_test) preds_test_lgbm = lgbm.predict(X_test_sub) mae_lgbm = mean_absolute_error(y_test, preds) rmse_lgbm = np.sqrt(mean_squared_error(y_test, preds)) score_lgbm = lgbm.score(X_test, y_test) cv_lgbm = mean_cross_val(lgbm, X_train1, y) """, y_train) preds = xgb.predict(X_test) preds_test_xgb = xgb.predict(X_test_sub) mae_xgb = mean_absolute_error(y_test, preds) rmse_xgb = np.sqrt(mean_squared_error(y_test, preds)) score_xgb = xgb.score(X_test, y_test) cv_xgb = mean_cross_val(xgb, X_train1, y) """, y_train) preds = cb.predict(X_test) preds_test_cb = cb.predict(X_test_sub) mae_cb = mean_absolute_error(y_test, preds)
class LGBM:
    """LightGBM regressor trained on a map of per-feature frames.

    ``data_map`` maps a name to a frame whose rows are split positionally
    into train/validation and whose columns are the individual series
    (called "stocks" in this class). One entry, ``target_col``, is the
    regression target; every other entry is one input feature.
    """

    def __init__(self, params):
        """Build the underlying ``LGBMRegressor``.

        Args:
            params: Mapping with the keys ``learning_rate``,
                ``n_estimators``, ``num_leaves``, ``min_gain_to_split``,
                ``max_depth``, ``bagging_freq``, ``bagging_fraction``,
                ``feature_fraction``, ``reg_alpha``, ``reg_lambda``,
                ``n_jobs`` and ``boosting_type``. ``min_data_in_leaf`` is
                optional.
        """
        self.name = "lgbm"
        self.model = LGBMRegressor(
            learning_rate=params['learning_rate'],
            n_estimators=params['n_estimators'],
            # key params
            num_leaves=params['num_leaves'],
            # Read the dedicated key when present; fall back to the value
            # previously used (num_leaves) so older param dicts still work.
            min_data_in_leaf=params.get('min_data_in_leaf',
                                        params['num_leaves']),
            max_depth=params['max_depth'],
            min_split_gain=params['min_gain_to_split'],
            # speed vs accuracy trade-offs
            bagging_fraction=params['bagging_fraction'],
            bagging_freq=params['bagging_freq'],
            # Must be spelled ``feature_fraction``; ``feature_frac`` is not
            # a LightGBM parameter.
            feature_fraction=params['feature_fraction'],
            # regularisation
            reg_alpha=params['reg_alpha'],
            reg_lambda=params['reg_lambda'],
            n_jobs=params['n_jobs'],
            boosting_type=params['boosting_type'],
        )
        self.target_col = None

    def _split_data_maps(self, data_map, split_fraction):
        """Split every frame positionally into a leading and a trailing part.

        Args:
            data_map: Mapping of name to frame; all frames are cut at the
                same row, derived from the first frame in sorted key order.
            split_fraction: Fraction of rows that goes into the first map.

        Returns:
            ``(train, test)``: two dicts with the same keys as ``data_map``.
        """
        order = sorted(data_map)
        length = len(data_map[order[0]])
        splitpoint = int(length * split_fraction)
        train = {k: data_map[k].iloc[:splitpoint] for k in order}
        test = {k: data_map[k].iloc[splitpoint:] for k in order}
        return train, test

    def _format_data(self, data_map):
        """Stack the feature frames into one 2-D input matrix.

        Rows are ordered column by column of the source frames (all rows of
        column 0, then all rows of column 1, ...); matrix columns are the
        feature names in sorted order, excluding ``target_col``.

        Raises:
            ValueError: If ``target_col`` has not been set.
        """
        if self.target_col is None:
            raise ValueError("Target col is None!")
        order = sorted(k for k in data_map if k != self.target_col)
        num_stocks = data_map[order[0]].shape[-1]
        inputs = []
        for i in range(num_stocks):
            stock_data = [data_map[k].iloc[:, i].values.reshape(-1, 1)
                          for k in order]
            inputs.append(np.concatenate(stock_data, axis=1))
        return np.concatenate(inputs, axis=0)

    def _format_target(self, data_map):
        """Return the target as a column vector aligned with ``_format_data``.

        Raises:
            ValueError: If ``target_col`` has not been set.
        """
        if self.target_col is None:
            raise ValueError("Target col is None!")
        target = np.asarray(data_map[self.target_col].values)
        # Column-major flattening puts all rows of column 0 first, then
        # column 1, ... -- the same row order _format_data produces.
        return target.reshape(-1, 1, order='F')  # stacked targets

    def fit(self, data_map, target_col, valid_fraction=0.2, rs_iterations=-1):
        """Fit the model, optionally via random search, and score it.

        Args:
            data_map: Mapping of name to frame (see the class docstring).
            target_col: Key of ``data_map`` holding the target.
            valid_fraction: Fraction of trailing rows held out for
                validation; ``0`` trains on everything.
            rs_iterations: When positive, wrap the model in a
                ``RandomizedSearchCV`` with at most this many iterations.

        Returns:
            R2 on the validation rows, or ``0.0`` when ``valid_fraction``
            is ``0``.
        """
        print("Formatting data")
        self.target_col = target_col
        print("Splitting data map")
        # _split_data_maps takes the fraction of rows for the FIRST part,
        # i.e. the training share.
        train_map, valid_map = self._split_data_maps(
            data_map, 1.0 - valid_fraction)
        y = self._format_target(train_map)
        X = self._format_data(train_map)
        print("Fitting", self.name)
        if rs_iterations > 0:
            param_dist = self.get_hyperparam_ranges()
            # Never ask for more iterations than there are combinations.
            param_combinations = int(
                np.prod([len(values) for values in param_dist.values()]))
            rs_iterations = min(param_combinations, rs_iterations)
            print("Running {} iterations of random search".format(
                rs_iterations))
            self.model = select.RandomizedSearchCV(
                self.model,
                param_distributions=param_dist,
                n_iter=rs_iterations,
                cv=3,
                n_jobs=2)
        self.model.fit(X, y)
        if valid_fraction != 0:
            y_valid = self._format_target(valid_map)
            X_valid = self._format_data(valid_map)
            print("Scoring on validation data")
            r2 = self.model.score(X_valid, y_valid)
            print("R2 for {}:".format(self.name), r2)
            return r2
        print("No validation data")
        return 0.0

    def predict(self, data_map):
        """Predict for every stacked row of ``data_map``."""
        X = self._format_data(data_map)
        return self.model.predict(X)

    def get_save_name(self, model_folder):
        """Return the path of this model's joblib file inside ``model_folder``."""
        return os.path.join(model_folder, self.name + ".joblib")

    def save(self, model_folder):
        """Dump the current model with joblib."""
        name = self.get_save_name(model_folder)
        joblib.dump(self.model, name)

    def load(self, model_folder):
        """Replace the current model with the one stored in ``model_folder``."""
        name = self.get_save_name(model_folder)
        self.model = joblib.load(name)

    @classmethod
    def get_hyperparam_ranges(cls):
        """Return the candidate values used by the random search in ``fit``."""
        param_grid = {
            'max_depth': [-1],
            'min_data_in_leaf': [20, 40, 80],
            'num_leaves': [8, 16, 32, 64, 128],
            'learning_rate': [1.0, 0.1, 0.05, 0.01],
            'n_estimators': [50, 100, 200],
            'feature_fraction': [0.2, 0.4, 0.6, 0.8],
            'bagging_freq': [0],  # disables
            'bagging_fraction': [1.0],  # disables
            'reg_alpha': [0.0, 1.0, 0.1, 0.01],
            'reg_lambda': [0.0, 1.0, 0.1, 0.01],
            'min_gain_to_split': [0.001],
            'n_jobs': [2],
            'boosting_type': ['gbdt', 'dart']
        }
        return param_grid
'subsample': [0.7] }] settings = { 'verbose': False, 'eval_set': [(x_train, y_train), (x_test, y_test)] } kfold = KFold(n_splits=5, shuffle=True, random_state=66) # 모델 컬럼별 4번 for i in range(4): model = LGBMRegressor() settings['eval_set'] = [(x_train, y_train[:, i]), (x_test, y_test[:, i])], y_train[:, i], **settings) y_test_pred = model.predict(x_test) score = model.score(x_test, y_test[:, i]) mae = MAE(y_test[:, i], y_test_pred) print("r2 : ", score) print("mae :", mae) thresholds = np.sort(model.feature_importances_)[[ i for i in range(0, len(model.feature_importances_), 20) ]] print("model.feature_importances_ : ", model.feature_importances_) print(thresholds) best_mae = mae best_model = model best_y_pred = model.predict(x_pred) best_y_test_pred = y_test_pred print(best_y_pred.shape) for thresh in thresholds: if (thresh == 0): continue
def train(self, model='lr'):
    """Fit one model family on an 80/20 split and print its held-out score.

    The split is not seeded, so scores vary between calls. A previous
    logistic-regression run was recorded as scoring 0.80048125.

    Args:
        model: Which model to train:
            ``'lr'``   logistic regression (saved with joblib),
            ``'svm'``  RBF support-vector classifier (saved),
            ``'ada'``  AdaBoost over depth-4 trees (saved),
            ``'rf'``   placeholder, does nothing,
            ``'gbm'``  LightGBM regressor (not saved),
            ``'gbdt'`` gradient-boosting classifier (saved; also prints
                       accuracy and AUC).
            Any other value falls through without training.

    Returns:
        None.
    """
    train_x, train_y, _ = self.load_train_x_train_y_test_x()
    train_xx, test_xx, train_yy, test_yy = train_test_split(
        train_x, train_y, train_size=0.8)

    if model == 'lr':
        lr = LogisticRegression(penalty='l2', solver='liblinear', C=1, verbose=1)
        lr.fit(train_xx, train_yy.reshape(-1))
        score = lr.score(test_xx, test_yy)
        joblib.dump(lr, os.path.join(
            daikuan_path,
            'lr_model_time_{}_score_{}'.format(int(time.time()), score)))
        print('lr score', score)
    elif model == 'svm':
        svc = SVC(C=1, kernel='rbf', verbose=True, max_iter=100)
        svc.fit(train_xx, train_yy.reshape(-1))
        score = svc.score(test_xx, test_yy)
        joblib.dump(svc, os.path.join(
            daikuan_path,
            'svc_model_time_{}_score_{}'.format(int(time.time()), score)))
        print('svm score', score)
    elif model == 'ada':
        ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4),
                                 n_estimators=400, random_state=7)
        ada.fit(train_xx, train_yy.reshape(-1))
        score = ada.score(test_xx, test_yy)
        joblib.dump(ada, os.path.join(
            daikuan_path,
            'ada_model_time_{}_score_{}'.format(int(time.time()), score)))
        print('ada score', score)
    elif model == 'rf':
        pass
    elif model == 'gbm':
        lgbm = LGBMRegressor(num_leaves=30,
                             max_depth=5,
                             learning_rate=.02,
                             n_estimators=1000,
                             subsample_for_bin=5000,
                             min_child_samples=200,
                             colsample_bytree=.2,
                             reg_alpha=.1,
                             reg_lambda=.1)
        # Labels are flattened like in every other branch.
        lgbm.fit(train_xx, train_yy.reshape(-1))
        # A regressor's score is R2, not classification accuracy.
        score = lgbm.score(test_xx, test_yy)
        print('lgbm score', score)
    elif model == 'gbdt':
        # The GridSearchCV that used to be built here was never fitted and
        # passed ``iid=False``, which current scikit-learn rejects, so it
        # is no longer constructed.
        gbdt = GradientBoostingClassifier(n_estimators=192, learning_rate=0.1,
                                          min_samples_split=300,
                                          min_samples_leaf=20, verbose=1)
        gbdt.fit(train_xx, train_yy.reshape(-1))
        y_pred = gbdt.predict(test_xx)
        y_predprob = gbdt.predict_proba(test_xx)[:, 1]
        print('accuracy', metrics.accuracy_score(test_yy, y_pred))
        print('AUC', metrics.roc_auc_score(test_yy, y_predprob))
        score = gbdt.score(test_xx, test_yy)
        joblib.dump(gbdt, os.path.join(
            daikuan_path,
            'gbdt_model_time_{}_score_{}'.format(int(time.time()), score)))
        print('gbdt score', score)
# score1 : 72.2789 # mae1 : 1.1567 # score2 : 22.8805 # mae2 : 0.6918 # score3 : 26.0999 # mae3 : 2.0683 # score4 : 17.5051 # mae4 : 1.3463, y_train1, verbose=False, eval_metric=['logloss'], eval_set=[(x_test, y_test1)], early_stopping_rounds=20) score1 = model.score(x_test, y_test1) print("score1 : %.4f" % (score1 * 100.0)) # print(model.feature_importances_) y_pred_1 = model.predict(x_test) mae1 = mean_absolute_error(y_test1, y_pred_1) print('mae1 : %.4f' % (mae1)) y_pred1 = model.predict(x_pred), y_train2, verbose=False, eval_metric=['logloss'], eval_set=[(x_test, y_test2)], early_stopping_rounds=20) score2 = model.score(x_test, y_test2) print("score2 : %.4f" % (score2 * 100.0))
'num_leaves': [32, 48, 64, 80], 'learning_rate': [0.01, 0.05, 1] } #%% lgbm_reg = LGBMRegressor() grid_cv = GridSearchCV(lgbm_reg, param_grid=params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'), y_train) # 평가지표를 MSE(오차제곱의 평균값)를 이용함, 이값이 낮아야 좋음 print('최적 하이퍼 파라미터:', grid_cv.best_params_) print('최고 예측 점수:', -1 * grid_cv.best_score_) #%% # 최적 파라미터 값으로 모델을 다시 수행 # 평가지표로 R2값을 사용했으며 1과 가까워야 좋음 # 제대로된 평가를 위해 np.expm1을 사용해 역로그를 취함 from lightgbm import LGBMRegressor lgbm_reg1 = LGBMRegressor(n_estimators=1000, learning_rate=0.01, max_depth=18, num_leaves=48), y_train) lgbm_reg1.score(X_test, y_test) #%% import pickle lgbmFile = open('lgbm_reg1.pckl', 'wb') pickle.dump(lgbm_reg1, lgbmFile) lgbmFile.close()
y_pred = best_model.predict(best_x_test) r2 = r2_score(y_test, y_pred) print('r2 :', r2) end1 = time.time() import joblib joblib.dump(best_model, './model/xgb_Save/sfm1-' + str(best_score) + '.dat') model2 = joblib.load('./model/xgb_Save/sfm1-' + str(best_score) + '.dat') #### LGBM 셀렉트 start2 = time.time() model_LGBM = LGBMRegressor(), y_train) score = model_LGBM.score(x_test, y_test) print("r2 : ", score) thresholds = np.sort(model_LGBM.feature_importances_) print(thresholds) print(x_train.shape) print("========================") best_x_train = x_train best_x_train = x_test best_score = score best_model = model_LGBM for thresh in thresholds: selection = SelectFromModel(model_LGBM, threshold=thresh, prefit=True)