def lgb_model(num_boost_round=200):
    """Train a LightGBM booster on the project split and show its diagnostics.

    The data comes from ``get_data_and_split_train_test``.  The booster is
    trained with the module-level ``params_lgb`` (three entries are overridden
    below), evaluated with ``evaluate_model``, used by ``predict_fill_sample``
    and finally its AUC curve and feature importances are displayed.

    Args:
        num_boost_round: Number of boosting iterations.  Evaluation results
            are logged every ``num_boost_round // 3`` iterations.
    """
    import lightgbm as lgb

    train, test, y_train, y_test = get_data_and_split_train_test()
    print('shape of train set=', train.shape)

    # NOTE: these assignments modify the shared module-level ``params_lgb``.
    params_lgb['metric'] = ['binary_logloss', 'auc']
    params_lgb['max_depth'] = 15
    params_lgb['feature_fraction'] = .8

    # Filled by lgb.train with the per-iteration metric values of both sets.
    evals_result = {}
    lgb_train = lgb.Dataset(train, label=y_train)
    lgb_test = lgb.Dataset(test, label=y_test)
    model = lgb.train(params=params_lgb,
                      train_set=lgb_train,
                      num_boost_round=num_boost_round,
                      valid_sets=[lgb_train, lgb_test],
                      evals_result=evals_result,
                      verbose_eval=num_boost_round // 3)

    evaluate_model(model, train, test, y_train, y_test)
    predict_fill_sample(model, file='LGB_model')

    lgb.plot_metric(evals_result, 'auc')
    plt.show()
    lgb.plot_importance(model, max_num_features=40)
    plt.show()
    print('-' * 80)
def train(train_x, train_y, test_x, res, show_importance=True):
    """Fit a binary LGBMClassifier, save its diagnostics and score ``test_x``.

    Side effects (all paths are relative to the working directory):
    ``feature_importance.png`` and ``feature_importance.csv`` (only when
    ``show_importance`` is true), ``metrics.png``, ``./res.csv`` and
    ``lgb_classifier.txt``.

    Args:
        train_x: Training features.
        train_y: Training labels.
        test_x: Features to score.
        res: Frame that receives a ``"score"`` column (positive-class
            probability rounded to six decimals) and is written to CSV.
        show_importance: Whether to export the feature-importance plot/table.
    """
    clf = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=10, objective="binary",
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=1024, n_jobs=-1
    )
    # The only evaluation set is the training data itself.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y)], eval_metric="auc",
            early_stopping_rounds=100)

    # A bare ``return`` used to sit here, which made everything below
    # unreachable and left ``test_x`` / ``res`` unused.

    if show_importance:
        lgb.plot_importance(clf, max_num_features=10)
        plt.title("Feature Importances")
        plt.savefig("feature_importance.png")
        booster = clf.booster_
        importance = booster.feature_importance(importance_type="split")
        feature_name = booster.feature_name()
        feature_importance = pd.DataFrame(
            {"feature_name": feature_name, "importance": importance})
        feature_importance.to_csv("feature_importance.csv", index=False)
        plt.close()

    lgb.plot_metric(clf.evals_result_, metric="auc")
    plt.savefig("metrics.png")
    plt.close()  # do not leave the metric figure open after saving it

    res["score"] = clf.predict_proba(test_x)[:, 1]
    res["score"] = res["score"].apply(lambda x: float("%.6f" % x))
    res.to_csv("./res.csv", index=False)

    # Saving the model is best-effort: a failure is reported, not raised.
    try:
        clf.booster_.save_model("lgb_classifier.txt")
    except Exception as e:
        print(str(e))
def test_plot_metrics(self):
    """Exercise ``lgb.plot_metric`` with recorded results, empty results and an sklearn estimator."""
    self.params.update({"metric": {"binary_logloss", "binary_error"}})
    valid_data = lgb.Dataset(self.X_test, self.y_test, reference=self.train_data)

    # Case 1: results recorded for two named validation sets.
    recorded = {}
    lgb.train(self.params, self.train_data,
              valid_sets=[self.train_data, valid_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              evals_result=recorded,
              verbose_eval=False)
    axes = lgb.plot_metric(recorded)
    self.assertIsInstance(axes, matplotlib.axes.Axes)
    self.assertEqual(axes.get_title(), 'Metric during training')
    self.assertEqual(axes.get_xlabel(), 'Iterations')
    # No metric was requested, so either recorded metric is accepted as label.
    self.assertIn(axes.get_ylabel(), {'binary_logloss', 'binary_error'})
    axes = lgb.plot_metric(recorded, metric='binary_error')
    axes = lgb.plot_metric(recorded, metric='binary_logloss', dataset_names=['v2'])

    # Case 2: training without validation sets leaves nothing to plot.
    nothing_recorded = {}
    lgb.train(self.params, self.train_data,
              num_boost_round=10,
              evals_result=nothing_recorded,
              verbose_eval=False)
    with self.assertRaises(ValueError):
        lgb.plot_metric(nothing_recorded)

    # Case 3: a fitted sklearn-API estimator, with all labels switched off.
    classifier = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    classifier.fit(self.X_train, self.y_train,
                   eval_set=[(self.X_test, self.y_test)], verbose=False)
    bare_axes = lgb.plot_metric(classifier, title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(bare_axes, matplotlib.axes.Axes)
    self.assertEqual(bare_axes.get_title(), '')
    self.assertEqual(bare_axes.get_xlabel(), '')
    self.assertEqual(bare_axes.get_ylabel(), '')
def model_metrics_lgb(clf):
    """Plot the l2 and Huber curves of ``clf`` stacked in one figure.

    Returns:
        A one-element list holding the created figure.
    """
    figure = plt.figure(figsize=(8, 11))
    grid = gridspec.GridSpec(2, 1)
    # (metric name, panel title) for the top and bottom panel respectively.
    panels = (
        ("l2", "l2 during Training"),
        ("huber", "Huber Loss during Training"),
    )
    axes = [figure.add_subplot(grid[row]) for row in range(len(panels))]
    for axis, (metric_name, heading) in zip(axes, panels):
        lgb.plot_metric(clf, metric=metric_name, ax=axis, title=heading)
    grid.tight_layout(figure, rect=[0.05, 0.05, 0.95, 0.95], pad=0.5)
    return [figure]
def learning_curve(self):
    """Show one learning-curve panel per entry of ``self.evals_results`` on a three-column grid."""
    n_cols = 3
    # Ceiling division: a partially filled last row still needs a row.
    n_rows, remainder = divmod(self.cfg.training.num_fold, n_cols)
    if remainder != 0:
        n_rows += 1

    canvas = plt.figure(figsize=(25, 12))
    for fold_no in range(1, len(self.evals_results) + 1):
        axis = canvas.add_subplot(n_rows, n_cols, fold_no)
        lgb.plot_metric(self.evals_results[fold_no - 1], ax=axis)
        axis.set_title('Learning curve in Fold {}'.format(fold_no))
    plt.tight_layout()
    plt.show()
def show_model_performance(gbm, evals_result):
    """Render the model's tree and/or its training metrics, as enabled in ``config``.

    Args:
        gbm: Trained LightGBM model whose tree is rendered when
            ``config.can_plot_tree`` is set.
        evals_result: Evaluation history recorded during training; one panel
            is drawn per entry of ``config.metric`` when
            ``config.can_show_metric`` is set.
    """
    # Feature-importance plotting is currently disabled:
    # lgb.plot_importance(gbm)

    # Show the decision tree.
    if config.can_plot_tree:
        graph = lgb.create_tree_digraph(gbm, name='Decision Tree')
        graph.render(view=True)

    if config.can_show_metric:
        metric_names = [config.metric[index] for index in range(len(config.metric))]
        if metric_names:
            # One row per metric (5 inches each, i.e. the former 8x10 figure
            # for two metrics).  ``squeeze=False`` keeps ``axs`` two-dimensional
            # even for a single metric, so indexing never depends on the count.
            fig, axs = plt.subplots(len(metric_names), 1,
                                    figsize=(8, 5 * len(metric_names)),
                                    squeeze=False)
            for axis, metric_name in zip(axs[:, 0], metric_names):
                lgb.plot_metric(evals_result, metric_name, title=metric_name, ax=axis)
        plt.show()
def plot_model_information(bst, validation_metrics, my_own_metrics):
    """Show the AUC curve, feature importances, custom metric bars and the first tree of ``bst``."""

    def plot_my_own_metrics(metric_values):
        """Draw one horizontal bar per metric and write its value at the bar end."""
        names = list(metric_values.keys())
        amounts = list(metric_values.values())
        plt.barh(names, amounts)
        for row, amount in enumerate(amounts):
            plt.text(amount, row, str(amount))

    print('Number of trees:', bst.num_trees())

    print('Plot model performance')
    lgb.plot_metric(validation_metrics, metric='auc')
    plt.show()

    print('Plot feature importances...')
    lgb.plot_importance(bst, max_num_features=15)
    plt.show()

    print('plot_my_own_metrics')
    plot_my_own_metrics(my_own_metrics)
    plt.show()

    tree_index = 0
    print('Plot {}th tree...'.format(tree_index))
    lgb.plot_tree(bst, tree_index=tree_index, figsize=(64, 36), show_info=['split_gain'])
    plt.show()
def lgb_train(train_data, val_data, threshold, init_model, boost_round=1000, random_seed=6, for_submit=False):
    """Continue training a binary booster, tracking F1 at ``threshold``.

    Returns:
        The trained booster when ``for_submit`` is true; otherwise the best
        F1 recorded on the validation set (``valid_1``).
    """
    print('boost round: ', boost_round)

    def lgb_f1_score(y_hat, data, THRESHOLD=threshold):
        """Custom eval: F1 of the scores binarised at THRESHOLD (higher is better)."""
        labels = data.get_label()
        hard_preds = np.where(y_hat >= THRESHOLD, 1, 0)
        return 'f1', f1_score(labels, hard_preds), True

    # For a submission run only the training data is monitored.
    if for_submit:
        watched = [train_data]
    else:
        watched = [train_data, val_data]

    booster_params = {
        'objective': 'binary',
        # 'early_stopping_rounds': 100,
        'learning_rate': 0.01,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'max_depth': -1,
        'num_leaves': 100,
        'seed': random_seed,
        'metrics': 'None'
    }
    history = {}
    booster = lgb.train(booster_params,
                        train_data,
                        valid_sets=watched,
                        evals_result=history,
                        num_boost_round=boost_round,
                        verbose_eval=100,
                        init_model=init_model,
                        feval=lgb_f1_score)

    if for_submit:
        del history
        gc.collect()
        return booster

    lgb.plot_metric(history, metric='f1')
    best_f1 = max(history['valid_1']['f1'])
    del history
    gc.collect()
    return best_f1
def test_plot_metrics(self):
    """Exercise ``lgb.plot_metric`` with recorded results, empty results and an sklearn estimator."""
    # ``return_X_y`` is passed by keyword: scikit-learn no longer accepts it
    # positionally, while the keyword form works on every version.
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
    train_data = lgb.Dataset(X_train, y_train)
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)

    params = {
        "objective": "binary",
        "metric": {"binary_logloss", "binary_error"},
        "verbose": -1,
        "num_leaves": 3
    }

    # Case 1: results recorded for two named validation sets.
    evals_result0 = {}
    lgb.train(params, train_data,
              valid_sets=[train_data, test_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              evals_result=evals_result0,
              verbose_eval=False)
    ax0 = lgb.plot_metric(evals_result0)
    self.assertIsInstance(ax0, matplotlib.axes.Axes)
    self.assertEqual(ax0.get_title(), 'Metric during training')
    self.assertEqual(ax0.get_xlabel(), 'Iterations')
    # No metric was requested, so either recorded metric is accepted as label.
    self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
    ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
    ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])

    # Case 2: training without validation sets leaves nothing to plot.
    evals_result1 = {}
    lgb.train(params, train_data,
              num_boost_round=10,
              evals_result=evals_result1,
              verbose_eval=False)
    self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

    # Case 3: a fitted sklearn-API estimator, with all labels switched off.
    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(ax2, matplotlib.axes.Axes)
    self.assertEqual(ax2.get_title(), '')
    self.assertEqual(ax2.get_xlabel(), '')
    self.assertEqual(ax2.get_ylabel(), '')
def test_plot_example():
    """Train a small regression booster on the sample files and show each LightGBM plot."""
    print('Loading data...')
    # Tab-separated files without a header row; column 0 is used as target.
    train_frame = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.train',
        header=None,
        sep='\t')
    test_frame = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.test',
        header=None,
        sep='\t')

    target_train, target_test = train_frame[0], test_frame[0]
    inputs_train = train_frame.drop(0, axis=1)
    inputs_test = test_frame.drop(0, axis=1)

    # LightGBM datasets; the test set reuses the training set as reference.
    train_set = lgb.Dataset(inputs_train, target_train)
    test_set = lgb.Dataset(inputs_test, target_test, reference=train_set)

    config_dict = {'num_leaves': 5, 'metric': ('l1', 'l2'), 'verbose': 0}
    column_names = ['f' + str(position + 1)
                    for position in range(inputs_train.shape[-1])]

    history = {}  # receives the evaluation results used for plotting

    print('Starting training...')
    booster = lgb.train(
        config_dict,
        train_set,
        num_boost_round=100,
        valid_sets=[train_set, test_set],
        feature_name=column_names,
        categorical_feature=[21],
        evals_result=history,
        verbose_eval=10)

    print('Plotting metrics recorded during training...')
    lgb.plot_metric(history, metric='l1')
    plt.show()

    print('Plotting feature importances...')
    lgb.plot_importance(booster, max_num_features=10)
    plt.show()

    print('Plotting 84th tree...')  # one tree use categorical feature to split
    lgb.plot_tree(booster, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
    plt.show()

    print('Plotting 84th tree with graphviz...')
    digraph = lgb.create_tree_digraph(booster, tree_index=83, name='Tree84')
    digraph.render(view=True)
def test_plot_metrics(self):
    """Exercise ``lgb.plot_metric`` with recorded results, empty results and an sklearn estimator."""
    # ``return_X_y`` is passed by keyword: scikit-learn no longer accepts it
    # positionally, while the keyword form works on every version.
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
    train_data = lgb.Dataset(X_train, y_train)
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)

    params = {
        "objective": "binary",
        "metric": {"binary_logloss", "binary_error"},
        "verbose": -1,
        "num_leaves": 3
    }

    # Case 1: results recorded for two named validation sets.
    evals_result0 = {}
    lgb.train(params, train_data,
              valid_sets=[train_data, test_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              evals_result=evals_result0,
              verbose_eval=False)
    ax0 = lgb.plot_metric(evals_result0)
    self.assertIsInstance(ax0, matplotlib.axes.Axes)
    self.assertEqual(ax0.get_title(), 'Metric during training')
    self.assertEqual(ax0.get_xlabel(), 'Iterations')
    # No metric was requested, so either recorded metric is accepted as label.
    self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
    ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
    ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])

    # Case 2: training without validation sets leaves nothing to plot.
    evals_result1 = {}
    lgb.train(params, train_data,
              num_boost_round=10,
              evals_result=evals_result1,
              verbose_eval=False)
    self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

    # Case 3: a fitted sklearn-API estimator, with all labels switched off.
    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(ax2, matplotlib.axes.Axes)
    self.assertEqual(ax2.get_title(), '')
    self.assertEqual(ax2.get_xlabel(), '')
    self.assertEqual(ax2.get_ylabel(), '')
def train_predict_model(train_list_x, train_list_label, params, train_times=500):
    """Train a booster on an 80/20 split of the samples and plot its F1 curve.

    Returns:
        Tuple ``(booster, training Dataset)``.
    """
    # Split the samples into a training part and a validation part.
    fit_X, holdout_X, fit_Y, holdout_Y = train_test_split(
        train_list_x, train_list_label, test_size=0.2, random_state=2)
    fit_set = lgb.Dataset(fit_X, label=fit_Y)
    holdout_set = lgb.Dataset(holdout_X, label=holdout_Y, reference=fit_set)
    # select_suit_parameter(fit_set)

    history = {}
    print("Training...")
    booster = lgb.train(
        params,
        fit_set,
        # categorical_feature=list(range(1, 82)),  # marks which features are categorical
        valid_sets=[holdout_set],
        num_boost_round=train_times,
        feval=lgb_f1_score,
        evals_result=history
        # early_stopping_rounds=30
    )
    lgb.plot_metric(history, metric='f1')
    return booster, fit_set
def train(self, X_train, y, X_test=None, y_test=None, parameters=None, plot=False):
    """Train ``self.model`` with LightGBM and optionally plot the recorded metrics.

    Args:
        X_train: Training features.
        y: Training labels.
        X_test: Optional validation features.  When omitted, only the
            training set is monitored.
        y_test: Labels for ``X_test``.
        parameters: Optional LightGBM parameters; when given they replace
            ``self.parameters`` before training.
        plot: When true, show the ``accuracy`` and ``auc`` curves.  Both
            metrics must have been recorded in ``self.evals_result``.
    """
    self.evals_result = {}  # to record eval results for plotting
    if parameters is not None:
        self.parameters = parameters

    dtrain = lgb.Dataset(X_train, label=y)
    valid_sets = [dtrain]
    # ``X_test`` defaults to None; a Dataset built from None cannot be used
    # as a validation set, so it is only added when data was supplied.
    if X_test is not None:
        valid_sets.append(lgb.Dataset(X_test, label=y_test))

    self.model = lgb.train(self.parameters,
                           dtrain,
                           valid_sets=valid_sets,
                           evals_result=self.evals_result,
                           verbose_eval=False,
                           feval=accuracy)

    if plot:
        print('Plotting metrics recorded during training...')
        lgb.plot_metric(self.evals_result, metric='accuracy')
        plt.show()
        lgb.plot_metric(self.evals_result, metric='auc')
        plt.show()
def train_lightgbm_model(level,
                         fold=1,
                         params=None,
                         model_dir='models/uncertainty/',
                         prediction_lag=28,
                         model_name="lightgbm",
                         augment_events=False,
                         verbose=True,
                         num_boost_round=2500,
                         early_stopping_rounds=50,
                         verbose_eval=50):
    """Train a LightGBM model for one level/fold, save it and plot its l1 curve.

    Args:
        level: Level forwarded to ``get_train_val_slit`` and used in the
            saved file name.
        fold: Fold forwarded to ``get_train_val_slit`` and used in the name.
        params: LightGBM parameters; ``None`` (the default) means an empty
            parameter dict.
        model_dir: Prefix of the saved model path (concatenated as-is, so it
            should end with a path separator).
        prediction_lag: Lag forwarded to ``get_train_val_slit`` and used in
            the file name.
        model_name: File-name stem of the saved model.
        augment_events: Forwarded to ``get_train_val_slit``.
        verbose: Accepted for compatibility; not read by this function.
        num_boost_round: Maximum number of boosting iterations.
        early_stopping_rounds: Patience on the validation set.
        verbose_eval: Logging period passed to ``lgb.train``.

    Returns:
        Tuple ``(model, evals_result, val)``.
    """
    # only require lightgbm to be installed when calling this function
    import lightgbm as lgb

    # A ``None`` sentinel replaces the former shared mutable ``{}`` default.
    if params is None:
        params = {}

    # read data
    train, val, _test, features = get_train_val_slit(
        level, fold, augment_events=augment_events, prediction_lag=prediction_lag)

    # make lgb datasets
    labels = ['demand']
    train_set = lgb.Dataset(train[features], train[labels])
    val_set = lgb.Dataset(val[features], val[labels])

    # cleanup memory
    del train
    gc.collect()

    # perform training
    evals_result = {}  # to record eval results for plotting
    model = lgb.train(
        params,
        train_set,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        valid_sets=[val_set],
        verbose_eval=verbose_eval,
        evals_result=evals_result)

    model.save_model(
        model_dir + model_name
        + "-level{}-lag{}-fold{}.txt".format(level, prediction_lag, fold))

    lgb.plot_metric(evals_result, metric='l1')
    plt.show()

    return model, evals_result, val
def train(train_x, train_y, kfold, best_params=None):
    """Train one binary LightGBM model per fold and save each learning curve.

    Args:
        train_x: Feature frame (indexed positionally with ``iloc``).
        train_y: Label series (indexed positionally with ``iloc``).
        kfold: Splitter exposing ``split(train_x, train_y)``.
        best_params: Accepted but not used by this function.
            NOTE(review): presumably meant to override ``params`` — confirm.

    Returns:
        List with the trained model of every fold.  A figure
        ``learning_curve_<fold>.png`` is written to ``DATA_DIR`` per fold.
    """
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "metric": {"binary_logloss"},
        "num_leaves": 50,
        "min_data_in_leaf": 100,
        "learning_rate": 0.1,
        "feature_fraction": 0.5,
    }

    models = []
    for i, (tr_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        tr_set = lgb.Dataset(tr_x, tr_y)
        val_set = lgb.Dataset(val_x, val_y, reference=tr_set)

        evals_result = {}
        model = lgb.train(
            params=params,
            train_set=tr_set,
            valid_sets=[val_set, tr_set],
            num_boost_round=1000,
            early_stopping_rounds=20,
            verbose_eval=1,
            evals_result=evals_result,
            feval=accuracy,
        )

        # Per-fold importance table, kept for ad-hoc inspection.
        importance = pd.DataFrame(model.feature_importance(),
                                  index=train_x.columns,
                                  columns=["importance"
                                           ]).sort_values("importance",
                                                          ascending=[False])
        # print(f"######################importance#####################")
        # print(importance.head(50))

        # Draw the validation results (learning curve) for this fold.
        ax = lgb.plot_metric(evals_result)
        plt.savefig(f"{DATA_DIR}/learning_curve_{i+1}.png")
        # Close the figure once saved; otherwise one figure per fold stays open.
        plt.close(ax.figure)

        models.append(model)

    return models
def bo_lgb_train(opt, x_train, y_train, x_test, y_test):
    """Train a binary booster with the best parameters found by ``opt`` and plot its F1 curve.

    The model is saved to ``model.txt`` and returned.
    """
    _, num_feature = x_train.shape
    train_set = lgb.Dataset(x_train, y_train)
    eval_set = lgb.Dataset(x_test, y_test)
    history = {}

    booster_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_boosting_round': 300,
        'n_jobs': 2
    }
    # Overlay the optimiser's best point; these two entries must be integers.
    booster_params.update(opt.max['params'])
    for integer_key in ('num_leaves', 'max_depth'):
        booster_params[integer_key] = int(round(booster_params[integer_key]))

    column_names = ['f' + str(position + 1) for position in range(num_feature)]

    print('Start training...')
    model = lgb.train(
        booster_params,
        train_set,
        # This number could be changed for iteration times for lightgbm training
        num_boost_round=300,
        valid_sets=[train_set, eval_set],
        feature_name=column_names,
        evals_result=history,
        # fobj=loglikelihood,
        feval=lgb_f1_score,
        verbose_eval=10)
    model.save_model('model.txt', num_iteration=model.best_iteration)

    print('Plot metrics recorded during training...')
    # lightgbm could show f1 score figure or accuracy figure
    lgb.plot_metric(history, metric='f1')
    # lgb.plot_metric(history, metric='accuracy')
    plt.show()

    return model
def train_light_gbm(self, dts):
    """Train a small booster on ``dts`` and show metric, importance and tree plots."""
    # LightGBM datasets; the test set reuses the training set as reference.
    train_set = lgb.Dataset(dts.trainX, dts.trainY)
    test_set = lgb.Dataset(dts.testX, dts.testY, reference=train_set)

    config_dict = {
        'num_leaves': 5,
        'metric': ('l1', 'l2'),
        'verbose': 0
    }
    history = {}  # receives the evaluation results used for plotting

    print('Starting training...')
    # NOTE(review): five feature names are given but the categorical index is
    # 21 — looks carried over from a 28-feature example; confirm.
    booster = lgb.train(config_dict,
                        train_set,
                        num_boost_round=100,
                        valid_sets=[train_set, test_set],
                        feature_name=['close', 'open', 'high', 'low', 'volume'],
                        categorical_feature=[21],
                        evals_result=history,
                        verbose_eval=10)

    print('Plotting metrics recorded during training...')
    lgb.plot_metric(history, metric='l1')
    plt.show()

    print('Plotting feature importances...')
    lgb.plot_importance(booster, max_num_features=10)
    plt.show()

    print('Plotting 84th tree...')  # one tree use categorical feature to split
    lgb.plot_tree(booster, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
    plt.show()

    print('Plotting 84th tree with graphviz...')
    digraph = lgb.create_tree_digraph(booster, tree_index=83, name='Tree84')
    digraph.render(view=True)
def get_model_train_result(evals_result, model_name="default", outputpath="./"):
    '''
    Plot the binary-logloss training curve and save it as a PNG.

    :param evals_result: evaluation history accepted by ``lgb.plot_metric``
    :param model_name: stem of the output file name
    :param outputpath: prefix the file name is appended to (concatenated as-is)
    :return: True when the image was written, False when plotting or saving failed
    '''
    ax = None
    try:
        outputpath = outputpath + model_name + "_train_result.png"
        ax = lgb.plot_metric(evals_result, metric='binary_logloss', figsize=(20, 13))
        plt.savefig(outputpath)
    except Exception:
        # Best-effort: report the failure (with traceback) instead of raising.
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        logger.exception("create model train result fail.")
        return False
    else:
        logger.info("create model train result sucess.")
        return True
    finally:
        # Release the figure this call created; unrelated figures stay open.
        if ax is not None:
            plt.close(ax.figure)
def _select_rows(data, positions):
    """Return the rows of ``data`` at integer ``positions`` (pandas object or array-like)."""
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]


def train_model(model_name, X_train, y_train):
    """Train and pickle one model per K-fold split.

    For ``model_name == 'lgb'`` each fold's model is fitted with early
    stopping, pickled to ``lgb_model_<fold>.pkl``, plotted, scored with mean
    squared error on the validation part, and the running mean of the fold
    scores is printed.  For ``model_name == 'rf'`` the model is fitted and
    pickled to ``rf_model_<fold>.pkl``.

    Args:
        model_name: ``'lgb'`` or ``'rf'``; other values only print fold headers.
        X_train: Feature frame.
        y_train: Labels (pandas object or array-like).
    """
    kf = KFold(config.k_folds)
    cv_scores = []

    for i, (tr_idx, vl_idx) in enumerate(kf.split(X_train, y_train)):
        print('FOLD {} \n'.format(i))

        # KFold yields *positional* indices, so rows are taken by position;
        # label-based lookup only agrees with this for a default 0..n-1 index.
        X_tr, y_tr = _select_rows(X_train, tr_idx), _select_rows(y_train, tr_idx)
        X_vl, y_vl = _select_rows(X_train, vl_idx), _select_rows(y_train, vl_idx)

        if model_name == 'lgb':
            model = model_lgb()
            model.fit(X_tr, y_tr,
                      eval_set=[(X_tr, y_tr), (X_vl, y_vl)],
                      eval_metric='auc',
                      verbose=config.verbose,
                      early_stopping_rounds=config.stop_rounds)

            with open('lgb_model_{}.pkl'.format(i), 'wb') as handle:
                pickle.dump(model, handle)

            # code to visualize feature importance
            ax = lgb.plot_importance(model, max_num_features=100, figsize=(15, 15))
            # ax2 = lgb.plot_tree(model,figsize=(15,15))
            ax3 = lgb.plot_metric(model, figsize=(15, 15))
            plt.show()

            pred_y_val = model.predict(X_vl)
            score = mean_squared_error(pred_y_val, y_vl)
            cv_scores.append(score)
            print(np.mean(cv_scores))

            del model, X_tr, X_vl
            gc.collect()

        elif model_name == 'rf':
            model = model_rf()
            model.fit(X_tr, y_tr)

            with open('rf_model_{}.pkl'.format(i), 'wb') as handle:
                pickle.dump(model, handle)

            del model, X_tr, X_vl
            gc.collect()
def test_plot_metrics(params, breast_cancer_split, train_data):
    """Exercise ``lgb.plot_metric`` with recorded results, empty results and an sklearn estimator."""
    X_train, X_test, y_train, y_test = breast_cancer_split
    params.update({"metric": {"binary_logloss", "binary_error"}})
    valid_data = lgb.Dataset(X_test, y_test, reference=train_data)

    # Case 1: results recorded for two named validation sets.
    recorded = {}
    lgb.train(params, train_data,
              valid_sets=[train_data, valid_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              evals_result=recorded,
              verbose_eval=False)
    axes = lgb.plot_metric(recorded)
    assert isinstance(axes, matplotlib.axes.Axes)
    assert axes.get_title() == 'Metric during training'
    assert axes.get_xlabel() == 'Iterations'
    # No metric was requested, so either recorded metric is accepted as label.
    assert axes.get_ylabel() in {'binary_logloss', 'binary_error'}
    axes = lgb.plot_metric(recorded, metric='binary_error')
    axes = lgb.plot_metric(recorded, metric='binary_logloss', dataset_names=['v2'])

    # Case 2: training without validation sets leaves nothing to plot.
    nothing_recorded = {}
    lgb.train(params, train_data,
              num_boost_round=10,
              evals_result=nothing_recorded,
              verbose_eval=False)
    with pytest.raises(ValueError):
        lgb.plot_metric(nothing_recorded)

    # Case 3: a fitted sklearn-API estimator, with all labels switched off.
    classifier = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    classifier.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    bare_axes = lgb.plot_metric(classifier, title=None, xlabel=None, ylabel=None)
    assert isinstance(bare_axes, matplotlib.axes.Axes)
    assert bare_axes.get_title() == ''
    assert bare_axes.get_xlabel() == ''
    assert bare_axes.get_ylabel() == ''
def Test_XGB(dump_path, dtest, dtrain, dval, evals_result):
    """Evaluate the saved XGBoost model and write its metrics and plots to ``dump_path``.

    Loads ``<dump_path>/best.model``, computes accuracy (percent), RMSE and
    ROC-AUC on the train, validation and test matrices, writes them to
    ``<dump_path>/metrics.txt`` and saves the feature-importance, log-loss
    and confusion-matrix figures.

    Args:
        dump_path: Directory holding ``best.model``; outputs are written there.
        dtest: Test matrix exposing ``get_label()``.
        dtrain: Training matrix exposing ``get_label()``.
        dval: Validation matrix exposing ``get_label()``.
        evals_result: Evaluation history containing a ``logloss`` metric.
    """
    loaded_bst = xgb.Booster()
    loaded_bst.load_model(dump_path + "/best.model")

    # Predict each split exactly once (the train and test matrices used to be
    # predicted twice) and keep labels, raw scores and rounded class labels.
    splits = (('train', dtrain), ('val', dval), ('test', dtest))
    truth, scores, classes = {}, {}, {}
    for split_name, matrix in splits:
        truth[split_name] = matrix.get_label()
        scores[split_name] = loaded_bst.predict(matrix)
        classes[split_name] = [round(value) for value in scores[split_name]]

    order = [split_name for split_name, _ in splits]
    accuracies = [accuracy_score(truth[k], classes[k]) * 100 for k in order]
    rmses = [mean_squared_error(truth[k], scores[k])**0.5 for k in order]
    rocs = [roc_auc_score(truth[k], scores[k]) for k in order]

    # Same order as the labels below: train / val / test within each metric.
    tests = accuracies + rmses + rocs
    testing_labels = [
        'training accuracy', 'dev accuracy', 'test accuracy', 'train rmse',
        'val rmse', 'test rmse', 'train roc', 'dev roc', 'test roc'
    ]

    with open(dump_path + '/metrics.txt', 'w') as writer:
        writer.write('XGB metrics...\n' + '-' * 10 + '\n')
        for label, value in zip(testing_labels, tests):
            writer.write(label + ": " + str(value) + "\n")

    ptool.my_plot_importance(loaded_bst,
                             figsize=(7, 7),
                             title='XGB Feature importance',
                             path=dump_path)

    class_labels = ('neutron', 'electron')

    # The LightGBM helper is used here only to draw the recorded history.
    ax = lgb.plot_metric(evals_result, metric='logloss', figsize=(12, 12))
    ax.legend()
    plt.ylabel('logloss classification error')
    plt.title('XGBoost Log Loss')
    plt.savefig(dump_path + '/log loss classification error',
                bbox_inches='tight')

    ptool.plot_confusion_matrix(dump_path=dump_path + "/",
                                classes=class_labels,
                                model="XGB",
                                pred=classes['test'],
                                labels=truth['test'])
    return
evals_result = {} # to record eval results for plotting print('Starting training...') # train gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=[lgb_train, lgb_test], feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])], categorical_feature=[21], evals_result=evals_result, verbose_eval=10) print('Plotting metrics recorded during training...') ax = lgb.plot_metric(evals_result, metric='l1') plt.show() print('Plotting feature importances...') ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() print('Plotting split value histogram...') ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto') plt.show() print('Plotting 54th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
def main():
    """
    << Processing flow >>
    Load data ⇒ join pitch data with player data (train and test are concatenated too) ⇒
    replace NaN ⇒ encode categorical variables ⇒ feature selection with RFE (optimising the count) ⇒
    hyper-parameter optimisation ⇒ cross-validation

    NOTE(review): in the code below the RFE / hyper-parameter search steps are
    commented out and fixed values are used instead.
    """
    train_pitch = pd.read_csv(TRAIN_PITCH_PATH)
    train_player = pd.read_csv(TRAIN_PLAYER_PATH)
    test_pitch = pd.read_csv(TEST_PITCH_PATH)
    test_player = pd.read_csv(TEST_PLAYER_PATH)
    pitching_type_2016 = pd.read_csv(EXTERNAL_1_PATH)
    pitching_type_2017 = pd.read_csv(EXTERNAL_2_PATH)
    pitching_type_2018 = pd.read_csv(EXTERNAL_3_PATH)

    # Tag the rows so train and test can be separated again after the joint
    # preprocessing; the test target column is filled with a placeholder 0.
    train_pitch["use"] = "train"
    test_pitch["use"] = "test"
    test_pitch["球種"] = 0
    pitch_data = pd.concat([train_pitch, test_pitch],
                           axis=0).drop(PITCH_REMOVAL_COLUMNS, axis=1)

    player_data = pd.concat([train_player, test_player],
                            axis=0).drop(PLAYER_REMOVAL_COLUMNS,
                                         axis=1)  # .fillna(0)
    # NOTE(review): pitchers_data is not used again in this function.
    pitchers_data = train_player[train_player["位置"] == "投手"].drop(
        PLAYER_REMOVAL_COLUMNS, axis=1)

    pitching_type_ratio = pd.concat(
        [pitching_type_2016, pitching_type_2017, pitching_type_2018],
        axis=0).reset_index(drop=True)

    # Attach the pitcher's player record to every pitch (left join).
    merged = (pd.merge(
        pitch_data,
        player_data,
        how="left",
        left_on=["年度", "投手ID"],
        right_on=["年度", "選手ID"],
    ).drop(["選手ID", "投球位置区域"], axis=1).fillna(0))
    merged = merged.rename(columns={"選手名": "投手名", "チーム名": "投手チーム名"})

    # Merge the dataset with the pitch-type ratios read from the external files.
    merged = pd.merge(
        merged,
        pitching_type_ratio,
        how="left",
        left_on=["年度", "投手ID", "投手名"],
        right_on=["年度", "選手ID", "選手名"],
    ).drop(["選手ID", "選手名"], axis=1)

    use = merged.loc[:, "use"]
    merged = merged.drop(["use", "位置", "年度"], axis=1)

    # Encode the categorical (object-dtype) columns with category_encoders.
    categorical_columns = [
        c for c in merged.columns if merged[c].dtype == "object"
    ]
    ce_oe = ce.OrdinalEncoder(cols=categorical_columns,
                              handle_unknown="impute")
    encorded_data = ce_oe.fit_transform(merged)
    encorded_data = pd.concat([encorded_data, use], axis=1)

    # Split back into train and test using the "use" tag.
    train = (encorded_data[encorded_data["use"] == "train"].drop(
        "use", axis=1).reset_index(drop=True))
    test = (encorded_data[encorded_data["use"] == "test"].drop(
        "use", axis=1).reset_index(drop=True))

    train_x = train.drop("球種", axis=1)
    train_y = train.loc[:, "球種"]
    test_x = test.drop("球種", axis=1)

    # NOTE(review): label_counts is not used again in this function.
    label_counts = train_y.value_counts()
    # Oversample the classes to the given multiples of their current counts.
    # NOTE(review): ``ratio=`` and ``fit_sample`` are the older
    # imbalanced-learn spellings — confirm against the installed version.
    sm = SMOTE(
        ratio={
            0: sum(train_y == 0),
            1: sum(train_y == 1) * 3,
            2: sum(train_y == 2),
            3: sum(train_y == 3) * 2,
            4: sum(train_y == 4) * 2,
            5: sum(train_y == 5) * 4,
            6: sum(train_y == 6) * 20,
            7: sum(train_y == 7) * 4,
        })
    train_x_resampled, train_y_resampled = sm.fit_sample(train_x, train_y)
    train_x_resampled = pd.DataFrame(train_x_resampled,
                                     columns=train_x.columns)
    train_y_resampled = pd.Series(train_y_resampled, name="球種")

    # f = partial(objective, train_x, train_y)  # bind the arguments of the objective function
    # study = optuna.create_study(direction='maximize')  # optimise the number of features to keep with Optuna
    # study.optimize(f, n_trials=10)  # set the number of trials
    # print('params:', study.best_params)  # print the parameters that were found
    # best_feature_count = study.best_params['n_components']
    best_feature_count = 47
    # x_pca, train_y = get_important_features(train_x, train_y, best_feature_count)

    n_splits = 10
    num_class = 8
    # best_params = get_best_params(x_pca, train_y, num_class)  # search for the best hyper-parameters
    best_params = {
        "lambda_l1": 5.96,
        "lambda_l2": 1.1,
        "num_leaves": 12,
        "feature_fraction": 0.75,
        "bagging_fraction": 0.89,
        "bagging_freq": 7,
        "min_data_in_leaf": 200,
    }

    # Accumulates the test-set class probabilities of every fold.
    submission = np.zeros((len(test_x), num_class))
    accs = {}

    tscv = TimeSeriesSplit(n_splits=n_splits)
    for i, (tr_idx, val_idx) in enumerate(tscv.split(train_x_resampled)):
        tr_x = train_x_resampled.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y_resampled.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x_resampled.iloc[val_idx].reset_index(drop=True)
        val_y = train_y_resampled.iloc[val_idx].reset_index(drop=True)

        tr_dataset = lgb.Dataset(tr_x, tr_y, free_raw_data=False)
        val_dataset = lgb.Dataset(val_x,
                                  val_y,
                                  reference=tr_dataset,
                                  free_raw_data=False)
        model, evals_result = get_model(tr_dataset, val_dataset, num_class,
                                        best_params,
                                        train_x_resampled.columns)

        # Draw the learning curve of this fold.
        fig = lgb.plot_metric(evals_result, metric="multi_logloss")
        plt.savefig(f"{DATA_DIR}/learning_curve_{i}.png")

        # Per-class probabilities (original note: "0~8") reduced to the most likely class.
        y_pred = np.argmax(model.predict(val_x), axis=1)
        acc = accuracy_score(val_y, y_pred)
        accs[i] = acc
        print("#################################")
        print(f"accuracy: {acc}")
        print("#################################")

        # Per-class probabilities (original note: "0~8") for the test rows.
        y_preda = model.predict(test_x, num_iteration=model.best_iteration)
        submission += y_preda

    # Average the accumulated probabilities over the folds.
    submission_df = pd.DataFrame(submission / n_splits)
    print("#################################")
    print(submission_df)
    print(best_params)
    print(accs)
    print("#################################")
    submission_df.to_csv(f"{DATA_DIR}/my_submission35.csv", header=False)
"feature_fraction": 0.4, "bagging_fraction": 0.6, "bagging_freq": 17, "num_threads": 16, } f_evals_result = {} f_model = lgb.train( params, f_train, valid_sets=[f_valid], num_boost_round=10000, verbose_eval=1000, early_stopping_rounds=1000, evals_result=f_evals_result, ) lgb.plot_metric(f_evals_result) # 收集当前fold的模型在vaild和test上的结果 s_y_valid[f_index_valid] += f_model.predict(f_x_valid) s_y_test += f_model.predict(x_test) / 5 gc.collect() # 收集当前seed的模型在vaild和test上的结果 y_valid += s_y_valid / len(seeds) y_pred += s_y_test / len(seeds) print("logloss", log_loss(pd.get_dummies(y_train).values, s_y_valid)) print("ac", accuracy_score(y_train, np.argmax(s_y_valid, axis=1))) # 收集全部模型在vaild和test上的结果 print("logloss", log_loss(pd.get_dummies(y_train).values, y_valid)) print("ac", accuracy_score(y_train, np.argmax(y_valid, axis=1)))
lgb_eval = lgb.Dataset(va_x, va_y) # 学習の実行 model = lgb.LGBMRegressor(objective='rmse', early_stopping_rounds=50) model.fit(tr_x, tr_y, eval_set=[(va_x, va_y), (tr_x, tr_y)], verbose=10) # バリデーションデータでのスコアの確認 va_pred = model.predict(va_x) score = np.sqrt(mse(va_y, va_pred)) score_list.append(score) score_ave = np.mean(score_list) print(f'RMSE: {score_ave:.4f}') # 学習曲線 lgb.plot_metric(model, metric='rmse') # 提出用データ tr_x = train_x tr_y = train_y ts_x = test_x """ # 変数をループしてtarget encoding for c in tr_x.columns: if tr_x[c].dtype == 'object': # 学習データ全体で各カテゴリにおけるtargetの平均を計算 data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y}) target_mean = data_tmp.groupby(c)['target'].mean() # バリデーションデータのカテゴリを置換 ts_x.loc[:, c] = ts_x[c].map(target_mean)
# #grid4 = GridSearchCV(xgb.XGBRegressor(),hyper_params,n_jobs=-1,verbose=10,cv=fold1) # #grid4.fit(fine_tune_train,y) # #reg = xgb.XGBRegressor(max_depth = grid4.best_params_['max_depth'], # min_child_weight = grid4.best_params_['min_child_weight'], # learning_rate = grid4.best_params_['learning_rate'], colsample_bylevel = 0.8, # subsample = 0.75, reg_lambda = 2, nthread = -1, # booster = 'gbtree', silent = 1, gamma = 0) # #reg.fit(fine_tune_train,y) # #y_pred = reg.predict(fine_tune_test) plot_metric(evals_result1, metric='rmse') plot_metric(evals_result, metric='rmse') xgb.plot_importance(mdl) lgb.plot_importance(model) train_size = np.linspace(0.1, 1.0, 20) plt.figure() plt.title("Learning Curve for SVM") plt.xlabel("Traning Set Size") plt.ylabel("Error") train_sizes, train_scores, test_scores = learning_curve( svm_model, X_train, y_train,
'bagging_seed': 0, 'feature_fraction': 0.2319, 'feature_fraction_seed': 0, } evals_results = {} bst = lgb.train(lgb_params, xgtrain, valid_sets=[xgtrain, xgvalid], evals_result=evals_results, early_stopping_rounds=100, verbose_eval=0, feval=None) print('Plot metrics during training...') ax = lgb.plot_metric(evals_results, metric='l2') plt.show() ax = lgb.plot_importance(bst, max_num_features=100) plt.show() gain = bst.feature_importance('gain') ft = pd.DataFrame({ 'feature': bst.feature_name(), 'split': bst.feature_importance('split'), 'gain': 100 * gain / gain.sum() }).sort_values('split', ascending=False) print(ft.head(100)) stacked_train_pred = np.expm1(bst.predict(df_train[features].values)) print(mape(df_train['target'].values, stacked_train_pred))
# Report errors of the fitted regressor on the training and validation data.
print(
    f'RMSE train: {sqrt(mean_squared_error(y_train, lgb_reg.predict(X_train)))}'
)
print(
    f'\nMAE validate: {mean_absolute_error(y_test, lgb_reg.predict(X_test))}')
print(
    f'RMSE validate: {sqrt(mean_squared_error(y_test, lgb_reg.predict(X_test)))} '
)

# Persist the fitted regressor.
joblib.dump(lgb_reg, 'models/lgb_optimized.pkl')
#%%
# Bare expressions: evaluating them has no effect when run as a plain script.
# NOTE(review): presumably meant for interactive cell output — confirm.
lgb_reg.n_features_
lgb_reg.objective_
lgb_reg.get_params
lgb_reg.feature_importances_
#%%
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_palette('pastel')
lgb.plot_importance(lgb_reg, figsize=(6, 8))
lgb.plot_metric(lgb_reg, figsize=(6, 8))
#%%
# Restore matplotlib's default rcParams.
import matplotlib as mpl

mpl.rcParams.update(mpl.rcParamsDefault)
#%%
} evals_result = {} # to record eval results for plotting print('Start training...') # train gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=[lgb_train, lgb_test], feature_name=['f' + str(i + 1) for i in range(28)], categorical_feature=[21], evals_result=evals_result, verbose_eval=10) print('Plot metrics during training...') ax = lgb.plot_metric(evals_result, metric='l1') plt.show() print('Plot feature importances...') ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() print('Plot 84th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) plt.show() print('Plot 84th tree with graphviz...') graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') graph.render(view=True)
def test_register_logger(tmp_path):
    """Check the log records captured after ``lgb.register_logger(logger)``.

    A file handler is attached to the ``"LightGBM"`` logger, a tiny binary
    model is trained with a custom metric that itself logs at DEBUG level,
    ``lgb.plot_metric`` is called, and the resulting log file (minus any
    GPU/CUDA-specific lines) is compared for equality with ``expected_log``.

    Args:
        tmp_path: directory-like object supporting ``/``; the log file
            ``LightGBM_test_logger.log`` is created inside it.
    """
    # Route every record of the "LightGBM" logger (DEBUG and above) into a
    # fresh file, formatted as "<LEVEL> | <message>".
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = str(tmp_path / "LightGBM_test_logger.log")
    file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        # Custom eval function: ignores both arguments, leaves a DEBUG marker
        # in the log and always reports the constant value 1.
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    # Four rows, three columns; the first two columns are constant.
    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    # The training set doubles as the only validation set; progress is
    # requested every 2 iterations (verbose_eval=2).
    lgb.train({
        'objective': 'binary',
        'metric': ['auc', 'binary_error']
    }, lgb_data, num_boost_round=10, feval=dummy_metric,
              valid_sets=[lgb_data], evals_result=eval_records,
              categorical_feature=[1], early_stopping_rounds=4, verbose_eval=2)

    # Called without ``metric=`` although several metrics were recorded; the
    # final WARNING entry of expected_log corresponds to this call.
    lgb.plot_metric(eval_records)

    # NOTE(review): the comparison at the end joins the actual records with
    # "\n", while this literal shows many records per physical line - confirm
    # the literal's line breaks are intact in the real file.
    expected_log = r""" WARNING | categorical_feature in Dataset is overridden. New categorical_feature is [1] INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant. 
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2 INFO | [LightGBM] [Info] Total Bins 0 INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0 INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000 INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | Training until validation scores don't improve for 4 rounds INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [2] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1 INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [4] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1 INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [6] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1 INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [8] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1 INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements DEBUG | In dummy_metric INFO | [LightGBM] [Warning] Stopped training because there are no more leaves 
that meet the split requirements DEBUG | In dummy_metric INFO | [10] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1 INFO | Did not meet early stopping. Best iteration is: [1] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1 WARNING | More than one metric available, picking one to plot. """.strip()

    # Prefixes of GPU/CUDA-specific records; any actual log line starting
    # with one of them is dropped before the comparison.
    gpu_lines = [
        "INFO | [LightGBM] [Info] This is the GPU trainer",
        "INFO | [LightGBM] [Info] Using GPU Device:",
        "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...",
        "INFO | [LightGBM] [Info] GPU programs have been built",
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
    ]

    with open(log_filename, "rt", encoding="utf-8") as f:
        actual_log = f.read().strip()

    # Keep only the records that are not GPU/CUDA-specific.
    actual_log_wo_gpu_stuff = []
    for line in actual_log.split("\n"):
        if not any(line.startswith(gpu_line) for gpu_line in gpu_lines):
            actual_log_wo_gpu_stuff.append(line)

    assert "\n".join(actual_log_wo_gpu_stuff) == expected_log
def lightgbm_model(x_train, x_test, y_train, y_test, group_id):
    """Grid-search, train, persist and plot a LightGBM regressor for ``qty``.

    Steps performed:

    1. Every column of ``x_train`` except ``'product_id'`` is used as a
       feature.
    2. A ``GridSearchCV`` over ``max_depth``, ``num_leaves``,
       ``min_child_samples`` and ``reg_alpha`` is run with an
       ``LGBMRegressor``; the best values overwrite the matching entries of
       the native training parameters.
    3. A booster is trained with ``lgb.train`` (up to 1000 rounds, early
       stopping after 200 rounds, validated on the test split).
    4. The booster is dumped to ``lg_model.pkl`` and the column list of
       ``x_train`` to ``lg_model_columns.pkl`` (both in the working
       directory), and the recorded ``mape`` curve is plotted.

    Args:
        x_train: training features; must expose ``.columns`` and contain a
            ``'product_id'`` column (``ValueError`` is raised otherwise).
        x_test: validation features, indexed with the same feature columns.
        y_train: training targets; the ``qty`` attribute is used as label.
        y_test: validation targets; the ``qty`` attribute is used as label.
        group_id: accepted for interface compatibility; not used here.

    Returns:
        int: always ``1``.
    """
    features = x_train.columns.tolist()
    features.remove('product_id')

    lgb_train = lgb.Dataset(x_train[features], y_train.qty)
    lgb_test = lgb.Dataset(x_test[features], y_test.qty, reference=lgb_train)

    # Baseline parameters for the native training API; four of them are
    # overwritten below with the grid-search winners.
    params = {
        'objective': 'regression',
        'learning_rate': 0.03,
        'lambda_l1': 0.5,
        'metric': {'mape', 'rmse'},
        'max_depth': 6,
        'num_leaves': 64,
        'min_data_in_leaf': 30,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'subsample_freq': 50,
        'verbose': 0
    }
    # Search space, expressed with the scikit-learn wrapper's parameter names.
    grid_params = {
        'max_depth': [6, 8],
        'num_leaves': [64, 126],
        'min_child_samples': [30, 40, 50],
        'reg_alpha': [0.001, 0.01, 0.03]
    }

    print("Start GridSearch for Parameters")
    lg = lgb.LGBMRegressor(objective='regression', n_jobs=3,
                           n_estimators=1000, silent=True, metric='rmse')
    grid = GridSearchCV(lg, grid_params, verbose=0, cv=4, n_jobs=3)
    grid.fit(x_train[features], y_train.qty)
    print(grid.best_params_)
    print(grid.best_score_)

    # Copy the winners into the native parameter dict; the last two differ in
    # name between the sklearn wrapper and the native parameters used above.
    best = grid.best_params_
    params['max_depth'] = best['max_depth']
    params['num_leaves'] = best['num_leaves']
    params['min_data_in_leaf'] = best['min_child_samples']
    params['lambda_l1'] = best['reg_alpha']

    evals_result = {}
    print("Start Model Training")
    lg_model = lgb.train(params, lgb_train, num_boost_round=1000,
                         valid_sets=lgb_test, evals_result=evals_result,
                         verbose_eval=200, early_stopping_rounds=200)

    # ``sklearn.externals.joblib`` no longer exists in current scikit-learn
    # releases; prefer the standalone package and keep the legacy import only
    # as a fallback for old environments.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    joblib.dump(lg_model, 'lg_model.pkl')
    print("LightGBM Model dumped!")

    # NOTE(review): this list still contains 'product_id', which the model
    # was not trained on - confirm that consumers of this file expect that.
    model_columns = list(x_train.columns)
    joblib.dump(model_columns, 'lg_model_columns.pkl')
    print("LightGBM Models columns dumped!")

    # plot training result
    ax = lgb.plot_metric(evals_result, metric='mape')
    plt.show()
    return 1
def main():
    """Run the whole pipeline: train, classify unseen routes, score, summarize.

    The two flags returned by ``parse_args()`` decide whether preprocessing
    and/or training results are recomputed instead of being deserialized.
    The function then:

    * builds the training data from ``./in/training/*`` and writes labeled
      KML files to ``./out/training/``;
    * fits the model on every column whose name contains ``"delta"`` or
      ``"rolling_mean"``; when training was recomputed, the training-metric
      and feature-importance plots are saved under ``./analysis/`` and shown;
    * preprocesses and classifies ``./in/unseen/*`` and writes classified KML
      files to ``./out/unseen/``;
    * scores the classified routes, sorts them by ascending ``"cost"`` and
      writes them, split by ``"to_b"`` / ``"to_a"`` in the file name, to
      ``./out/scored_unseen/``;
    * writes five KML summaries under ``./out/`` and prints the score tables.

    Returns:
        None.
    """
    # Whether we should invalidate preprocessing and or training data. When
    # invalidated, data will not be deserialized and instead will be
    # recomputed.
    invalidate_preprocessing, invalidate_training = parse_args()

    # Placemark paths.
    region_pmarks_path = Path("./in/placemarks/")
    label_pmarks_path = Path("./in/placemarks/training")

    ##
    # Train
    ##

    # Preprocessing for training data.
    training_out_path = Path("./out/training/")
    training_files = list(Path().glob("./in/training/*"))
    training_sets, training_files = create_training_data(
        training_files=training_files,
        training_out_path=training_out_path,
        region_pmarks_path=region_pmarks_path,
        label_pmarks_path=label_pmarks_path,
        invalidate=invalidate_preprocessing)

    # Write out the labeled kml for inspection.
    write_routes_kml(routes=training_sets,
                     files=training_files,
                     out_path=training_out_path,
                     file_label="labeled")

    # Features to use for training: any column that has 'delta' or
    # 'rolling_mean' in its name.
    features = [
        col for col in training_sets[0].columns
        if any(substr in col for substr in ("delta", "rolling_mean"))
    ]

    # Train the model.
    fitted_model, label_encoder, evals_result = fit_model(
        training_sets=training_sets,
        features=features,
        invalidate=invalidate_training)

    # Inspect training if it was recomputed.
    if invalidate_training:
        # Save plot of the training metric.
        training_img = Path("./analysis/training.png")
        print(
            f"\nPlotting metrics during training and saving to {training_img.name}."
        )
        lgb.plot_metric(evals_result, metric="multi_logloss")
        # Write the figure before show(): the message above promises a file,
        # and the figure may no longer be current once show() returns.
        training_img.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(str(training_img))
        plt.show()

        # Save plot of feature importances.
        feature_importances_img = Path("./analysis/feature_importances.png")
        print(
            f"\nPlotting feature importances and saving to {feature_importances_img.name}.\n"
        )
        lgb.plot_importance(fitted_model, max_num_features=len(features))
        feature_importances_img.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(str(feature_importances_img))
        plt.show()

    ##
    # Classify
    ##

    # Preprocessing for unseen data.
    unseen_out_path = Path("./out/unseen/")
    unseen_files = list(Path().glob("./in/unseen/*"))
    unseen_sets, unseen_files = create_unseen_data(
        unseen_files=unseen_files,
        unseen_out_path=unseen_out_path,
        region_pmarks_path=region_pmarks_path,
        invalidate=invalidate_preprocessing)

    # Classify the unseen data.
    classified_unseen_data = classify_unseen_data(
        model=fitted_model,
        unseen_sets=unseen_sets,
        features=features,
        label_encoder=label_encoder,
        invalidate=invalidate_training)

    # Write out the classified kml for inspection.
    write_routes_kml(routes=classified_unseen_data,
                     files=unseen_files,
                     out_path=unseen_out_path,
                     file_label="classified")

    ##
    # Score Routes
    ##

    # Make a recommendation for a route to take when going to A or to B.
    # Routes that do not start at either B or A are ignored.
    scored_routes = score_routes(classified_unseen_data=classified_unseen_data,
                                 files=unseen_files,
                                 invalidate=False)

    # Sort the routes by cost ascending.
    scored_routes = sorted(scored_routes, key=lambda data: data["cost"])

    # Separate the routes by their destination, taken from the file name.
    scored_to_b_routes = [
        rdata for rdata in scored_routes if "to_b" in rdata["file"].name
    ]
    scored_to_a_routes = [
        rdata for rdata in scored_routes if "to_a" in rdata["file"].name
    ]

    # Write out the scored routes: B first, then A, with identical options.
    scored_out_path = Path("./out/scored_unseen/")
    for destination_routes in (scored_to_b_routes, scored_to_a_routes):
        write_routes_kml(routes=[data["route"] for data in destination_routes],
                         files=[data["file"] for data in destination_routes],
                         out_path=scored_out_path,
                         file_label="scored",
                         stops_as_path=False,
                         turns_as_path=False,
                         altitude_as_speed=False)

    ##
    # Summarize
    ##

    # Training routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/training_routes_summary.kml"),
        coords_list=training_sets,
        fnames=training_files)

    # Unseen routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/unseen_routes_summary.kml"),
        coords_list=unseen_sets,
        fnames=unseen_files)

    # All routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/all_routes_summary.kml"),
        coords_list=training_sets + unseen_sets,
        fnames=training_files + unseen_files)

    # Scored to B routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/scored_to_b_summary.kml"),
        coords_list=[data["route"] for data in scored_to_b_routes],
        fnames=[data["file"] for data in scored_to_b_routes])

    # Scored to A routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/scored_to_a_summary.kml"),
        coords_list=[data["route"] for data in scored_to_a_routes],
        fnames=[data["file"] for data in scored_to_a_routes])

    # Summarize scores. Column width is left unlimited; ``None`` is the
    # current spelling, ``-1`` is kept only for pandas versions that still
    # reject ``None``.
    try:
        pd.set_option("display.max_colwidth", None)
    except ValueError:
        pd.set_option("display.max_colwidth", -1)
    pd.set_option("display.expand_frame_repr", False)

    routes_table = create_scored_routes_table(scored_routes)
    to_b_table = routes_table[routes_table.destination == "B"].drop(
        "destination", axis=1)
    to_a_table = routes_table[routes_table.destination == "A"].drop(
        "destination", axis=1)

    # Show the route scores for going to B.
    print("###### To B Scored Routes ######")
    print(to_b_table)

    # Show the route scores for going to A.
    print("###### To A Scored Routes ######")
    print(to_a_table)

    # Show all route scores.
    print("###### All Scored Routes ######")
    print(routes_table)
    print()