def test_continue_train(self):
    """Continuing training from a saved model file works, and a custom
    feval tracks the built-in l1 metric exactly."""
    X, y = load_boston(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    params = {'objective': 'regression', 'metric': 'l1', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                           free_raw_data=False)
    model_name = 'model.txt'
    # warm-start model persisted to disk, then resumed via init_model
    lgb.train(params, lgb_train, num_boost_round=20).save_model(model_name)

    def custom_mae(preds, eval_data):
        # mirrors the built-in l1 metric
        return 'mae', mean_absolute_error(preds, eval_data.get_label()), False

    evals_result = {}
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=30,
                    valid_sets=lgb_eval,
                    verbose_eval=False,
                    feval=custom_mae,
                    evals_result=evals_result,
                    init_model='model.txt')
    ret = mean_absolute_error(y_test, gbm.predict(X_test))
    self.assertLess(ret, 3.5)
    self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
    for l1, mae in zip(evals_result['valid_0']['l1'],
                       evals_result['valid_0']['mae']):
        self.assertAlmostEqual(l1, mae, places=5)
    os.remove(model_name)
def test_plot_metrics(self):
    """plot_metric works from an evals_result dict and from a fitted
    sklearn estimator, honouring title/label overrides."""
    eval_data = lgb.Dataset(self.X_test, self.y_test, reference=self.train_data)
    self.params.update({"metric": {"binary_logloss", "binary_error"}})

    history = {}
    lgb.train(self.params, self.train_data,
              valid_sets=[self.train_data, eval_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              evals_result=history,
              verbose_eval=False)
    ax = lgb.plot_metric(history)
    self.assertIsInstance(ax, matplotlib.axes.Axes)
    self.assertEqual(ax.get_title(), 'Metric during training')
    self.assertEqual(ax.get_xlabel(), 'Iterations')
    self.assertIn(ax.get_ylabel(), {'binary_logloss', 'binary_error'})
    ax = lgb.plot_metric(history, metric='binary_error')
    ax = lgb.plot_metric(history, metric='binary_logloss', dataset_names=['v2'])

    # training without a validation set leaves nothing to plot
    empty_history = {}
    lgb.train(self.params, self.train_data, num_boost_round=10,
              evals_result=empty_history, verbose_eval=False)
    self.assertRaises(ValueError, lgb.plot_metric, empty_history)

    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    clf.fit(self.X_train, self.y_train,
            eval_set=[(self.X_test, self.y_test)], verbose=False)
    ax = lgb.plot_metric(clf, title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(ax, matplotlib.axes.Axes)
    self.assertEqual(ax.get_title(), '')
    self.assertEqual(ax.get_xlabel(), '')
    self.assertEqual(ax.get_ylabel(), '')
def test_early_stopping(self):
    """best_iteration reflects whether early stopping actually triggered."""
    X, y = load_breast_cancer(True)
    params = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1}
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    valid_set_name = 'valid_set'

    def fit(num_boost_round=None):
        kwargs = {} if num_boost_round is None else {'num_boost_round': num_boost_round}
        return lgb.train(params, lgb_train,
                         valid_sets=lgb_eval,
                         valid_names=valid_set_name,
                         verbose_eval=False,
                         early_stopping_rounds=5,
                         **kwargs)

    # too few rounds for early stopping to kick in
    gbm = fit(num_boost_round=10)
    self.assertEqual(gbm.best_iteration, 10)
    self.assertIn(valid_set_name, gbm.best_score)
    self.assertIn('binary_logloss', gbm.best_score[valid_set_name])

    # default round count: early stopping caps best_iteration
    gbm = fit()
    self.assertLessEqual(gbm.best_iteration, 100)
    self.assertIn(valid_set_name, gbm.best_score)
    self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001):
    """Train a binary LightGBM model and predict on one or two test sets.

    When test_y is given, trains with early stopping against (test_X, test_y)
    and reports the AUC as `loss`; otherwise trains for the full `rounds`.
    Returns (pred_test_y, loss, pred_test_y2); pred_test_y2 is None when
    test_X2 was not supplied.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "max_depth": dep,
        "min_data_in_leaf": 100,
        "learning_rate": eta,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": seed_val,
        "verbosity": -1,
    }
    num_rounds = rounds
    lgtrain = lgb.Dataset(train_X, label=train_y)
    if test_y is not None:
        # labelled test set doubles as the early-stopping validation set
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        # bug fix: the old branch built `lgb.DMatrix(test_X)` — DMatrix is
        # the xgboost API and does not exist in lightgbm (and the object was
        # never used anyway)
        model = lgb.train(params, lgtrain, num_rounds)
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # bug fix: only predict on the second test set when it was supplied;
    # the old code called predict(None) whenever test_X2 was omitted
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)  # bug fix: Python 2 `print loss` statement
    return pred_test_y, loss, pred_test_y2
def test_feature_name(self):
    """Custom feature names round-trip; embedded spaces come back as '_'."""
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1,
                                              random_state=42)
    train_set = lgb.Dataset(X_train, y_train)
    expected = ['f_' + str(i) for i in range(13)]
    booster = lgb.train({'verbose': -1}, train_set, num_boost_round=5,
                        feature_name=expected)
    self.assertListEqual(expected, booster.feature_name())
    # names containing whitespace are normalised back to the '_' form
    spaced = ['f ' + str(i) for i in range(13)]
    booster = lgb.train({'verbose': -1}, train_set, num_boost_round=5,
                        feature_name=spaced)
    self.assertListEqual(expected, booster.feature_name())
def SubmissionSimple(data, submit=False):
    """Train on days 18-23 with day 24 as validation; optionally write a
    submission for the unlabelled rows. Returns the trained booster."""
    features = [c for c in data.columns if c not in drop_list]
    train = data[data.is_trade.notnull()]
    test = data[data.is_trade.isnull()]
    train_part = train[(train['day'] >= 18) & (train['day'] < 24)]
    valid_part = train[train['day'] == 24]
    dtrain = lgb.Dataset(train_part[features], train_part[target],
                         free_raw_data=False)
    dvalid = lgb.Dataset(valid_part[features], valid_part[target],
                         reference=dtrain, free_raw_data=False)
    print('\ntraining...')
    gbm = lgb.train(params=params,
                    train_set=dtrain,
                    valid_sets=[dtrain, dvalid],
                    num_boost_round=10000,
                    early_stopping_rounds=500,
                    verbose_eval=100)
    if submit:
        print('\npredicting...')
        test['predicted_score'] = gbm.predict(test[features],
                                              num_iteration=gbm.best_iteration)
        result = test[['instance_id', 'predicted_score']]
        # align against the official instance_id list; unseen ids get 0
        ids = pd.DataFrame(pd.read_csv(wd+test_file[2], sep=' ')['instance_id'])
        result = ids.merge(result, on='instance_id', how='left').fillna(0)
        print('\nsaving...')
        result.to_csv(wd+output_file, sep=' ', index=False)
    return gbm
def test_plot_importance(self):
    """plot_importance renders bar charts with configurable labels/colors."""
    gbm0 = lgb.train(self.params, self.train_data, num_boost_round=10)

    ax = lgb.plot_importance(gbm0)
    self.assertIsInstance(ax, matplotlib.axes.Axes)
    self.assertEqual(ax.get_title(), 'Feature importance')
    self.assertEqual(ax.get_xlabel(), 'Feature importance')
    self.assertEqual(ax.get_ylabel(), 'Features')
    self.assertLessEqual(len(ax.patches), 30)

    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    clf.fit(self.X_train, self.y_train)
    ax = lgb.plot_importance(clf, color='r', title='t', xlabel='x', ylabel='y')
    self.assertIsInstance(ax, matplotlib.axes.Axes)
    self.assertEqual(ax.get_title(), 't')
    self.assertEqual(ax.get_xlabel(), 'x')
    self.assertEqual(ax.get_ylabel(), 'y')
    self.assertLessEqual(len(ax.patches), 30)
    for patch in ax.patches:
        self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

    # per-bar color list; None wipes the default title/labels
    ax = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'],
                             title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(ax, matplotlib.axes.Axes)
    self.assertEqual(ax.get_title(), '')
    self.assertEqual(ax.get_xlabel(), '')
    self.assertEqual(ax.get_ylabel(), '')
    self.assertLessEqual(len(ax.patches), 30)
    expected_colors = [(1., 0, 0, 1.),      # r
                       (.75, .75, 0, 1.),   # y
                       (0, .5, 0, 1.),      # g
                       (0, 0, 1., 1.)]      # b
    for patch, color in zip(ax.patches[:4], expected_colors):
        self.assertTupleEqual(patch.get_facecolor(), color)
def test_multiclass_prediction_early_stopping(self):
    """Prediction-time early stopping: a tight margin degrades the loss,
    a loose margin nearly matches the full model."""
    X, y = load_digits(10, True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 10,
        'verbose': -1
    }
    lgb_train = lgb.Dataset(X_train, y_train, params=params)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
    history = {}
    gbm = lgb.train(params, lgb_train, num_boost_round=50,
                    valid_sets=lgb_eval, verbose_eval=False,
                    evals_result=history)

    def loss_with_margin(margin):
        pred_parameter = {"pred_early_stop": True,
                          "pred_early_stop_freq": 5,
                          "pred_early_stop_margin": margin}
        preds = gbm.predict(X_test, pred_parameter=pred_parameter)
        return multi_logloss(y_test, preds)

    # aggressive stopping: loss is higher than evaluating the full model
    ret = loss_with_margin(1.5)
    self.assertLess(ret, 0.8)
    self.assertGreater(ret, 0.5)
    # conservative stopping: close to full-model quality
    self.assertLess(loss_with_margin(5.5), 0.2)
def test_template(params=None, X_y=None, feval=mean_squared_error,
                  num_round=100, init_model=None, custom_eval=None,
                  early_stopping_rounds=10,
                  return_data=False, return_model=False):
    """Shared training harness.

    Returns (lgb_train, lgb_eval) when return_data, the trained booster when
    return_model, otherwise (evals_result, feval score on the test split).
    """
    # bug fix: the old signature used a mutable default dict — which this
    # function mutates below, so 'verbose'/'seed' leaked into every later
    # call — and evaluated load_boston(True) at definition time.  Build
    # both defaults lazily instead; passing None keeps callers working.
    if params is None:
        params = {'objective': 'regression', 'metric': 'l2'}
    if X_y is None:
        X_y = load_boston(True)
    params['verbose'], params['seed'] = -1, 42
    X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1,
                                                        random_state=42)
    lgb_train = lgb.Dataset(X_train, y_train, params=params)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
    if return_data:
        return lgb_train, lgb_eval
    evals_result = {}
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=num_round,
                    valid_sets=lgb_eval,
                    valid_names='eval',
                    verbose_eval=False,
                    feval=custom_eval,
                    evals_result=evals_result,
                    early_stopping_rounds=early_stopping_rounds,
                    init_model=init_model)
    if return_model:
        return gbm
    else:
        return evals_result, feval(y_test,
                                   gbm.predict(X_test, gbm.best_iteration))
def train(self, xtra, ytra, xte, yte):
    """Fit a LightGBM booster on (xtra, ytra), evaluating on (xte, yte).

    Stores the trained booster on self.gbdt.
    """
    ytra = ytra.ravel()
    yte = yte.ravel()
    dtrain = lgb.Dataset(xtra, label=ytra)
    dvalid = lgb.Dataset(xte, label=yte)
    # bug fix: the old code built an xgboost-style watchlist of
    # (Dataset, name) tuples that lgb.train never received, so the
    # validation set was constructed and then silently ignored.  Pass it
    # through the lightgbm API instead.
    self.gbdt = lgb.train(self.param, dtrain, self.nrounds,
                          valid_sets=[dtrain, dvalid],
                          valid_names=['train', 'eval'])
def main():
    """Second-layer stacking: train on the first-layer score files and write
    the averaged predictions for the test pairs."""
    res = []
    num_iterations = params['num_iterations']
    early_stopping_round = params['early_stopping_round']
    print(params)
    for i in range(cnt):
        train_fea = pd.read_csv(root_path + 'train_score_{}.csv'.format(i))
        train_lab = pd.read_csv(root_path + 'label_{}.csv'.format(i))
        train_lab = train_lab.loc[:, 'label'].values
        lgb_train = lgb.Dataset(train_fea, train_lab)
        solver = lgb.train(params, lgb_train,
                           valid_sets=[lgb_train],
                           valid_names=['train'],
                           verbose_eval=True,
                           num_boost_round=num_iterations,
                           early_stopping_rounds=early_stopping_round)
        pred_fea = pd.read_csv(root_path + 'res_score.csv')
        # drop the column produced by model i before predicting with it
        # NOTE(review): columns read from csv may be strings ('0', '1', ...);
        # confirm drop([i]) matches the actual labels
        pred_fea = pred_fea.drop([i], axis=1).values
        # bug fix: predict() takes the best *iteration* index; the old code
        # passed solver.best_score, which is a dict of metric results
        res.append(solver.predict(pred_fea, num_iteration=solver.best_iteration))
    pd.DataFrame(np.array(res).T).to_csv(root_path + 'res_score2.csv',
                                         index=False)
    res = np.mean(res, axis=0)
    pred_pair = pd.read_csv(root_path + 'test1.csv')
    pred_pair['score'] = res
    pred_pair['score'] = pred_pair['score'].apply(lambda x: '{:.6f}'.format(x))
    pred_pair.to_csv(root_path + 'submission-5000-layer2.csv', index=False)
def test_missing_value_handle_none(self):
    """With use_missing disabled, the NaN sample scores like the zero sample."""
    raw_x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
    raw_y = [0, 1, 1, 1, 0, 0, 0, 0, 0]
    X_train = np.array(raw_x).reshape(len(raw_x), 1)
    y_train = np.array(raw_y)
    train_set = lgb.Dataset(X_train, y_train)
    eval_set = lgb.Dataset(X_train, y_train)
    params = {
        'objective': 'regression',
        'metric': 'auc',
        'verbose': -1,
        'boost_from_average': False,
        'min_data': 1,
        'num_leaves': 2,
        'learning_rate': 1,
        'min_data_in_bin': 1,
        'use_missing': False
    }
    history = {}
    booster = lgb.train(params, train_set, num_boost_round=1,
                        valid_sets=eval_set, verbose_eval=True,
                        evals_result=history)
    pred = booster.predict(X_train)
    self.assertAlmostEqual(pred[0], pred[1], places=5)
    # the trailing NaN row must match the first (zero) row
    self.assertAlmostEqual(pred[-1], pred[0], places=5)
def test_simple(self):
    """One training round on iris with early stopping ends at iteration 1."""
    iris = load_iris()  # dataset already on disk
    train_set = lgb.Dataset(iris.data[:100], iris.target[:100])
    eval_set = lgb.Dataset(iris.data[100:], iris.target[100:],
                           reference=train_set)
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    # a single round keeps the test fast
    booster = lgb.train(params, train_set, num_boost_round=1,
                        valid_sets=eval_set, early_stopping_rounds=1)
    self.assertEqual(1, booster.best_iteration)
def test_categorical_handle2(self):
    """A single categorical column of {0, NaN} is fit perfectly in one round."""
    raw_x = [0, np.nan, 0, np.nan, 0, np.nan]
    raw_y = [0, 1, 0, 1, 0, 1]
    X_train = np.array(raw_x).reshape(len(raw_x), 1)
    y_train = np.array(raw_y)
    train_set = lgb.Dataset(X_train, y_train)
    eval_set = lgb.Dataset(X_train, y_train)
    params = {
        'objective': 'regression',
        'metric': 'auc',
        'verbose': -1,
        'boost_from_average': False,
        'min_data': 1,
        'num_leaves': 2,
        'learning_rate': 1,
        'min_data_in_bin': 1,
        'min_data_per_group': 1,
        'cat_smooth': 1,
        'cat_l2': 0,
        'max_cat_to_onehot': 1,
        'zero_as_missing': False,
        'categorical_column': 0
    }
    history = {}
    booster = lgb.train(params, train_set, num_boost_round=1,
                        valid_sets=eval_set, verbose_eval=True,
                        evals_result=history)
    np.testing.assert_almost_equal(booster.predict(X_train), raw_y)
def test_reference_chain(self):
    """Training works on datasets built via chained subset() calls."""
    X = np.random.normal(size=(100, 2))
    y = np.random.normal(size=100)
    base = lgb.Dataset(X, y)
    train_subset = base.subset(np.arange(80))
    # a subset of a subset keeps the reference chain back to `base`
    valid_subset = base.subset(np.arange(80, 100)).subset(np.arange(18))
    params = {'objective': 'regression_l2', 'metric': 'rmse'}
    lgb.train(params, train_subset, num_boost_round=20,
              valid_sets=[train_subset, valid_subset])
def train_gbm_model(train_days_seq, test_day, model_path, best_num, model_type="cv",):
    """Train (or load) a LightGBM model and report its accuracy on `test_day`.

    NOTE(review): written in Python 2 (print statements) -- runs only under
    Python 2.  Everything after exit() (submission-file generation) is dead
    code and is never reached.
    """
    # feature whitelist: rows flagged gbm_feats==1 in the feature table
    models_usedfeats_df = pd.read_csv( models_usedfeats_path )
    best_feats = models_usedfeats_df[ models_usedfeats_df['gbm_feats']==1 ]['feat_name'].values
    # any existing model file is deleted first, so the load branch below
    # can never be taken -- presumably intentional (force retrain); confirm
    if os.path.exists(model_path):
        os.remove(model_path);
        print "Remove file %s."%(model_path)
    if os.path.exists(model_path):
        bst = lgb.Booster(model_file=model_path)
    else:
        ################################# training data ##########################################
        data, labels = get_merge_train_data_set(train_days_seq)
        data = data[ best_feats ]
        ################################## sample stats ##########################################
        print "all samples: %d*%d,pos/nag=%f"%(len(data.index),len(data.columns),1.0*len(labels[labels.label==1])/len(labels[labels.label==0]));
        ################################## evaluation data #######################################
        X_train, X_test, y_train, y_test = train_test_split(data.values, labels['label'].values, test_size=0.15, random_state=0)
        all_samples_train = lgb.Dataset(data.values, labels['label'].values);
        deval = lgb.Dataset( X_test, y_test )
        columns = data.columns;
        data = None;  # release the frame early to reduce memory
        # NOTE(review): objective is 'regression' but the metric is
        # binary_logloss -- confirm the objective is intended
        params = {
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'num_leaves': 300,
            'learning_rate': 0.05,
            'verbose': 0,
            'metric': {'binary_logloss'},
            #'device':'gpu',
        }
        num_boost_round = 130
        # if model_type == "cv":
        #     print "start cv, please wait ........"
        #     cv_history = lgb.cv(params, all_samples_train, num_boost_round, nfold=5, seed=2017, metrics = {"binary_logloss"}, early_stopping_rounds= 10, callbacks=[lgb.callback.print_evaluation(show_stdv=True)]);
        #     history_df = pd.DataFrame(cv_history)
        #     num_boost_round = len(history_df.index)
        # else:
        num_boost_round = 99 # 99 -- overrides the 130 set above
        bst = lgb.train(params, all_samples_train, num_boost_round, valid_sets = deval )
        bst.save_model(model_path)
        #feature_importance2file(bst, history_df, num_boost_round, feature_importance_file_path, columns, model_name='gbm')
    ############################## check accuracy #############################################
    test_data, test_labels_df = make_train_set(test_day,test_day+1000000)
    test_data = test_data[ best_feats ]
    print "all samples: %d*%d."%(len(test_data.index),len(test_data.columns));
    y = bst.predict( test_data.values )
    report( test_labels_df['label'], y )
    test_labels_df, test_data = None, None  # free memory
    exit();
    ############################ build submission file (unreachable) ##########################
    test_data, test_labels = make_train_set(31000000, 32000000, sub=True)
    instanceID = test_data['instanceID'].copy();
    del test_data['instanceID']
    test_data = test_data[ best_feats ]
    print "sub samples: %d*%d"%(len(test_data.index),len(test_data.columns))
    y = bst.predict( test_data.values )
    pred = pd.concat([instanceID, pd.Series(y, name='prob')], axis=1)
    pred = pred.sort_values('instanceID',ascending=True)
    fun = lambda x: 0.0 if x < 0 else x  # why are there still negative predictions?
    pred['prob'] = pred['prob'].map(fun)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
def lgb_train_predict(train_x, train_y, test_x, params, rounds):
    """Train LightGBM on (train_x, train_y) and predict on test_x.

    Logs the column list and the elapsed wall time.
    Returns (model, pred).
    """
    # bug fix: time.clock() was deprecated since 3.3 and removed in
    # Python 3.8; perf_counter() is the documented replacement
    start = time.perf_counter()
    log(str(train_x.columns))
    dtrain = lgb.Dataset(train_x, label=train_y)
    valid_sets = [dtrain]
    model = lgb.train(params, dtrain, rounds, valid_sets,
                      feval=eval_auc_f1, verbose_eval=5)
    pred = model.predict(test_x)
    elapsed = time.perf_counter() - start
    log('Time used:' + str(elapsed) + 's')
    return model, pred
def train(self, x_train, y_train, x_val, y_val):
    """Fit a LightGBM model with early stopping on the validation split.

    Returns the trained booster.
    """
    print('train with lgb model')
    lgbtrain = lgb.Dataset(x_train, y_train)
    lgbval = lgb.Dataset(x_val, y_val)
    # bug fix: `early_stopping_rounds = self.early_stopping_rounds)` dangled
    # after the closing parenthesis of lgb.train(...) — a SyntaxError; it is
    # now passed as a keyword argument inside the call
    model = lgb.train(self.params, lgbtrain,
                      valid_sets=lgbval,
                      verbose_eval=self.num_boost_round,
                      num_boost_round=self.num_boost_round,
                      early_stopping_rounds=self.early_stopping_rounds)
    return model
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target',
                      objective='binary', metrics='auc', feval=None,
                      early_stopping_rounds=20, num_boost_round=3000,
                      verbose_eval=10, categorical_features=None):
    """Train LightGBM without CV, early-stopping on a fixed validation frame.

    `metrics` is both the eval metric and the key used to report the final
    validation score. Returns (booster, best_iteration).
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        # bug fix: 'metric' appeared twice in this literal; the duplicate
        # key silently overwrote the first entry
        'metric': metrics,
        'learning_rate': 0.2,
        'num_leaves': 31,  # should be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # minimum data needed in a child (min_data_in_leaf)
        'max_bin': 255,  # number of bucketed bins for feature values
        'subsample': 0.6,  # subsample ratio of the training instances
        'subsample_freq': 0,  # frequency of subsample, <=0 means disabled
        'colsample_bytree': 0.3,  # subsample ratio of columns per tree
        'min_child_weight': 5,  # minimum sum of instance hessian in a leaf
        'subsample_for_bin': 200000,  # number of samples for constructing bins
        'min_split_gain': 0,  # min_gain_to_split regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
    }
    lgb_params.update(params)

    print("preparing validation datasets")
    xgtrain = lgb.Dataset(dtrain[predictors].values,
                          label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)
    xgvalid = lgb.Dataset(dvalid[predictors].values,
                          label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)

    evals_results = {}
    bst1 = lgb.train(lgb_params, xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10,
                     feval=feval)

    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)
    print(metrics + ":", evals_results['valid'][metrics][bst1.best_iteration - 1])

    return (bst1, bst1.best_iteration)
def main():
    """First-layer CV stacking driver.

    For each of `cnt` folds: train on fold i, validate on fold (i+1) % cnt,
    append the fold's test-set scores as a new column of res_score.csv and
    its out-of-fold scores to the per-fold train_score_*.csv files, then
    write the averaged submission.
    """
    num_iterations = params['num_iterations']
    early_stopping_round = params['early_stopping_round']
    print(params)
    for i in range(cnt):
        train_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format(i))
        train_lab = pd.read_csv(root_path +
                                'label_{}.csv'.format(i)).loc[:, 'label'].values
        # the next fold (cyclically) serves as the validation set
        valid_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format((i + 1) % cnt))
        valid_lab = pd.read_csv(root_path +
                                'label_{}.csv'.format((i + 1) % cnt)).loc[:, 'label'].values
        lgb_train = lgb.Dataset(train_fea, label=train_lab)
        lgb_valid = lgb.Dataset(valid_fea, label=valid_lab, reference=lgb_train)
        print('training cnt={}/{}'.format(i + 1, cnt))
        solver = lgb.train(params, lgb_train,
                           valid_sets=[lgb_train, lgb_valid],
                           valid_names=['train', 'valid'],
                           verbose_eval=True,
                           num_boost_round=num_iterations,
                           early_stopping_rounds=early_stopping_round)
        # predict the held-out test features for this fold and append the
        # column to the accumulated res_score.csv (created on first fold)
        pred_fea = scipy.sparse.load_npz(root_path + 'pred_{}.npz'.format(i))
        pred_label = solver.predict(pred_fea, num_iteration=solver.best_iteration)
        if os.path.exists(root_path + 'res_score.csv'):
            res = list(pd.read_csv(root_path + 'res_score.csv').values.T)
        else:
            res = []
        res.append(pred_label)
        pd.DataFrame(np.array(res).T).to_csv(root_path +
                                             'res_score.csv', index=False)
        # out-of-fold scores for every other fold (consumed by layer 2)
        for j in range(cnt):
            if j == i:
                continue
            pred_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format(j))
            pred_label = solver.predict(pred_fea, num_iteration=solver.best_iteration)
            if os.path.exists(root_path + 'train_score_{}.csv'.format(j)):
                train_res = list(pd.read_csv(root_path +
                                             'train_score_{}.csv'.format(j)).values.T)
            else:
                train_res = []
            train_res.append(pred_label)
            pd.DataFrame(np.array(train_res).T).to_csv(root_path +
                                                       'train_score_{}.csv'.format(j), index=False)
        gc.collect()
    # average the per-fold test predictions into the submission scores
    res = np.mean(res, axis=0)
    pred_pair = pd.read_csv(root_path + 'test1.csv')
    pred_pair['score'] = res
    pred_pair['score'] = pred_pair['score'].apply(lambda x: '{:.6f}'.format(x))
    pred_pair.to_csv(root_path + 'submission-5000.csv', index=False)
def test_template(init_model=None, return_model=False):
    """Train ten boston rounds, optionally continuing from init_model.

    Returns the booster when return_model is True, otherwise the MSE on
    the held-out split.
    """
    X, y = load_boston(True)
    params = {'objective': 'regression', 'metric': 'l2', 'verbose': -1}
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    booster = lgb.train(params, lgb.Dataset(X_train, y_train),
                        num_boost_round=10, init_model=init_model)
    if return_model:
        return booster
    return mean_squared_error(y_test, booster.predict(X_test))
def test_pandas_categorical(self):
    """Predictions are invariant to how categorical columns are declared
    (auto-detected, by index, by name, all columns) and survive a
    save/load round-trip."""
    import pandas as pd
    X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                      "B": np.random.permutation([1, 2, 3] * 100),  # int
                      "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                      "D": np.random.permutation([True, False] * 150)})  # bool
    y = np.random.permutation([0, 1] * 150)
    X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
                           "B": np.random.permutation([1, 3] * 30),
                           "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                           "D": np.random.permutation([True, False] * 30)})
    for col in ["A", "B", "C", "D"]:
        X[col] = X[col].astype('category')
        X_test[col] = X_test[col].astype('category')
    params = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1}

    # None -> rely on pandas category dtype auto-detection
    cat_specs = [None, [0], ['A'], ['A', 'B', 'C', 'D']]
    preds = []
    for spec in cat_specs:
        kwargs = {} if spec is None else {'categorical_feature': spec}
        booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=10,
                            verbose_eval=False, **kwargs)
        preds.append(list(booster.predict(X_test)))

    # the persisted model must reproduce the in-memory predictions
    booster.save_model('categorical.model')
    reloaded = lgb.Booster(model_file='categorical.model')
    preds.append(list(reloaded.predict(X_test)))

    for other in preds[1:]:
        np.testing.assert_almost_equal(preds[0], other)
def test_plot_metrics(self):
    """plot_metric works from an evals_result dict and from a fitted
    sklearn estimator, honouring title/label overrides."""
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(True), test_size=0.1, random_state=1)
    train_data = lgb.Dataset(X_train, y_train)
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params = {
        "objective": "binary",
        "metric": {"binary_logloss", "binary_error"},
        "verbose": -1,
        "num_leaves": 3
    }

    history = {}
    lgb.train(params, train_data,
              valid_sets=[train_data, test_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              evals_result=history,
              verbose_eval=False)
    ax = lgb.plot_metric(history)
    self.assertIsInstance(ax, matplotlib.axes.Axes)
    self.assertEqual(ax.get_title(), 'Metric during training')
    self.assertEqual(ax.get_xlabel(), 'Iterations')
    self.assertIn(ax.get_ylabel(), {'binary_logloss', 'binary_error'})
    ax = lgb.plot_metric(history, metric='binary_error')
    ax = lgb.plot_metric(history, metric='binary_logloss', dataset_names=['v2'])

    # training without a validation set leaves nothing to plot
    empty_history = {}
    lgb.train(params, train_data, num_boost_round=10,
              evals_result=empty_history, verbose_eval=False)
    self.assertRaises(ValueError, lgb.plot_metric, empty_history)

    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    ax = lgb.plot_metric(clf, title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(ax, matplotlib.axes.Axes)
    self.assertEqual(ax.get_title(), '')
    self.assertEqual(ax.get_xlabel(), '')
    self.assertEqual(ax.get_ylabel(), '')
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=10, seed=0, rounds=20000):
    """Train a LightGBM regressor and predict on one or two test sets.

    When test_y is given, trains with early stopping against it, prints the
    RMSE, and returns (pred_test_y, loss, pred_test_y2, best_iteration);
    otherwise returns just pred_test_y. pred_test_y2 is None when test_X2
    was not supplied.
    """
    params = {}
    params["objective"] = "regression"
    params['metric'] = 'rmse'
    params["max_depth"] = dep
    params["min_data_in_leaf"] = 100
    params["learning_rate"] = 0.04
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.5
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed
    #params["lambda_l2"] = 0.01
    params["verbosity"] = -1
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)
    if test_y is not None:
        # labelled test set doubles as the early-stopping validation set
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=200, verbose_eval=100)
    else:
        # bug fix: removed the unused `lgtest = lgb.Dataset(test_X)` — an
        # unlabelled Dataset was built here and never referenced
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # bug fix: pred_test_y2 was only bound when test_X2 was given, yet it
    # is returned on the test_y branch — that was a NameError whenever
    # test_y was supplied without test_X2
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    imps = model.feature_importance()
    names = model.feature_name()
    for fi, fn in enumerate(names):
        print(fn, imps[fi])

    loss = 0
    if test_y is not None:
        loss = np.sqrt(metrics.mean_squared_error(test_y, pred_test_y))
        print(loss)
        return pred_test_y, loss, pred_test_y2, model.best_iteration
    else:
        return pred_test_y
def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):
    """Train one CV fold with early stopping.

    Returns (out-of-fold predictions, test-set predictions, booster).
    """
    train_set = lgbm.Dataset(kfold_X_train, label=y_train)
    watch_set = lgbm.Dataset(kfold_X_valid, label=y_test)
    best = lgbm.train(self.params, train_set,
                      num_boost_round=4000,
                      verbose_eval=100,
                      valid_sets=watch_set,
                      early_stopping_rounds=100)
    # out-of-fold predictions for this validation split
    pred = best.predict(kfold_X_valid)
    results = best.predict(test)
    return pred, results, best
def test_continue_train_multiclass(self):
    """Continuing multiclass training from an in-memory booster works."""
    X, y = load_iris(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 3,
        'verbose': -1
    }
    train_set = lgb.Dataset(X_train, y_train, params=params,
                            free_raw_data=False)
    eval_set = lgb.Dataset(X_test, y_test, reference=train_set,
                           params=params, free_raw_data=False)
    # warm-start booster handed to init_model directly (no file round-trip)
    warm_start = lgb.train(params, train_set, num_boost_round=20)
    history = {}
    booster = lgb.train(params, train_set, num_boost_round=30,
                        valid_sets=eval_set, verbose_eval=False,
                        evals_result=history, init_model=warm_start)
    ret = multi_logloss(y_test, booster.predict(X_test))
    self.assertLess(ret, 1.5)
    self.assertAlmostEqual(history['valid_0']['multi_logloss'][-1], ret,
                           places=5)
def prepare_model(self, obj_fn=None, num_steps: int = 0, model_params=None,
                  batch_size: int = None):
    """Train a LightGBM booster on the dataset groups held by self.ds.

    obj_fn is forwarded as a custom eval function (feval). batch_size is
    accepted for interface compatibility but not used here.
    Returns self.ml_model wrapping the trained booster.
    """
    data_train = self.ds[self.data_groups["data_train_group"]].to_ndarray()
    target_train = self.ds[self.data_groups["target_train_group"]].to_ndarray()
    data_val = self.ds[self.data_groups["data_validation_group"]].to_ndarray()
    target_val = self.ds[self.data_groups["target_validation_group"]].to_ndarray()
    columns = None
    data_train_ds = lgb.Dataset(data_train, label=target_train,
                                feature_name=columns)
    data_valid_ds = lgb.Dataset(data_val, label=target_val,
                                feature_name=columns)
    num_round = num_steps
    # bug fix: early_stopping_rounds must be an int; `num_round / 2` is a
    # float under Python 3 true division
    bst = lgb.train(model_params, data_train_ds, num_round,
                    valid_sets=[data_valid_ds],
                    early_stopping_rounds=num_round // 2,
                    feval=obj_fn, verbose_eval=True)
    return self.ml_model(lgb, bst=bst)
def scores_cv_params(params):
    """Mean validation MSE of `params` over the global CV fold index pairs.

    Uses the module-level train_data / train_label and the pre-computed
    train_index / valid_index fold lists.
    """
    # bug fix: removed the dead `train_scores = []` accumulator — it was
    # never appended to or returned
    valid_scores = []
    for train_ind, valid_ind in zip(train_index, valid_index):
        train_x = train_data[train_ind, :]
        train_y = train_label[list(train_ind)]
        valid_x = train_data[valid_ind, :]
        valid_y = train_label[list(valid_ind)]
        df_train = lgb.Dataset(train_x, label=train_y)
        lmodel = lgb.train(params, df_train)
        valid_pred = lmodel.predict(valid_x)
        valid_scores.append(metrics.mean_squared_error(valid_y, valid_pred))
    return np.mean(valid_scores)
def train_and_get_predictions(features, labels):
    """Fit a small binary LightGBM model and return its in-sample predictions."""
    dataset = lgb.Dataset(features, label=labels)
    booster = lgb.train(
        params={
            'application': 'binary',
            'verbose': -1,
            'min_data': 5,
        },
        train_set=dataset,
        num_boost_round=10,
    )
    return booster.predict(features)
def train_and_validate_lightgbm(params, train_features, train_labels,
                                validation_features, num_boost_round):
    """One-vs-rest LightGBM: train one booster per label column.

    Returns (validation predictions, shape (n_samples, n_classes), and a
    dict of per-class train/test timings).
    """
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)
    for class_i in tqdm(range(n_classes)):
        train_set = lgb.Dataset(train_features, train_labels[:, class_i],
                                free_raw_data=False)
        with Timer() as t:
            booster = lgb.train(params, train_set,
                                num_boost_round=num_boost_round)
        time_results['train_time'].append(t.interval)
        with Timer() as t:
            y_val_pred[:, class_i] = booster.predict(validation_features)
        time_results['test_time'].append(t.interval)
    return y_val_pred, time_results
def main_lgbm(fold_offset):
    """10-fold lambdarank training; fills the fold columns (offset by
    `fold_offset`) of the global test/predict validation-regression tables.

    NOTE(review): the categorical_feature list passed to lgb.train differs
    from the one given to the Datasets ([0,1,2,3,4]+6..20 vs
    [0,1,2,3,4,7]+8..22) — confirm which is intended.
    """
    for fold_id, (train_index, test_index) in enumerate(
            KFold(n_splits=10).split(all_races_train)):
        all_races_train_train = all_races_train[train_index]
        all_races_train_valid = all_races_train[test_index]
        # per-fold feature / query-group / target accumulators, filled
        # in place by get_race_gets
        all_races_rank_train_train = []
        all_races_query_train_train = []
        all_races_target_train_train = []
        all_races_rank_train_valid = []
        all_races_query_train_valid = []
        all_races_target_train_valid = []
        get_race_gets(all_races_train_train, all_races_rank_train_train,
                      all_races_query_train_train, all_races_target_train_train)
        get_race_gets(all_races_train_valid, all_races_rank_train_valid,
                      all_races_query_train_valid, all_races_target_train_valid)
        all_races_rank_train_train = np.array(all_races_rank_train_train)
        all_races_query_train_train = np.array(all_races_query_train_train)
        all_races_target_train_train = np.array(all_races_target_train_train)
        all_races_rank_train_valid = np.array(all_races_rank_train_valid)
        all_races_query_train_valid = np.array(all_races_query_train_valid)
        all_races_target_train_valid = np.array(all_races_target_train_valid)
        # NOTE(review): learning_rate 1e-8 is effectively zero — confirm
        lgbm_params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'lambdarank',
            'metric': 'ndcg',  # for lambdarank
            'ndcg_eval_at': [1, 2, 3],  # for lambdarank
            'max_position': max_position,  # for lambdarank
            'learning_rate': 1e-8,
            'min_data': 1,
            'min_data_in_bin': 1,
        }
        lgtrain = lgb.Dataset(all_races_rank_train_train,
                              all_races_target_train_train,
                              categorical_feature=[0, 1, 2, 3, 4, 7] + list(range(8, 23)),
                              group=all_races_query_train_train)
        lgvalid = lgb.Dataset(all_races_rank_train_valid,
                              all_races_target_train_valid,
                              categorical_feature=[0, 1, 2, 3, 4, 7] + list(range(8, 23)),
                              group=all_races_query_train_valid)
        lgb_clf = lgb.train(lgbm_params,
                            lgtrain,
                            categorical_feature=[0, 1, 2, 3, 4] + list(range(6, 21)),
                            num_boost_round=10,
                            valid_sets=[lgtrain, lgvalid],
                            valid_names=['train', 'valid'],
                            early_stopping_rounds=2,
                            verbose_eval=1)
        # store this fold's normalised test-set predictions
        if len(test_src) > 0:
            dst = norm_racedata(lgb_clf.predict(all_races_rank_test),
                                all_races_query_test)
            for dst_ind in range(len(dst)):
                test_validation_regression[dst_ind][fold_offset + fold_id] = dst[dst_ind]
        cur_pos = 0
        # store this fold's normalised predictions for the upcoming races
        if len(in_data) != 0 and len(in_meta) != 0:
            dst = norm_racedata(lgb_clf.predict(predict_races_target),
                                [len(predict_races_target)])
            for dst_ind in range(len(dst)):
                predict_validation_regression[dst_ind][fold_offset + fold_id] = dst[dst_ind]
def epoch_train(self, dataloader, run_num, is_multi_label=False, info=None,
                time_remain=None):
    """Run one training round of the LightGBM model manager.

    Behaviour branches on ``run_num``:
      * ``info['mode'] == 'bagging'`` — re-seed and jitter ``num_leaves``
        from ``info['lgb']``, then force ``run_num = 0``.
      * ``run_num == self.explore_params_round`` — subsample data
        (<=20000 rows, <=300 important columns), run Bayesian and
        early-stopping hyper-parameter search, and write the results
        back into ``info``.
      * ``run_num == self.ensemble_num`` — train one model (or one model
        per class when multi-label) per precomputed split into
        ``self.en_models`` and set ``self.ensemble_pred``.
      * otherwise — a normal training pass into ``self._model`` (or
        ``self.models[cls]`` per class when multi-label); on the
        "all data" rounds the full training index is used.

    Parameters
    ----------
    dataloader : dict with keys 'X', 'y', 'train_idxs', 'cat_cols',
        'splits' and 'all_train_idxs'.
    run_num : int round counter compared against the milestone rounds.
    is_multi_label : bool — one binary model per class when True.
    info : dict shared across models; read and mutated here.
    time_remain : unused in this method.
    """
    self.is_multi_label = is_multi_label
    X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader[
        'train_idxs'], dataloader['cat_cols']
    train_x, train_y = X.loc[train_idxs], y[train_idxs]
    if info['mode'] == 'bagging':
        # Bagging: copy tuned params, randomise seed and jitter num_leaves
        # by up to +/-10% so each bagged model differs.
        self.hyperparams = info['lgb'].copy()
        self.hyperparams['seed'] = np.random.randint(0, 2020)
        num_leaves = self.hyperparams['num_leaves']
        self.hyperparams['num_leaves'] += np.random.randint(
            -int(num_leaves / 10), int(num_leaves / 10))
        # NOTE(review): run_num is forced to 0 here, so bagging mode skips
        # every milestone branch below — confirm this is intended.
        run_num = 0
    if run_num == self.explore_params_round:
        print('lgb explore_params_round')
        train_x, train_y, val_x, val_y, = self.split_data(train_x, train_y)
        self.log_feat_importances()
        # Cap the search data at 300 important columns / 20000 rows
        # ("2w" = 20k) to keep hyper-parameter search cheap.
        if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 2w samples')
        elif train_x.shape[0] > 20000:
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 2w samples')
        elif train_x.shape[1] > 300:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
        print('shape: ', train_x.shape)
        self.bayes_opt(train_x, val_x, train_y, val_y, cat, phase=1)
        self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
        # Publish tuned hyper-parameters and important columns for reuse.
        info['lgb'] = self.hyperparams.copy()
        info['imp_cols'] = self.import_cols
    if run_num == self.ensemble_num:
        print('lgb ensemble_num')
        splits = dataloader['splits']
        for i in range(len(splits)):
            train_idxs, val_idxs = splits[i]
            train_x, train_y = X.loc[train_idxs], y[train_idxs]
            hyperparams = self.hyperparams.copy()
            # num_leaves = hyperparams['num_leaves']
            # num_leaves += np.random.randint(-int(num_leaves/10),
            # int(num_leaves/10))
            # hyperparams['num_leaves'] = num_leaves
            # log('model {} leaves {}'.format(i, num_leaves))
            if self.is_multi_label:
                # One binary booster per class for this split.
                self.en_models = defaultdict(list)
                for cls in range(self.num_class):
                    cls_y = train_y[:, cls]
                    lgb_train = lgb.Dataset(train_x, cls_y)
                    if not self.learning_rates:
                        self.en_models[i].append(
                            lgb.train({
                                **self.params,
                                **hyperparams
                            }, train_set=lgb_train))
                    else:
                        self.en_models[i].append(
                            lgb.train({
                                **self.params,
                                **hyperparams
                            },
                                      train_set=lgb_train,
                                      learning_rates=self.learning_rates))
            else:
                lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
                if not self.learning_rates:
                    self.en_models[i] = lgb.train(
                        {
                            **self.params,
                            **hyperparams
                        }, train_set=lgb_train)
                else:
                    self.en_models[i] = lgb.train(
                        {
                            **self.params,
                            **hyperparams
                        },
                        train_set=lgb_train,
                        learning_rates=self.learning_rates)
        self.ensemble_pred = True
    else:
        print('lgb norm train')
        train_x, train_y = X.loc[train_idxs], y[train_idxs]
        hyperparams = self.hyperparams.copy()
        log('hyperparams {}'.format(hyperparams))
        if run_num == self.all_data_round_pre or run_num == self.all_data_round:
            # Final rounds: refit on every available training row.
            print('lgb all data round')
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]
        print('shape: ', train_x.shape)
        if not is_multi_label:
            lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
            if not self.learning_rates:
                self._model = lgb.train({
                    **self.params,
                    **hyperparams
                }, train_set=lgb_train)
            else:
                self._model = lgb.train({
                    **self.params,
                    **hyperparams
                },
                                        train_set=lgb_train,
                                        learning_rates=self.learning_rates)
        else:
            # Multi-label: train an independent 2-class model per label.
            self.params['num_class'] = 2
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                lgb_train = lgb.Dataset(train_x, cls_y)
                if not self.learning_rates:
                    self.models[cls] = lgb.train(
                        {
                            **self.params,
                            **self.hyperparams
                        }, train_set=lgb_train)
                else:
                    self.models[cls] = lgb.train(
                        {
                            **self.params,
                            **self.hyperparams
                        },
                        train_set=lgb_train,
                        learning_rates=self.learning_rates)
        self.log_feat_importances()
        if self.imp_nums is not None:
            info['imp_nums'] = self.imp_nums
'data_random_seed': 1, 'bagging_fraction': 0.5, 'nthread': 4 } params2 = { 'learning_rate': 0.85, 'application': 'regression', 'max_depth': 3, 'num_leaves': 130, 'verbosity': -1, 'metric': 'RMSE', 'data_random_seed': 2, 'bagging_fraction': 1, 'nthread': 4 } model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \ early_stopping_rounds=1000, verbose_eval=1000) X_test = get_feature(test_x) predsL = model.predict(X_test) print('[{}] Predict lgb 1 completed.'.format(time.time() - start_time)) train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X, y, test_size=0.1, random_state=101) d_train2 = lgb.Dataset(train_X2, label=train_y2) d_valid2 = lgb.Dataset(valid_X2, label=valid_y2) watchlist2 = [d_train2, d_valid2] model = lgb.train(params2, train_set=d_train2, num_boost_round=6000, valid_sets=watchlist2, \ early_stopping_rounds=500, verbose_eval=500) predsL2 = model.predict(X_test) print('[{}] Predict lgb 2 completed.'.format(time.time() - start_time)) preds = (predsL * 0.5 + predsL2 * 0.5)
def main():
    """End-to-end price-prediction pipeline: load TSVs with stop-word
    stripping, engineer text/length/category features, vectorise text,
    train LightGBM over K folds and write ``submission.csv``.

    Relies on module-level helpers (``handle_missing``, ``handle_category``,
    ``handle_laberencoder``, ``handle_onehot``, the ``handle_*_len``
    functions) and constants (``NAME_MIN_DF``,
    ``MAX_FEATURES_ITEM_DESCRIPTION``, ``NFOLDS``).
    """
    start_time = time.time()
    # stop-word, can add any wording I want to replace
    stopwords = set([
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
        'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
        'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
        'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
        'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
        'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
        'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
        'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn',
        'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
        'weren', 'won', 'wouldn', '&', 'brand new', 'new', '[rm]',
        'free ship.*?', 'rm', 'price firm', 'no description yet'
    ])
    # One alternation pattern matching any stop word as a whole word,
    # applied while the TSVs are being parsed (converters below).
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    train = pd.read_csv('../input/train.tsv',
                        sep="\t",
                        encoding='utf-8',
                        converters={
                            'item_description':
                            lambda x: pattern.sub('', x.lower()),
                            'name': lambda x: pattern.sub('', x.lower())
                        })
    print("finished to load train file : {}".format(time.time() - start_time))
    test = pd.read_csv('../input/test.tsv',
                       sep="\t",
                       encoding='utf-8',
                       converters={
                           'item_description':
                           lambda x: pattern.sub('', x.lower()),
                           'name': lambda x: pattern.sub('', x.lower())
                       })
    print("finished to load test file : {}".format(time.time() - start_time))
    # Regression target is log1p(price); predictions are expm1'd at the end.
    train_label = np.log1p(train['price'])
    print("finished to log price : {}".format(time.time() - start_time))
    train_texts = train['name'].tolist()
    test_texts = test['name'].tolist()
    handle_missing(train)
    handle_missing(test)
    print("finished to handle missing : {}".format(time.time() - start_time))
    # Word-count and character-length features for name and description.
    handle_nm_word_len(train)
    handle_nm_word_len(test)
    handle_desc_word_len(train)
    handle_desc_word_len(test)
    handle_nm_len(train)
    handle_nm_len(test)
    handle_desc_len(train)
    handle_desc_len(test)
    print("finished to handle len : {}".format(time.time() - start_time))
    # print(train.describe())
    nrow_train = train.shape[0]
    handle_category(train)
    handle_category(test)
    print("finished to handle category : {}".format(time.time() - start_time))
    # Fit vectorisers on train+test together, then split back by row count.
    count = CountVectorizer(min_df=NAME_MIN_DF)
    X_name_mix = count.fit_transform(train['name'].append(test['name']))
    X_name = X_name_mix[:nrow_train]
    X_t_name = X_name_mix[nrow_train:]
    tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3),
                         stop_words='english')
    gc.collect()
    X_description_mix = tv.fit_transform(train['item_description'].append(
        test['item_description']))
    print("finished to handle tfidf : {}".format(time.time() - start_time))
    X_description = X_description_mix[:nrow_train]
    X_t_description = X_description_mix[nrow_train:]
    print("finished to handle description : {}".format(time.time() -
                                                       start_time))
    # handle label encoder
    cat_features = [
        'subcat_2', 'subcat_1', 'subcat_0', 'brand_name', 'category_name',
        'item_condition_id', 'shipping'
    ]
    handle_laberencoder(train, test, cat_features)
    X_cat, X_test_cat = handle_onehot(train, test, cat_features)
    # print(train.describe())
    print("finished to label encoder : {}".format(time.time() - start_time))
    # Stack dense length features with the sparse text/category matrices.
    train_feature = ['desc_word_len', 'nm_word_len', 'desc_len', 'nm_len']
    train_list = [train[train_feature].values, X_description, X_name, X_cat]
    test_list = [
        test[train_feature].values, X_t_description, X_t_name, X_test_cat
    ]
    X = ssp.hstack(train_list).tocsr()
    X_test = ssp.hstack(test_list).tocsr()
    print("finished to handle features : {}".format(time.time() - start_time))
    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=128)
    learning_rate = 0.8
    num_leaves = 128
    min_data_in_leaf = 1000
    feature_fraction = 0.5
    bagging_fraction = 0.9
    bagging_freq = 1000
    num_boost_round = 1000
    params = {
        "objective": "regression",
        "boosting_type": "gbdt",
        "learning_rate": learning_rate,
        "num_leaves": num_leaves,
        "feature_fraction": feature_fraction,
        "bagging_freq": bagging_freq,
        "bagging_fraction": bagging_fraction,
        "verbosity": 0,
        "metric": "l2_root",
        "nthread": 4,
        "subsample": 0.9
    }
    test_id = test['test_id']
    cv_pred = np.zeros(len(test_id))
    kf = kfold.split(X)
    # K-fold training; test predictions are averaged over folds.
    for i, (train_fold, test_fold) in enumerate(kf):
        X_train, X_validate, label_train, label_validate = \
            X[train_fold, :], X[test_fold, :], train_label[train_fold], train_label[test_fold]
        dtrain = lgbm.Dataset(X_train, label_train)
        print('dtrain time: {}'.format(time.time() - start_time))
        dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
        print('dvalid time: {}'.format(time.time() - start_time))
        bst = lgbm.train(params,
                         dtrain,
                         num_boost_round,
                         valid_sets=dvalid,
                         verbose_eval=100,
                         early_stopping_rounds=100)
        print('train time: {}'.format(time.time() - start_time))
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        print('predict time', time.time() - start_time)
        gc.collect()
    cv_pred /= NFOLDS
    # Undo the log1p transform applied to the target.
    cv_pred = np.expm1(cv_pred)
    submission = test[["test_id"]]
    submission["price"] = cv_pred
    submission.to_csv("./submission.csv", index=False)
    print('done', time.time() - start_time)
def done(istrain=True):
    """Train (``istrain=True``) or score (``istrain=False``) LightGBM models
    for each dataset id in ``[100, 799]``.

    Training path: runs CV parameter fitting, warm-starts from a previously
    saved booster when one exists, trains with early stopping, logs the
    offline log-loss and dumps the model plus a feature-importance CSV.
    Scoring path: loads each saved booster, predicts on the test set and
    writes ``1-<i>-lgbm.test.csv``.

    Relies on module-level names: ``cv_params``, ``FLAGS``,
    ``tiny_lightgbm_data_get_train``, ``tiny_lightgbm_data_get_test``,
    ``modelfit_cv``, ``dump``/``load`` (joblib), ``lgb``, ``log_loss``.
    """
    # op = ['num_trees','max_depth','max_bin','bagging_fraction','lambda']
    # cv_params['num_trees'] = 315
    cv_params['num_trees'] = 300
    # cv_params['num_leaves'] = 50
    # cv_params['max_depth'] = 6
    # op = ['max_bin','bagging_fraction','lambda']
    op = ['x']
    # Start training
    logging.debug('设置参数')
    if istrain:
        for i in [100, 799]:
            train_save, val_save, val_x, val_y = tiny_lightgbm_data_get_train(
                i)
            for oper in op:
                logging.debug("CV:" + oper)
                modelfit_cv(train_save, cv_type=oper)
                ret = dump(
                    cv_params,
                    FLAGS.out_data_path + 'cv_params_' + oper +
                    'lgbm.joblib_dat')
            logging.debug("开始训练")
            # Warm-start from a previously saved booster when one exists;
            # otherwise train from scratch.
            try:
                init_model = load(FLAGS.out_data_path + '1-' + str(i) +
                                  '-lgbm.model.joblib_dat')
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt. Missing/corrupt model files
                # still fall back to a cold start.
                init_model = None
            gbm = lgb.train(
                cv_params,  # parameter dict
                train_save,  # training set
                num_boost_round=1000,  # number of boosting iterations
                valid_sets=val_save,  # validation set
                init_model=init_model,
                # init_model=None,
                # learning_rates=0.01,
                verbose_eval=True,
                early_stopping_rounds=60)  # early-stopping patience
            logging.debug("to save validation predictions ...")
            ret = dump(
                gbm,
                FLAGS.out_data_path + '1-' + str(i) + '-lgbm.model.joblib_dat')
            logging.debug(ret)
            # Validation
            logging.debug("验证")
            preds_offline = gbm.predict(
                val_x, num_iteration=gbm.best_iteration)  # probabilities
            logging.debug('log_loss:')
            logging.debug(log_loss(val_y, preds_offline))
            # Feature selection / importance report
            df = pd.DataFrame(val_x.columns.tolist(), columns=['feature'])
            df['importance'] = list(gbm.feature_importance())  # scores
            df = df.sort_values(by='importance', ascending=False)  # rank
            df.to_csv(FLAGS.out_data_path + 'feature_score.csv',
                      index=None,
                      encoding='utf-8')  # persist scores
            del train_save, val_save, val_x, val_y
    else:
        for i in [100, 799]:
            gbm = load(FLAGS.out_data_path + '1-' + str(i) +
                       '-lgbm.model.joblib_dat')
            # logging.debug(gbm.get_params())
            # Offline prediction
            test_save = tiny_lightgbm_data_get_test()
            logging.debug("预测")
            dtrain_predprob = gbm.predict(
                test_save, num_iteration=gbm.best_iteration)  # probabilities
            logging.debug(dtrain_predprob)
            y_pred = [round(value, 4) for value in dtrain_predprob]
            logging.debug('-' * 30)
            y_pred = np.array(y_pred).reshape(-1, 1)
            logging.debug(y_pred.shape)
            test_id = pd.read_csv(FLAGS.test_id_path + 'test_id.csv')
            logging.debug(test_id['id'].shape)
            test_id['id'] = test_id['id'].map(int)
            test_id['click'] = y_pred
            test_id.to_csv(FLAGS.out_data_path + '1-' + str(i) +
                           '-lgbm.test.csv',
                           index=False)
            del test_save
'metric': {'binary_logloss'}, 'num_leaves': 63, 'num_trees': 100, 'learning_rate': 0.01, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'verbose': 0, } # number of leaves,will be used in feature transformation num_leaf = 63 print('Start training...') # train gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_train) print('Save model...') # save model to file gbm.save_model('model.txt') print('Start predicting...') # predict and get data on leaves, training data y_pred = gbm.predict(X_train, pred_leaf=True) # feature transformation and write result print('Writing transformed training data') transformed_training_matrix = np.zeros( [len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64) for i in range(0, len(y_pred)): temp = np.arange(len(y_pred[0])) * num_leaf - 1 + np.array(y_pred[i])
def train(x_train, y_train, x_valid, y_valid):
    """Grid-search LightGBM parameters against the held-out validation pair,
    then refit the best parameters on train+valid combined.

    Parameters
    ----------
    x_train, x_valid : pandas DataFrames with identical columns.
    y_train, y_valid : label arrays.

    Returns
    -------
    dict — the best parameter combination found (``min_params``).

    Side effects: writes ``feature_importances_0.csv`` under the module-level
    ``DIR`` and logs scores via the module-level ``logger``.
    """
    usecols = x_train.columns.values
    # NOTE(review): `cv` is built but never used — the search evaluates on
    # the single explicit (x_valid, y_valid) split below.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
    all_params = {
        'min_child_weight': [25],
        'subsample': [0.7],
        'subsample_freq': [1],
        'seed': [114],
        'colsample_bytree': [0.6],
        'learning_rate': [0.1],
        'max_depth': [-1],
        'min_split_gain': [0.001],
        'reg_alpha': [0.0001],
        'max_bin': [2047],
        'num_leaves': [127],
        'objective': ['binary'],
        'metric': [['binary_logloss', 'auc']],
        'scale_pos_weight': [1],
        'verbose': [-1],
    }
    # Selection criterion: index 0 of the (mean, min, max) score tuple,
    # i.e. the mean of the negated AUC (lower is better).
    use_score = 0
    min_score = (100, 100, 100)
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        all_pred = np.zeros(y_train.shape[0])
        # `if 1:` is the remnant of a former CV-fold loop — a single
        # train/valid evaluation per parameter combination.
        if 1:
            cnt += 1
            trn_x = x_train
            val_x = x_valid
            trn_y = y_train
            val_y = y_valid
            train_data = lgb.Dataset(
                trn_x.values.astype(np.float32),
                label=trn_y,
                categorical_feature=CAT_FEAT,
                feature_name=x_train.columns.values.tolist())
            test_data = lgb.Dataset(
                val_x.values.astype(np.float32),
                label=val_y,
                categorical_feature=CAT_FEAT,
                feature_name=x_train.columns.values.tolist())
            del trn_x
            gc.collect()
            clf = lgb.train(
                params,
                train_data,
                10000,  # params['n_estimators'],
                early_stopping_rounds=30,
                valid_sets=[test_data],
                # feval=cst_metric_xgb,
                # callbacks=[callback],
                verbose_eval=10)
            pred = clf.predict(val_x)
            # all_pred[test] = pred
            _score2 = log_loss(val_y, pred)
            _score = -roc_auc_score(val_y, pred)  # negated: minimised below
            logger.info('   _score: %s' % _score)
            logger.info('   _score2: %s' % _score2)
            list_score.append(_score)
            list_score2.append(_score2)
            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                # NOTE(review): 'n_estimators' is not a key of all_params,
                # so this branch would raise KeyError if ever reached —
                # confirm the intended fallback (10000?).
                list_best_iter.append(params['n_estimators'])
            gc.collect()
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))
        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
        imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
        imp['col'] = usecols
        n_features = imp.shape[0]
        imp = imp.sort_values('imp', ascending=False)
        imp.to_csv(DIR + 'feature_importances_0.csv')
        del val_x
        del trn_y
        del val_y
        del train_data
        del test_data
        gc.collect()
    # Refit on train+valid with ~10% extra rounds beyond the mean best
    # iteration found during the search.
    trees = np.mean(list_best_iter)
    x_train = pd.concat([x_train, x_valid], axis=0, ignore_index=True)
    y_train = np.r_[y_train, y_valid]
    del x_valid
    del y_valid
    gc.collect()
    train_data = lgb.Dataset(x_train.values.astype(np.float32),
                             label=y_train,
                             categorical_feature=CAT_FEAT,
                             feature_name=x_train.columns.values.tolist())
    del x_train
    gc.collect()
    clf = lgb.train(min_params,
                    train_data,
                    int(trees * 1.1),
                    valid_sets=[train_data],
                    verbose_eval=10)
    # del x_train
    gc.collect()
    return min_params
# Log the validation label balance, then train a single LightGBM model with
# early stopping. Expects t (total rows), t1 (positives), X_tr/Y_tr/X_val/
# Y_val, params, num_boost_round, early_stopping_rounds, verbose_eval and
# `since` (start timestamp) to be defined earlier in the script.
t0 = t - t1  # number of negative validation rows
print('val size:', t, 'number of 1:', t1, 'number of 0:', t0)
print('val: 1 in all:', t1/t, '0 in all:', t0/t, '1/0:', t1/t0)
print()
print()
train_set = lgb.Dataset(X_tr, Y_tr)
val_set = lgb.Dataset(X_val, Y_val)
# Free the raw arrays once wrapped in Datasets.
del X_tr, Y_tr, X_val, Y_val
print('Training...')
model = lgb.train(params,
                  train_set,
                  num_boost_round=num_boost_round,
                  early_stopping_rounds=early_stopping_rounds,
                  valid_sets=val_set,
                  verbose_eval=verbose_eval,
                  )
# best_score['valid_0']['auc'] exists because val_set is the only valid set.
print('best score:', model.best_score['valid_0']['auc'])
print('best iteration:', model.best_iteration)
print()
time_elapsed = time.time() - since
print('[timer]: complete in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))
# Train a binary LightGBM model (AUC metric, early stopping) and run the
# fairness evaluation helpers; then reweigh the original training data.
# Expects lgb_train, X_test/y_test, data_orig_train/test and the
# privileged/unprivileged group definitions from earlier in the script.
lgb_eval = lgb.Dataset(data=X_test,
                       label=y_test,
                       reference=lgb_train,
                       free_raw_data=False)
evals_result = {}
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}
# NOTE(review): `md` is created but never used afterwards in this span —
# confirm whether the XGBClassifier baseline is still needed.
md = XGBClassifier()
mdl = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=150,
                early_stopping_rounds=25,
                evals_result=evals_result)
plot_model_performance(mdl, X_test, y_test)
fair = get_fair_metrics_and_plot(data_orig_test, mdl)
### Reweighing
RW = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
data_transf_train = RW.fit_transform(data_orig_train)
# Train and save the model
# Train a multiclass LightGBM classifier with early stopping, save it, and
# report hold-out accuracy. Expects lgb_train, X_test/y_test and the
# tags2ids mapping (class-name -> id) from earlier in the script.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': len(tags2ids),
    'min_data_in_bin': 1,
    'min_data': 1,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 1,
    'verbose': 0
}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=lgb_eval,
                early_stopping_rounds=20)
gbm.save_model('../model/LGBPC_model.txt')
# yprob = gbm.predict(X_test).reshape(y_test.shape[0], len(tags2ids))
# predict() on a multiclass booster returns one probability per class;
# take the argmax as the predicted label.
yprob = gbm.predict(X_test)
ylabel = np.argmax(yprob, axis=1)
# Misclassification rate over the hold-out set.
error = sum(
    int(ylabel[i]) != y_test[i]
    for i in range(len(y_test))) / float(len(y_test))
acc = 1. - error
print('predicting, classification acc=%f' % (acc))
def fit_and_predict(self, X_train, X_test, y_train, groups):
    """Cross-validated LightGBM training with out-of-fold scoring.

    Chooses the splitter from ``self.cv`` ("mcs" | "group" | "stratified"),
    trains one booster per fold with early stopping, accumulates expm1'd
    test predictions averaged over folds and logs per-fold RMSLE plus mean
    feature importances.

    Returns
    -------
    (predictions, cv_score_fold_mean) : fold-averaged test predictions and
        the mean per-fold RMSLE.
    """
    if self.cv == "mcs":
        folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
    elif self.cv == "group":
        folds = GroupKFold(n_splits=10)
    elif self.cv == "stratified":
        folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        # Bin the continuous target into 7 quantile-ish buckets so a
        # stratified splitter can be used for regression.
        y_to_stratify = pd.cut(y_train["Global_Sales_log1p"],
                               bins=7,
                               labels=False)
    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    feature_importance_df = pd.DataFrame()
    fold_scores = []
    # NOTE(review): y_to_stratify is only bound in the "stratified" branch,
    # but the split below always uses it — the "mcs"/"group" paths would
    # raise NameError. Confirm intended usage.
    # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, groups=groups)):
    for fold, (train_idx, val_idx) in enumerate(
            folds.split(X_train, y_to_stratify)):
        self.logger.debug("-" * 100)
        self.logger.debug(f"Fold {fold+1}")
        train_data = lgb.Dataset(X_train.iloc[train_idx],
                                 label=y_train.iloc[train_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx],
                               label=y_train.iloc[val_idx])
        callbacks = [log_evaluation(self.logger, period=100)]
        clf = lgb.train(self.params,
                        train_data,
                        valid_sets=[train_data, val_data],
                        verbose_eval=100,
                        early_stopping_rounds=100,
                        callbacks=callbacks)  # , feval=eval_func)
        oof[val_idx] = clf.predict(X_train.iloc[val_idx].values,
                                   num_iteration=clf.best_iteration)
        # RMSLE on the original (expm1) scale for this fold.
        fold_score = mean_squared_log_error(
            np.expm1(y_train.iloc[val_idx].values),
            np.expm1(oof[val_idx]))**.5
        fold_scores.append(fold_score)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = X_train.columns.values
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type="gain")
        fold_importance_df["fold"] = fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        # Average the (expm1'd) test predictions over all folds.
        predictions += np.expm1(
            clf.predict(X_test,
                        num_iteration=clf.best_iteration)) / folds.n_splits
    _feature_importance_df = feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(by="importance",
                                             ascending=False)  # .head(50)
    self.logger.debug("##### feature importance #####")
    self.logger.debug(_feature_importance_df.head(50))
    cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
    self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")
    # # RETRAIN
    # # exp057 # # RETRAIN
    # k = 500
    # topk_features = _feature_importance_df.index[:k]
    # self.logger.debug(f"selected {len(topk_features)} features: {topk_features}")
    # oof = np.zeros(len(X_train))
    # predictions = np.zeros(len(X_test))
    # feature_importance_df = pd.DataFrame()
    # fold_scores = []
    # # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, groups=groups)):
    # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y_to_stratify)):
    #     self.logger.debug("-" * 100)
    #     self.logger.debug(f"Fold {fold+1}")
    #     train_data = lgb.Dataset(X_train.loc[train_idx, topk_features], label=y_train.iloc[train_idx])
    #     val_data = lgb.Dataset(X_train.loc[val_idx, topk_features], label=y_train.iloc[val_idx])
    #     callbacks = [log_evaluation(self.logger, period=100)]
    #     clf = lgb.train(self.params, train_data, valid_sets=[train_data, val_data], verbose_eval=100, early_stopping_rounds=100, callbacks=callbacks) #, feval=eval_func)
    #     oof[val_idx] = clf.predict(X_train.loc[val_idx, topk_features].values, num_iteration=clf.best_iteration)
    #     fold_score = mean_squared_log_error(np.expm1(y_train.iloc[val_idx].values), np.expm1(oof[val_idx])) ** .5
    #     fold_scores.append(fold_score)
    #     fold_importance_df = pd.DataFrame()
    #     fold_importance_df["feature"] = topk_features
    #     fold_importance_df["importance"] = clf.feature_importance(importance_type="gain")
    #     fold_importance_df["fold"] = fold + 1
    #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    #     predictions += np.expm1(clf.predict(X_test[topk_features], num_iteration=clf.best_iteration)) / folds.n_splits
    # feature_importance_df = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).head(50)
    # self.logger.debug("##### feature importance #####")
    # self.logger.debug(feature_importance_df)
    # cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
    # self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")
    return predictions, cv_score_fold_mean
'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'min_split_gain': 0.0970905919552776, 'min_child_weight': 9.42012323936088, } for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)): train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx] valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx] lgb_train = lgb.Dataset(train_x, label=train_y) lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train) gbm = lgb.train(params, lgb_train, num_boost_round=lgb_round[sex_age], valid_sets=[lgb_train, lgb_eval], verbose_eval=50) oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values]) train['sex_age_bin_prob_oof_' + str(sex_age)] = oof_preds #用全部的train来预测test lgb_train = lgb.Dataset(X, label=Y) gbm = lgb.train(params, lgb_train, num_boost_round=lgb_round[sex_age], valid_sets=lgb_train, verbose_eval=50)
# Repeated stratified-KFold LightGBM training: `loop_num` outer repetitions,
# `every_loop_num` folds each. Out-of-fold predictions go to train_preds
# (one column per outer loop); test predictions fill one test_preds column
# per (loop, fold). Expects train/test DataFrames, predictor column list,
# params, f1_error, check_f1, train_preds/test_preds/scores preallocated.
for loop in range(loop_num):
    # Old sklearn API: StratifiedKFold(labels, n_folds=...) is iterable.
    kf = StratifiedKFold(train.label, n_folds=every_loop_num,
                         shuffle=True, random_state=520)
    for i, (train_index, test_index) in enumerate(kf):
        print('第{}-{}次训练...'.format(loop, i))
        train_feat1 = train.iloc[train_index].copy()
        train_feat2 = train.iloc[test_index].copy()
        lgb_train1 = lgb.Dataset(train_feat1[predictor],
                                 train_feat1['label'])
        lgb_train2 = lgb.Dataset(train_feat2[predictor],
                                 train_feat2['label'])
        gbm = lgb.train(params,
                        lgb_train1,
                        num_boost_round=5000,
                        valid_sets=lgb_train2,
                        verbose_eval=100,
                        feval=f1_error,
                        early_stopping_rounds=300)
        feat_imp = pd.Series(gbm.feature_importance(),
                             index=predictor).sort_values(ascending=False)
        # Out-of-fold predictions for this repetition's column.
        lgb_pre = gbm.predict(train_feat2[predictor])
        train_preds[test_index, loop] = lgb_pre
        lgb_pre_test = gbm.predict(test[predictor])
        test_preds[:, i + loop * every_loop_num] = lgb_pre_test
    # Offline F1 for this repetition's full OOF column.
    print('线下train得分:    {}'.format(
        check_f1(train_preds[:, loop], train['label'])))
    scores[loop] = check_f1(train_preds[:, loop], train['label'])
#%%
############################################################################### # Data Location ############################################################################### Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfinal, YtrainData, train_size=0.90, test_size=0.1) localTrainData = lgb.Dataset( Xtrain, Ytrain, ) params = {'max_depth': 14, 'num_leaves': 2048} localModel = lgb.train(params, localTrainData, num_boost_round=500) joblib.dump(localModel, GlobalDirectory + 'lgbModel.pkl') localPrediction = localModel.predict(Xtest) currentPerformance = CurrentModelMetric(Ytest, localPrediction) ############################################################################### # Test ############################################################################### WeatherTestDataDir = GlobalDirectory + 'weather_test.csv' TestDataDir = GlobalDirectory + 'test.csv' WeatherTest = pd.read_csv(WeatherTestDataDir) TestData = pd.read_csv(TestDataDir)
dfTest = df_train['target_2015'].values #grupos = np.quantile(df_train["target_2015"], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]) fold = KFold(n_splits=10, shuffle = True, random_state = 1992) for train_index, test_index in fold.split(dfTrain): X_train, X_test = dfTrain.loc[train_index], dfTrain.loc[test_index] y_train, y_test = dfTest[train_index], dfTest[test_index] train_data = lgb.Dataset(X_train, label=y_train) test_data = lgb.Dataset(X_test, label=y_test) clf = lgb.train(params = params, early_stopping_rounds = 500, verbose_eval = 200, train_set = train_data, valid_sets = test_data) y_pred = clf.predict(X_test) print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred))) errlgb.append(np.sqrt(mean_squared_error(y_test, y_pred))) p = clf.predict(Xtest) y_pred_totlgb.append(p) # Predicciones promedio predichos = np.mean(y_pred_totlgb,0)
def objective(trial, X_train, y_train, params, class_weight_map):
    """Optuna objective: mean multiclass log-loss over repeated stratified
    K-fold LightGBM training with trial-suggested hyper-parameters.

    Parameters
    ----------
    trial : optuna Trial used for parameter suggestion and pruning.
    X_train, y_train : pandas objects (indexed with ``.iloc`` below).
    params : base LightGBM params dict; mutated in place with the
        trial's suggestions.
    class_weight_map : mapping label -> sample weight, applied via
        ``y.map`` on each fold's training labels.

    Returns
    -------
    float — mean validation log-loss across all folds/repeats.

    Side effects: increments the module-level ``exp_counter``, stores the
    last fold's booster as the trial's ``best_booster`` user attr, and
    calls the module-level ``timer`` helper.
    """
    # x_train, y_train: ndarray
    start_time = timer()
    global exp_counter
    exp_counter += 1
    param_update = {
        # api doc - https://lightgbm.readthedocs.io/en/latest/Parameters.html#max_depth
        'learning_rate': 0.06,  # trial.suggest_float('learning_rate', 1e-4, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 1, 127),  # default: -1 (no limit)
        'num_leaves': trial.suggest_int('num_leaves', 15, 255),  # default: 31. Total num of leaves in one tree.
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-7, 1.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-7, 1.0, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 1000)
        # 'num_leaves': trial.suggest_categorical('num_leaves', [31, 63, 127, 255]),  # default: 31
        # 'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),  # default: 0. lambda_l1.
        # 'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),  # default: 0. lambda_l2.
        # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),  # feature fraction.
        # 'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),  # min_data_in_leaf.
        # 'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        # NOTE definition - With subsample (or bagging_fraction) you can specify
        # the percentage of rows used per tree building iteration.
        # 'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        # https://lightgbm.readthedocs.io/en/latest/Parameters.html
        # 'max_bin': trial.suggest_int('max_bin', 128, 1024),  # default: 255. smaller more power to deal with overfitting
        # 'max_bin': trial.suggest_categorical('max_bin', [15, 31, 63, 127, 255]),  # default: 255. smaller more power to deal with overfitting
        # 'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),  # default: 100
        # 'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        # 'cat_l2': trial.suggest_int('cat_l2', 1, 20)  # L2 regularization in categorical split
    }
    params.update(param_update)
    losses = []
    # pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    # depends on the choice of eval_metric; "validation_0-logloss"
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                                   n_repeats=N_REPEATS,
                                   random_state=SEED)
    # Prune against the valid_0 multi_logloss reported during training.
    pruning_callback = optuna.integration.LightGBMPruningCallback(
        trial, "multi_logloss")
    for i, (train_index, valid_index) in enumerate(
            rskf.split(X_train, y_train)):
        print(f"{exp_counter} - {i}")
        X_A, X_B = X_train.iloc[train_index, :], X_train.iloc[valid_index, :]
        y_A, y_B = y_train.iloc[train_index], y_train.iloc[valid_index]
        # # It doesn't work.
        # smo_tek = SMOTETomek(random_state=0)
        # X_smotek, y_smotek = smo_tek.fit_resample(X_A, y_A)
        lgb_train = lgb.Dataset(
            X_A, y_A,
            weight=y_A.map(class_weight_map))  # https://tinyurl.com/yzdao9nr
        # lgb_train = lgb.Dataset(X_smotek, y_smotek)  # it doesn't work
        lgb_valid = lgb.Dataset(X_B, y_B, reference=lgb_train)
        lgbm_model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=['train', 'valid_0'],
            num_boost_round=10000,
            verbose_eval=False,  # https://tinyurl.com/yhdmtdm8
            early_stopping_rounds=20,
            callbacks=[pruning_callback])
        # lgbmClassifier = lgb.LGBMClassifier(**params)
        # lgbmClassifier.fit(
        #     X_A, y_A, eval_set=[(X_B, y_B)],
        #     early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        #     verbose=VERBOSE,
        #     callbacks=[pruning_callback])
        y_oof = lgbm_model.predict(
            X_B)  # not needed, num_iteration=lgbm_model.best_iteration
        losses.append(log_loss(y_B, y_oof))
    # Keep the last fold's booster so the best trial's model is recoverable.
    trial.set_user_attr(key="best_booster", value=lgbm_model)
    res = np.mean(losses)
    timer(start_time)
    return res
# Build and persist the per-store test grid (sales after END_TRAIN masked
# to NaN), then train and pickle one LightGBM model per store/state pair.
# Expects grid_df, preds_mask, END_TRAIN, processed_data_dir, store_id,
# state_id, SEED, lgb_params, train_data, valid_data, model_dir, VER and
# features_columns from earlier in the script.
grid_df = grid_df[preds_mask].reset_index(drop=True)
# Drop temporary helper columns before persisting.
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
d_sales = grid_df[['d', 'sales']]
substitute = d_sales['sales'].values
# Hide the future: sales beyond the training horizon become NaN.
substitute[(d_sales['d'] > END_TRAIN)] = np.nan
grid_df['sales'] = substitute
grid_df.to_pickle(processed_data_dir + 'test_' + store_id + '_' + state_id +
                  '.pkl')
del grid_df, d_sales, substitute
# Fix RNG state so each store's model is reproducible.
seed_everything(SEED)
estimator = lgb.train(lgb_params,
                      train_data,
                      valid_sets=[valid_data],
                      verbose_eval=100)
# display(pd.DataFrame({'name':estimator.feature_name(),
#                       'imp':estimator.feature_importance()}).sort_values('imp',ascending=False).head(25))
model_name = model_dir + 'lgb_model_' + store_id + '_' + state_id + '_v' + str(
    VER) + '.bin'
# NOTE(review): the file handle from open() is never closed explicitly —
# consider a `with open(...)` block.
pickle.dump(estimator, open(model_name, 'wb'))
del train_data, valid_data, estimator
gc.collect()
MODEL_FEATURES = features_columns
'num_class': 4, 'boosting': 'gbdt', 'metric': 'multi_logloss' } losses = [] for i, (train_index, valid_index) in enumerate(rskf.split(Xtrn, y)): X_A, X_B = Xtrn.iloc[train_index, :], Xtrn.iloc[valid_index, :] y_A, y_B = y.iloc[train_index], y.iloc[valid_index] lgb_train = lgb.Dataset(X_A, y_A) lgb_valid = lgb.Dataset(X_B, y_B, reference=lgb_train) lgbm_model = lgb.train( params, lgb_train, valid_sets=[lgb_train, lgb_valid], valid_names=['train', 'valid_0'], num_boost_round=10000, verbose_eval = 50, # https://tinyurl.com/yhdmtdm8 early_stopping_rounds=20, # callbacks=[pruning_callback] ) y_oof = lgbm_model.predict(X_B) # not needed, num_iteration=lgbm_model.best_iteration losses.append(log_loss(y_B, y_oof)) print(np.mean(losses)) # %% score = np.mean(losses) model_pickle = save_trained_classifier(best_model, 'lgbm_8f5r_integration_vallina', score, save_directory) model_pickle = '/kaggle/working/may_model/202105271625_lgbm_1min' lgbm_pickle = pickle.load(open(model_pickle, 'rb')) lgbm_pickle.predict(Xtst)
params = { 'objective': 'multiclass', 'num_class': 4, 'metric': 'None', 'max_bin': 50, 'num_leaves': 20, 'lambda_l2': 0.1, 'verbose': -1, 'seed': seed } # train model = lgb.train(params, train_data, valid_sets=[train_data, valid_data], num_boost_round=num_boost_round, early_stopping_rounds=early_stopping_rounds, verbose_eval=10, feval=metric_f1) # evaluate y_val_pred = np.argmax(model.predict(X_valid), axis=1) score = f1_score(y_valid, y_val_pred, average='macro') scores.append(score) print(f"\nFold-{i+1}: Score: {score:.4f}\n") # predict test y_test_pred += model.predict( X_test, num_iteration=model.best_iteration) / n_folds # evaluate
def __call__(self, trial):
    # type: (Trial) -> float
    """Objective for one Optuna trial.

    Samples only the hyperparameters this tuning step targets
    (``self.target_param_names``), trains a LightGBM booster with the
    resulting ``self.lgbm_params``, records timing/score bookkeeping on
    the trial, and returns the validation score.
    """
    pbar_fmt = "{}, val_score: {:.6f}"

    if self.pbar is not None:
        self.pbar.set_description(pbar_fmt.format(self.step_name, self.best_score))

    if "lambda_l1" in self.target_param_names:
        self.lgbm_params["lambda_l1"] = trial.suggest_loguniform("lambda_l1", 1e-8, 10.0)
    if "lambda_l2" in self.target_param_names:
        self.lgbm_params["lambda_l2"] = trial.suggest_loguniform("lambda_l2", 1e-8, 10.0)
    if "num_leaves" in self.target_param_names:
        # Cap num_leaves at 2**max_depth (a depth-d tree has at most 2**d leaves).
        tree_depth = self.lgbm_params.get("max_depth", DEFAULT_TUNER_TREE_DEPTH)
        max_num_leaves = 2 ** tree_depth if tree_depth > 0 else 2 ** DEFAULT_TUNER_TREE_DEPTH
        self.lgbm_params["num_leaves"] = trial.suggest_int("num_leaves", 2, max_num_leaves)
    if "feature_fraction" in self.target_param_names:
        # `GridSampler` is used for sampling feature_fraction value.
        # The value 1.0 for the hyperparameter is always sampled; the
        # `+ EPS` / `min(..., 1.0)` pair keeps 1.0 reachable but never exceeded.
        param_value = min(trial.suggest_uniform("feature_fraction", 0.4, 1.0 + EPS), 1.0)
        self.lgbm_params["feature_fraction"] = param_value
    if "bagging_fraction" in self.target_param_names:
        # `TPESampler` is used for sampling bagging_fraction value.
        # The value 1.0 for the hyperparameter may be sampled.
        param_value = min(trial.suggest_uniform("bagging_fraction", 0.4, 1.0 + EPS), 1.0)
        self.lgbm_params["bagging_fraction"] = param_value
    if "bagging_freq" in self.target_param_names:
        self.lgbm_params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
    if "min_child_samples" in self.target_param_names:
        # `GridSampler` is used for sampling min_child_samples value.
        # `+ EPS` keeps the upper bound 100 reachable after int() truncation.
        param_value = int(trial.suggest_uniform("min_child_samples", 5, 100 + EPS))
        self.lgbm_params["min_child_samples"] = param_value

    # Train and score this trial's booster, tracking wall-clock cost.
    start_time = time.time()
    booster = lgb.train(self.lgbm_params, self.train_set, **self.lgbm_kwargs)
    val_score = self._get_booster_best_score(booster)
    elapsed_secs = time.time() - start_time
    average_iteration_time = elapsed_secs / booster.current_iteration()

    # Optionally persist every trial's booster to disk.
    if self.model_dir is not None:
        path = os.path.join(self.model_dir, "{}.pkl".format(trial.number))
        with open(path, "wb") as fout:
            pickle.dump(booster, fout)
        _logger.info("The booster of trial#{} was saved as {}.".format(trial.number, path))

    # Keep the best booster seen so far (comparison is direction-aware).
    if self.compare_validation_metrics(val_score, self.best_score):
        self.best_score = val_score
        self.best_booster_with_trial_number = (booster, trial.number)

    if self.pbar is not None:
        self.pbar.set_description(pbar_fmt.format(self.step_name, self.best_score))
        self.pbar.update(1)

    self.report.append(
        dict(
            # Since v1.2.0, action was concatenation of parameter names. Currently, it is
            # explicitly given to distinguish steps which tune the same parameters.
            action=self.step_name,
            trial=self.trial_count,
            value=str(trial.params),
            val_score=val_score,
            elapsed_secs=elapsed_secs,
            average_iteration_time=average_iteration_time,
        )
    )

    # Expose timing/step metadata on the trial for later inspection.
    trial.set_system_attr(_ELAPSED_SECS_KEY, elapsed_secs)
    trial.set_system_attr(_AVERAGE_ITERATION_TIME_KEY, average_iteration_time)
    trial.set_system_attr(_STEP_NAME_KEY, self.step_name)
    trial.set_system_attr(_LGBM_PARAMS_KEY, json.dumps(self.lgbm_params))

    self.trial_count += 1

    return val_score
# 10-fold CV: out-of-fold predictions on train, per-fold predictions on test.
scores = []
t0 = time.time()
train_preds = np.zeros(train.shape[0])
test_preds = np.zeros((test.shape[0], 10))
# feat_imp = pd.DataFrame()
# NOTE(review): old scikit-learn KFold API (positional n, n_folds) — pre-0.18.
kf = KFold(len(train), n_folds=10, shuffle=True, random_state=1024)
for i, (train_index, test_index) in enumerate(kf):
    # print('Training round {}...'.format(i))
    train_feat1 = train.iloc[train_index]
    train_feat2 = train.iloc[test_index]
    # Target column is '血糖' (blood glucose).
    lgb_train1 = lgb.Dataset(train_feat1[predictors], train_feat1['血糖'])
    lgb_train2 = lgb.Dataset(train_feat2[predictors], train_feat2['血糖'])
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=False,
                    feval=evalerror,
                    early_stopping_rounds=50)
    # Per-fold feature importance, sorted descending (currently unused).
    feat_i = pd.DataFrame(
        pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False))
    # feat_imp = pd.concat([feat_imp, feat_i],axis=1)
    # Out-of-fold predictions; '+=' is equivalent to '=' here since each
    # test_index appears in exactly one fold.
    train_preds[test_index] += gbm.predict(
        train_feat2[predictors], num_iteration=gbm.best_iteration)
    test_preds[:, i] = gbm.predict(test[predictors], num_iteration=gbm.best_iteration)
# print(feat_imp)
# feat_imp.to_csv("./feature_imp.csv",header=False)
print('线下得分:{0}, min_data: {1}'.format(
    (mean_squared_error(train['血糖'], train_preds) * 0.5),
# Single train/validation run with AUC as the LightGBM eval metric.
y_valid = Y_TEST
early_stopping_rounds = 20
num_boost_round = 3500
metric = 'auc'
params['metric'] = metric

#========================================================================
# Fitting
#========================================================================
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)
with timer(" * Train & Validation"):
    estimator = lgb.train(params=params,
                          train_set=lgb_train,
                          valid_sets=lgb_valid,
                          early_stopping_rounds=early_stopping_rounds,
                          num_boost_round=num_boost_round,
                          verbose_eval=200)
    best_iter = estimator.best_iteration

# Score the hold-out set and stringify the AUC for use in file names
# ('.' replaced to stay filesystem-safe).
oof_pred = estimator.predict(x_valid)
score = roc_auc_score(y_valid, oof_pred)
cvs = str(score).replace('.', '-')

# Feature importance, flagged with whether each feature is a validation feature.
feim = get_tree_importance(estimator=estimator, use_cols=x_train.columns)
feim.sort_values(by='importance', ascending=False, inplace=True)
feim['is_valid'] = feim['feature'].map(valid_map)

#========================================================================
# PostProcess
#========================================================================
"lambda_l1": 0.1, "verbosity": -1 } folds = KFold(n_splits=5, shuffle=True, random_state=2018) oof_lgb = np.zeros(len(train)) predictions_lgb = np.zeros(len(test)) for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)): print("fold n°{}".format(fold_ + 1)) trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx]) val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx]) num_round = 10000 clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=200, early_stopping_rounds=100) oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration) predictions_lgb += clf.predict( X_test, num_iteration=clf.best_iteration) / folds.n_splits print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target))) ##### xgb xgb_params = { 'eta': 0.005, 'max_depth': 11, 'subsample': 0.85,
def fit(self,
        X_train=None,
        Y_train=None,
        X_test=None,
        Y_test=None,
        dataset_train=None,
        dataset_val=None,
        time_limit=None,
        **kwargs):
    """Train the LightGBM booster behind this model wrapper.

    Either raw arrays (X_train/Y_train, optional X_test/Y_test) or
    pre-built lgb.Dataset objects may be supplied; generate_datasets
    reconciles them. On completion, self.model holds the booster and
    self.params_trained records the effective number of boosting rounds.

    kwargs of interest: 'verbosity' (0..4, controls eval logging) and
    'reporter' (forwarded to the early-stopping callback).
    """
    start_time = time.time()
    params = self.params.copy()
    # TODO: kwargs can have num_cpu, num_gpu. Currently these are ignored.
    verbosity = kwargs.get('verbosity', 2)
    params = fixedvals_from_searchspaces(params)

    # Map verbosity level to LightGBM's eval-log frequency.
    if verbosity <= 1:
        verbose_eval = False
    elif verbosity == 2:
        verbose_eval = 1000
    elif verbosity == 3:
        verbose_eval = 50
    else:
        verbose_eval = 1

    eval_metric = self.get_eval_metric()
    dataset_train, dataset_val = self.generate_datasets(
        X_train=X_train, Y_train=Y_train, params=params, X_test=X_test, Y_test=Y_test,
        dataset_train=dataset_train, dataset_val=dataset_val)
    gc.collect()

    num_boost_round = params.pop('num_boost_round', 1000)
    logger.log(
        15, 'Training Gradient Boosting Model for %s rounds...' % num_boost_round)
    logger.log(15, "with the following hyperparameter settings:")
    logger.log(15, params)

    # Clamp min_data_in_leaf if it exceeds the number of training rows.
    num_rows_train = len(dataset_train.data)
    if 'min_data_in_leaf' in params:
        if params['min_data_in_leaf'] > num_rows_train:  # TODO: may not be necessary
            params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0))

    # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
    # Scale the early-stopping patience down for large datasets (floor of 10).
    if (dataset_val is not None) and (dataset_train is not None):
        if num_rows_train <= 10000:
            modifier = 1
        else:
            modifier = 10000 / num_rows_train
        early_stopping_rounds = max(round(modifier * 150), 10)
    else:
        early_stopping_rounds = 150

    callbacks = []
    valid_names = ['train_set']
    valid_sets = [dataset_train]
    if dataset_val is not None:
        reporter = kwargs.get('reporter', None)
        if reporter is not None:
            train_loss_name = self._get_train_loss_name()
        else:
            train_loss_name = None
        # Custom early stopping watches the validation set's eval metric.
        callbacks += [
            early_stopping_custom(early_stopping_rounds,
                                  metrics_to_use=[('valid_set', self.eval_metric_name)],
                                  max_diff=None,
                                  start_time=start_time,
                                  time_limit=time_limit,
                                  ignore_dart_warning=True,
                                  verbose=False,
                                  manual_stop_file=False,
                                  reporter=reporter,
                                  train_loss_name=train_loss_name),
        ]
        # Validation set goes first so its metric is reported first.
        valid_names = ['valid_set'] + valid_names
        valid_sets = [dataset_val] + valid_sets

    seed_val = params.pop('seed_value', 0)
    train_params = {
        'params': params,
        'train_set': dataset_train,
        'num_boost_round': num_boost_round,
        'valid_sets': valid_sets,
        'valid_names': valid_names,
        'callbacks': callbacks,
        'verbose_eval': verbose_eval,
    }
    # A non-string eval metric is a custom callable -> pass as feval.
    if not isinstance(eval_metric, str):
        train_params['feval'] = eval_metric
    if seed_val is not None:
        train_params['params']['seed'] = seed_val
        random.seed(seed_val)
        np.random.seed(seed_val)

    # Train LightGBM model:
    try_import_lightgbm()
    import lightgbm as lgb
    self.model = lgb.train(**train_params)
    self.params_trained['num_boost_round'] = self.model.best_iteration
# One iteration of a hyperparameter search: train with the sampled
# settings `p`, score on the hold-out set, and keep the best model so far.
params = {
    'learning_rate': p['lgb_lr'],  # caution: params dict is modified by lgb
    'application': 'regression',
    'max_depth': p['lgb_max_depth'],
    'num_leaves': p['lgb_num_leaves'],
    'verbosity': -1,
    'metric': 'RMSE',
}

print("Fitting boosted trees")
model_gb = lgb.train(params,
                     train_set=d_train,
                     num_boost_round=p['lgb_num_trees'],
                     valid_sets=watchlist,
                     early_stopping_rounds=50,
                     verbose_eval=0,
                     callbacks=[])

print("Evaluating model")
preds_gb = np.array(model_gb.predict(X_valid))
preds = preds_gb
# RMSLE: square root of the mean squared log error.
score = mean_squared_log_error(y_valid, preds)**0.5

# Track the incumbent best score / params / model across iterations.
if score < best_scores[0]:
    best_scores = (score, )
    best_params = p
    best_model_gb = model_gb

print('Best score: {}'.format(best_scores[0]))
#"num_leaves": 10 } # In[ ]: # model = lgb.train( # params, lgb_train, # #valid_sets=[lgb_train], # verbose_eval=1, # num_boost_round=1, # early_stopping_rounds=8, # ) # In[ ]: lgb.train(params, lgb.Dataset(pd.DataFrame({'x': [1]}), [1], categorical_feature=None)) # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: pd.set_option('display.max_columns', 100) print(train[train.user_id == 4421282].sort_values('timestamp').iloc[0:100])
# CV loop: fill out-of-fold predictions and accumulate per-fold gain
# importances into one long DataFrame.
feature_importance_df = pd.DataFrame()
for fold, (train_idx, val_idx) in enumerate(folds.split(train)):
    print(f"Fold {fold+1}")
    train_data = lgb.Dataset(train.iloc[train_idx][use_cols],
                             label=log_target[train_idx],
                             categorical_feature=categorical_cols)
    val_data = lgb.Dataset(train.iloc[val_idx][use_cols],
                           label=log_target[val_idx],
                           categorical_feature=categorical_cols)
    num_round = N_ROUNDS
    callbacks = [log_evaluation(logger, period=100)]
    clf = lgb.train(params,
                    train_data,
                    num_round,
                    valid_sets=[train_data, val_data],
                    verbose_eval=False,
                    early_stopping_rounds=100,
                    callbacks=callbacks)
    # NOTE(review): predicts on raw .values (no column names) while training
    # used a DataFrame — relies on column order matching use_cols; confirm.
    oof[val_idx] = clf.predict(train[use_cols].values[val_idx],
                               num_iteration=clf.best_iteration)
    # One importance row per feature for this fold, tagged with fold number.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = use_cols
    fold_importance_df["importance"] = clf.feature_importance(importance_type="gain")
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)

feature_importance_df = feature_importance_df[[
    "feature", "importance"
# Body of one CV fold: train, record log-loss, fill OOF and test predictions.
print('Fold:', fold_n + 1)
# Silence pandas SettingWithCopyWarning for the .loc writes below.
pd.options.mode.chained_assignment = None
result_table['fold'].loc[fold_n] = fold_n + 1

X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
y_valid_coll[valid_index] = y_valid

dtrain = lgb.Dataset(X_train, label=y_train)
dvalid = lgb.Dataset(X_valid, label=y_valid)
clf = lgb.train(params, dtrain, 10000, valid_sets=[dtrain, dvalid], verbose_eval=4)

feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

y_pred_valid = clf.predict(X_valid)
print(round((log_loss(y_valid, y_pred_valid)), 2))
result_table['log_loss'].loc[fold_n] = log_loss(y_valid, y_pred_valid)
y_oof[valid_index] = y_pred_valid
# Average test predictions over all folds.
y_preds += clf.predict(test_df) / NFOLDS

# Free the fold's slices before the next iteration.
del X_train, X_valid, y_train, y_valid
# Submission skeleton: one row per test id, 'target' filled in below.
sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

# lgb
params = {'metric': 'auc', 'learning_rate': 0.01, 'max_depth': 10, 'max_bin': 10,
          'objective': 'binary', 'feature_fraction': 0.8, 'bagging_fraction': 0.9,
          'bagging_freq': 10, 'min_data': 500}

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    # NOTE(review): the 4th positional argument is valid_sets; a single
    # Dataset (not a list) appears to be accepted by lgb.train — confirm
    # against the installed LightGBM version.
    lgb_model = lgb.train(params,
                          lgb.Dataset(X_train, label=y_train),
                          2000,
                          lgb.Dataset(X_eval, label=y_eval),
                          verbose_eval=100,
                          feval=gini_lgb,
                          early_stopping_rounds=100)
    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Average each fold's test predictions into the submission column.
    sub['target'] += lgb_model.predict(
        test.values, num_iteration=lgb_model.best_iteration) / (kfold)
    gc.collect()

#for i, (train_index, test_index) in enumerate(skf.split(X, y)):
#    print('[Fold %d/%d]' % (i + 1, kfold))
#    X_train, X_valid = X[train_index], X[test_index]
#    y_train, y_valid = y[train_index], y[test_index]
#    # Convert our data into XGBoost format
#    d_train = xgb.DMatrix(X_train, y_train)
#    d_valid = xgb.DMatrix(X_valid, y_valid)
#    d_test = xgb.DMatrix(test.values)
#    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
params['learning_rate'] = 0.0021 # shrinkage_rate params['boosting_type'] = 'gbdt' params['objective'] = 'regression' params['metric'] = 'l1' # or 'mae' params['sub_feature'] = 0.345 params['bagging_fraction'] = 0.85 # sub_row params['bagging_freq'] = 40 params['num_leaves'] = 512 # num_leaf params['min_data'] = 500 # min_data_in_leaf params['min_hessian'] = 0.05 # min_sum_hessian_in_leaf params['verbose'] = 0 params['feature_fraction_seed'] = 2 params['bagging_seed'] = 3 print("\nFitting LightGBM model ...") clf = lgb.train(params, d_train, 430) del d_train; gc.collect() del x_train; gc.collect() print("\nPrepare for LightGBM prediction ...") print(" Read sample file ...") sample = pd.read_csv('../data/sample_submission.csv') print(" ...") sample['parcelid'] = sample['ParcelId'] print(" Merge with property data ...") df_test = sample.merge(prop, on='parcelid', how='left') print(" ...") del sample, prop; gc.collect() print(" ...") #df_test['Ratio_1'] = df_test['taxvaluedollarcnt']/df_test['taxamount']