def test_cv(self):
    lgb_train, _ = template.test_template(return_data=True)
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
           shuffle=False, metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
           shuffle=True, metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    tss = TimeSeriesSplit(3)
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20,
           data_splitter=tss,
           nfold=5,  # test if wrong nfold is ignored
           metrics='l2', verbose_eval=False)
def get_predict_w(model, data, label='label', feature=[], cate_feature=[],
                  random_state=2018, n_splits=5, model_type='lgb'):
    if 'sample_weight' not in data.keys():
        data['sample_weight'] = 1
    model.random_state = random_state
    predict_label = 'predict_' + label
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    data[predict_label] = 0
    test_index = (data[label].isnull()) | (data[label] == -1)  # rows whose label must be predicted
    train_data = data[~test_index].reset_index(drop=True)  # split into training and prediction sets
    test_data = data[test_index]
    for train_idx, val_idx in kfold.split(train_data):
        model.random_state = model.random_state + 1
        train_x = train_data.loc[train_idx][feature]
        train_y = train_data.loc[train_idx][label]
        test_x = train_data.loc[val_idx][feature]
        test_y = train_data.loc[val_idx][label]
        if model_type == 'lgb':
            try:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)],
                          early_stopping_rounds=400, eval_metric='mae',
                          callbacks=[lgb.reset_parameter(
                              learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
            except Exception:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)],
                          early_stopping_rounds=200, eval_metric='mae',
                          callbacks=[lgb.reset_parameter(
                              learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
        elif model_type == 'ctb':
            model.fit(train_x, train_y, eval_set=[(test_x, test_y)],
                      early_stopping_rounds=200,
                      # eval_metric='mae',
                      # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                      cat_features=cate_feature,
                      sample_weight=train_data.loc[train_idx]['sample_weight'],
                      verbose=100)
        train_data.loc[val_idx, predict_label] = model.predict(test_x)
        if len(test_data) != 0:  # accumulate per-fold predictions on the prediction set
            test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])
    test_data[predict_label] = test_data[predict_label] / n_splits
    return pd.concat([train_data, test_data], sort=True, ignore_index=True), predict_label
def test_lightgbm_ranking():
    try:
        import lightgbm
    except ImportError:
        print("Skipping test_lightgbm_ranking!")
        return
    import shap
    import numpy as np

    # train a LightGBM ranker model
    x_train, y_train, x_test, y_test, q_train, q_test = shap.datasets.rank()
    model = lightgbm.LGBMRanker()
    model.fit(x_train, y_train, group=q_train,
              eval_set=[(x_test, y_test)], eval_group=[q_test], eval_at=[1, 3],
              early_stopping_rounds=5, verbose=False,
              callbacks=[lightgbm.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
    _validate_shap_values(model, x_test)
def test_lambdarank(self):
    X_train, y_train = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train'))
    X_test, y_test = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.test'))
    q_train = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train.query'))
    q_test = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.test.query'))
    gbm = lgb.LGBMRanker()
    gbm.fit(X_train, y_train, group=q_train,
            eval_set=[(X_test, y_test)], eval_group=[q_test], eval_at=[1, 3],
            early_stopping_rounds=10, verbose=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
    self.assertLessEqual(gbm.best_iteration_, 25)
    self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6333)
    self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6048)
def test_lambdarank(self):
    X_train, y_train = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train'))
    X_test, y_test = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.test'))
    q_train = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train.query'))
    q_test = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.test.query'))
    gbm = lgb.LGBMRanker()
    gbm.fit(X_train, y_train, group=q_train,
            eval_set=[(X_test, y_test)], eval_group=[q_test], eval_at=[1, 3],
            early_stopping_rounds=5, verbose=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
def fit_lgb(x_tr, y_tr, x_va, y_va, cat_feats, args):
    from lightgbm import Dataset
    if args.clip_target != -1:
        y_tr = y_tr.clip(upper=args.clip_target)
    tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False)
    if args.mode not in ['full', 'fold']:
        va_ds = Dataset(x_va, label=y_va, free_raw_data=False)
        valid_sets = [tr_ds, va_ds]
    else:
        valid_sets = [tr_ds]
    params = {
        'learning_rate': 0.02,
        'max_depth': -1,
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'is_training_metric': True,
        'num_leaves': args.num_leaves,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'lambda_l2': 0.7,
        'bagging_freq': 5,
        'seed': 42
    }
    kwargs = {
        'train_set': tr_ds,
        'categorical_feature': cat_feats,
        'verbose_eval': args.verbose_eval,
        'num_boost_round': args.num_boost_round,
    }
    if args.mode not in ['full', 'fold']:
        kwargs['early_stopping_rounds'] = 200
        kwargs['valid_sets'] = valid_sets
    if args.lr_decay:
        kwargs['callbacks'] = [
            lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)
        ]
    m = lgb.train(params, **kwargs)
    tr_pred = np.clip(m.predict(tr_ds.data), 0, 361)
    tr_score = np.sqrt(mean_squared_error(tr_pred, tr_ds.label))
    if args.mode not in ['full', 'fold']:
        va_pred = np.clip(m.predict(va_ds.data), 0, 361)
        va_score = np.sqrt(mean_squared_error(va_pred, va_ds.label))
    else:
        va_score = 0.
    return m, tr_score, va_score
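# `learning_rate_010_decay_power_0995` above is not defined in this snippet.
# A minimal sketch of what such a helper presumably looks like, inferred from
# its name (base rate 0.1, multiplicative decay of .995 per round); the floor
# value is an assumption:
def learning_rate_010_decay_power_0995(current_iter):
    lr = 0.1 * (0.995 ** current_iter)
    return lr if lr > 1e-3 else 1e-3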
def test_lambdarank(self):
    X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                       '../../examples/lambdarank/rank.train'))
    X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                     '../../examples/lambdarank/rank.test'))
    q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                      '../../examples/lambdarank/rank.train.query'))
    q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                     '../../examples/lambdarank/rank.test.query'))
    gbm = lgb.LGBMRanker()
    gbm.fit(X_train, y_train, group=q_train,
            eval_set=[(X_test, y_test)], eval_group=[q_test], eval_at=[1, 3],
            early_stopping_rounds=5, verbose=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
def test_cv(self):
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    # shuffle = False, override metric in params
    params_with_metric = {'metric': 'l2', 'verbose': -1}
    lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, shuffle=False, metrics='l1', verbose_eval=False)
    # shuffle = True, callbacks
    lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False,
           shuffle=True, metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    # self-defined folds
    tss = TimeSeriesSplit(3)
    folds = tss.split(X_train)
    lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds,
           stratified=False, verbose_eval=False)
    # lambdarank
    X_train, y_train = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train'))
    q_train = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train.query'))
    params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, metrics='l2', verbose_eval=False)
def test_cv(self):
    lgb_train, _ = template.test_template(return_data=True)
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
           metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
def test_reset_parameter_callback_is_picklable(serializer):
    params = {
        'bagging_fraction': [0.7] * 5 + [0.6] * 5,
        'feature_fraction': reset_feature_fraction
    }
    callback = lgb.reset_parameter(**params)
    callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
    assert callback_from_disk.order == 10
    assert callback_from_disk.before_iteration is True
    assert callback.kwargs == callback_from_disk.kwargs
    assert callback.kwargs == params
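# The test above assumes two helpers from its surrounding test suite that are
# not shown here. Hypothetical minimal stand-ins (the real `serializer`
# fixture presumably dispatches over several pickling backends; this sketch
# ignores it and round-trips with stdlib pickle):
import pickle

def reset_feature_fraction(boosting_round):
    # hypothetical schedule: tighten feature_fraction after round 15
    return 0.6 if boosting_round < 15 else 0.8

def pickle_and_unpickle_object(obj, serializer):
    return pickle.loads(pickle.dumps(obj))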
def test_lambdarank(self):
    X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
    X_test, y_test = load_svmlight_file('../../examples/lambdarank/rank.test')
    q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
    q_test = np.loadtxt('../../examples/lambdarank/rank.test.query')
    lgb_model = lgb.LGBMRanker().fit(X_train, y_train,
                                     group=q_train,
                                     eval_set=[(X_test, y_test)],
                                     eval_group=[q_test],
                                     eval_at=[1],
                                     verbose=False,
                                     callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
def lightgbm_fitparams(**kwargs):
    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": 'auc',
        "eval_set": [(None, None)],
        'eval_names': ['valid'],
        'callbacks': [lgbm.reset_parameter(learning_rate=lr_decayp())],
        'verbose': 100,
        'categorical_feature': 'auto'
    }
    for key, value in kwargs.items():
        fit_params[key] = value
    return fit_params
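# `lr_decayp` above is an undefined factory; since its return value is handed
# to reset_parameter, it must produce a callable mapping the iteration index
# to a learning rate. A plausible sketch (all constants are assumptions):
def lr_decayp(base_lr=0.05, decay=0.99):
    return lambda current_iter: max(1e-3, base_lr * decay ** current_iter)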
def test_cv(self):
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    # shuffle = False, override metric in params
    params_with_metric = {'metric': 'l2', 'verbose': -1}
    lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3,
           shuffle=False, metrics='l1', verbose_eval=False)
    # shuffle = True, callbacks
    lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, shuffle=True,
           metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    # self-defined data_splitter
    tss = TimeSeriesSplit(3)
    lgb.cv(params, lgb_train, num_boost_round=10,
           data_splitter=tss,
           nfold=5,  # test if wrong nfold is ignored
           metrics='l2', verbose_eval=False)
    # lambdarank
    X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
    q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
    params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
           metrics='l2', verbose_eval=False)
def test_xendcg(self):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    X_train, y_train = load_svmlight_file(os.path.join(dir_path, '../../examples/xendcg/rank.train'))
    X_test, y_test = load_svmlight_file(os.path.join(dir_path, '../../examples/xendcg/rank.test'))
    q_train = np.loadtxt(os.path.join(dir_path, '../../examples/xendcg/rank.train.query'))
    q_test = np.loadtxt(os.path.join(dir_path, '../../examples/xendcg/rank.test.query'))
    gbm = lgb.LGBMRanker(n_estimators=50, objective='rank_xendcg', random_state=5, n_jobs=1)
    gbm.fit(X_train, y_train, group=q_train,
            eval_set=[(X_test, y_test)], eval_group=[q_test], eval_at=[1, 3],
            early_stopping_rounds=10, verbose=False, eval_metric='ndcg',
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
    self.assertLessEqual(gbm.best_iteration_, 24)
    self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6579)
    self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6421)
def test_cv(self):
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    # shuffle = False, override metric in params
    params_with_metric = {'metric': 'l2', 'verbose': -1}
    cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                    nfold=3, stratified=False, shuffle=False,
                    metrics='l1', verbose_eval=False)
    self.assertIn('l1-mean', cv_res)
    self.assertNotIn('l2-mean', cv_res)
    self.assertEqual(len(cv_res['l1-mean']), 10)
    # shuffle = True, callbacks
    cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=3,
                    stratified=False, shuffle=True,
                    metrics='l1', verbose_eval=False,
                    callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    self.assertIn('l1-mean', cv_res)
    self.assertEqual(len(cv_res['l1-mean']), 10)
    # self-defined folds
    tss = TimeSeriesSplit(3)
    folds = tss.split(X_train)
    cv_res_gen = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                        folds=folds, verbose_eval=False)
    cv_res_obj = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                        folds=tss, verbose_eval=False)
    np.testing.assert_almost_equal(cv_res_gen['l2-mean'], cv_res_obj['l2-mean'])
    # lambdarank
    X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                       '../../examples/lambdarank/rank.train'))
    q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                      '../../examples/lambdarank/rank.train.query'))
    params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3}
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    # ... with l2 metric
    cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
                           nfold=3, metrics='l2', verbose_eval=False)
    self.assertEqual(len(cv_res_lambda), 2)
    self.assertFalse(np.isnan(cv_res_lambda['l2-mean']).any())
    # ... with NDCG (default) metric
    cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
                           nfold=3, verbose_eval=False)
    self.assertEqual(len(cv_res_lambda), 2)
    self.assertFalse(np.isnan(cv_res_lambda['ndcg@3-mean']).any())
    # self-defined folds with lambdarank
    cv_res_lambda_obj = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
                               folds=GroupKFold(n_splits=3), verbose_eval=False)
    np.testing.assert_almost_equal(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean'])
def fit(self, X, Y, eval_set=[]):
    self.classes_ = unique_labels(Y)
    self.X_ = X
    self.y_ = Y
    self.gbdt.fit(X, Y, eval_set=eval_set,
                  eval_metric=lambda y_true, y_pred: [
                      self._focal_eval(y_true, y_pred),
                      self._f1_score(y_true, y_pred)
                  ],
                  verbose=100,
                  callbacks=[lgb.reset_parameter(learning_rate=self._lr_linear_cosine_decay)])
    return self
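# `self._lr_linear_cosine_decay` is not shown in this snippet. A hedged,
# standalone sketch of a linear-cosine schedule in the style of
# tf.train.linear_cosine_decay; total_iters, base_lr, and beta are all
# assumed values, and the real method may differ:
import math

def _lr_linear_cosine_decay(current_iter, total_iters=500, base_lr=0.1, beta=0.001):
    t = min(current_iter, total_iters) / total_iters
    linear = 1.0 - t                            # linear ramp-down
    cosine = 0.5 * (1.0 + math.cos(math.pi * t))  # half-period cosine
    return base_lr * (linear * cosine + beta)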
def get_model(train_x, train_y, valid_x, valid_y, num_class, best_params) -> t.Any:
    # NOTE: the tuned values below override the best_params argument
    best_params = {
        'lambda_l1': 5.96,
        'lambda_l2': 1.1,
        'num_leaves': 12,
        'feature_fraction': 0.75,
        'bagging_fraction': 0.89,
        'bagging_freq': 7,
        # 'min_child_samples': 100
    }
    # training dataset
    train_set = lgb.Dataset(train_x, train_y, free_raw_data=False)
    # validation dataset
    valid_set = lgb.Dataset(valid_x, valid_y, free_raw_data=False)
    evals_result = {}
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_class': num_class,
        **best_params
    }
    model = lgb.train(
        params=params,
        train_set=train_set,
        valid_sets=[valid_set, train_set],
        num_boost_round=1000,
        early_stopping_rounds=100,
        verbose_eval=10,
        # learning_rates=lambda iter: 0.1 * (0.99 ** iter),
        callbacks=[
            # one rate per round: 1000 values for num_boost_round=1000
            lgb.reset_parameter(learning_rate=[0.2] * 400 + [0.1] * 400 + [0.05] * 200)
        ],
        evals_result=evals_result,
    )
    importance = pd.DataFrame(model.feature_importance(),
                              index=train_x.columns,
                              columns=['importance']).sort_values('importance', ascending=[False])
    print(importance.head(50))
    return model, evals_result
def cross_validate(param=dict(n_estimators=1000, metric="map", colsample_bytree=0.2,
                              max_depth=7, importance_type="gain"),
                   n_folds=5, target="satisfied"):
    train_users = big_table["user_id"].unique()
    folds = KFold(n_folds, shuffle=True, random_state=42)
    models = []
    test_pred = np.zeros(test_big_table.shape[0])
    scores = []
    for idx, (train_idx, valid_idx) in enumerate(folds.split(train_users)):
        t_user = train_users[train_idx]
        v_user = train_users[valid_idx]
        train_data = big_table[big_table["user_id"].isin(t_user)]
        valid_data = big_table[big_table["user_id"].isin(v_user)]
        train_group = train_data.groupby("user_id", as_index=False).count()["satisfied"].values
        valid_group = valid_data.groupby("user_id", as_index=False).count()["satisfied"].values
        test_group = test_big_table.groupby("user_id", as_index=False).count()["jd_no"].values
        result = feature_select(target, train_data, valid_data, test_big_table)
        t_x, t_y = result[0]
        v_x, v_y = result[1]
        test_x, _ = result[2]
        model = lgb.LGBMRanker(**param)
        print("Fold", idx, "-" * 30)
        model.fit(t_x, t_y, group=train_group,
                  eval_set=[(t_x, t_y), (v_x, v_y)],
                  eval_group=[train_group, valid_group],
                  early_stopping_rounds=100, verbose=10,
                  # a constant learning rate, set via the callback
                  callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.01)])
        models.append(model)
        test_pred += model.predict(test_x) / n_folds
        scores.append(model.best_score_["valid_1"]["ndcg@1"])
    print("mean score", np.mean(scores))
    return models, test_pred
def test_cv(self):
    lgb_train, _ = template.test_template(return_data=True)
    # shuffle = False
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=10, nfold=3,
           shuffle=False, metrics='l1', verbose_eval=False)
    # shuffle = True, callbacks
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=10, nfold=3,
           shuffle=True, metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    # self-defined data_splitter
    tss = TimeSeriesSplit(3)
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=10,
           data_splitter=tss,
           nfold=5,  # test if wrong nfold is ignored
           metrics='l2', verbose_eval=False)
    # lambdarank
    X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
    q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
    params = {'objective': 'lambdarank', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train, params=params)
    lgb.cv(params, lgb_train, num_boost_round=20, nfold=3,
           metrics='l2', verbose_eval=False)
def train_lgb_rank_model(ranker, train_data, test_data):
    """Fit an LGBMRanker with early stopping and a decaying learning rate."""
    X_train, y_train, q_train = train_data
    X_test, y_test, q_test = test_data
    ranker.fit(
        X_train, y_train, group=q_train,
        eval_set=[(X_test, y_test)], eval_group=[q_test], eval_at=[1, 3],
        early_stopping_rounds=5, verbose=True,
        callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.9 ** x * 0.1)],
    )
    return ranker
def train(self):
    global MODEL_PATH
    lgb_train, lgb_test = self.samples()

    def learning_rate(epoch, span=100):
        # cycle through a fixed schedule, holding each rate for `span` rounds
        cycle = [0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.005]
        lr = cycle[(epoch // span) % len(cycle)]
        print(f"LEARN RATE = {lr}")
        return lr

    gbm = lgb.train(
        self.params,
        lgb_train,
        num_boost_round=4000,  # effectively unbounded; early stopping decides
        valid_sets=lgb_test,
        # init_model=MODEL_PATH,
        early_stopping_rounds=500,
        callbacks=[lgb.reset_parameter(learning_rate=learning_rate)],
    )
    gbm.save_model(MODEL_PATH)
    self.load()
def test_joblib(self):
    X, y = load_boston(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
                            silent=True, importance_type='split')
    gbm.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            eval_metric=mse, early_stopping_rounds=5, verbose=False,
            callbacks=[lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))])
    joblib.dump(gbm, 'lgb.pkl')
    # test model with custom functions
    gbm_pickle = joblib.load('lgb.pkl')
    self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
    self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
    np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
    self.assertAlmostEqual(gbm_pickle.learning_rate, 0.1)
    self.assertTrue(callable(gbm_pickle.objective))
    for eval_set in gbm.evals_result_:
        for metric in gbm.evals_result_[eval_set]:
            np.testing.assert_allclose(gbm.evals_result_[eval_set][metric],
                                       gbm_pickle.evals_result_[eval_set][metric])
    pred_origin = gbm.predict(X_test)
    pred_pickle = gbm_pickle.predict(X_test)
    np.testing.assert_allclose(pred_origin, pred_pickle)
def test_cv(self):
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    # shuffle = False, override metric in params
    params_with_metric = {'metric': 'l2', 'verbose': -1}
    lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, shuffle=False, metrics='l1', verbose_eval=False)
    # shuffle = True, callbacks
    lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False,
           shuffle=True, metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    # self-defined folds
    tss = TimeSeriesSplit(3)
    folds = tss.split(X_train)
    lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds,
           stratified=False, verbose_eval=False)
    # lambdarank
    X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                       '../../examples/lambdarank/rank.train'))
    q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                      '../../examples/lambdarank/rank.train.query'))
    params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, metrics='l2', verbose_eval=False)
for train_index, val_index in kf.split(dataset_values, dataset_target):
    i += 1
    val_target = dataset_target.iloc[val_index]
    val_val = dataset_values.iloc[val_index]
    train_target = dataset_target.iloc[train_index]
    train_val = dataset_values.iloc[train_index]
    # add extra columns missing in the test set
    for col in train_val.columns:
        if col not in val_val.columns:
            val_val[col] = 0
    val_val = val_val[train_val.columns]
    clf = lgb.LGBMClassifier()
    clf.set_params(**params)
    clf.fit(train_val.values, train_target,
            callbacks=[lgb.reset_parameter(learning_rate=lr_decay)])
    y_pred = clf.predict(val_val)
    correct = len([i for i, j in zip(y_pred, val_target) if i == j])
    ratio = correct / len(val_target) * 100
    print(f"Accuracy f-{i}: {ratio:.3f}")
    preds += array(clf.predict_proba(test_val))

preds = argmax(preds, axis=1)
submission = pd.read_csv(PATH_DATA + 'SubmissionFormat.csv')
labels = ["non functional", "functional needs repair", "functional"]
submission['status_group'] = list(map(lambda x: labels[x], preds))
submission.to_csv(PATH_DATA + "submission.csv", index=False)
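# `lr_decay` passed to reset_parameter above is not defined in this snippet;
# a plausible exponential-decay stand-in (all constants are assumptions):
def lr_decay(current_iter):
    return max(0.01, 0.1 * 0.99 ** current_iter)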
from optuna import structs
from optuna import study as study_module
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from optgbm.sklearn import OGBMClassifier
from optgbm.sklearn import OGBMRegressor
from optgbm.sklearn import _VotingBooster

n_estimators = 10
n_trials = 5
random_state = 0
callback = lgb.reset_parameter(learning_rate=lambda iteration: 0.05 * (0.99 ** iteration))
early_stopping_rounds = 3


def log_likelihood(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    y_pred = 1.0 / (1.0 + np.exp(-y_pred))
    return y_pred - y_true, y_pred * (1.0 - y_pred)


def zero_one_loss(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[str, np.number, bool]:
    return "zero_one_loss", np.mean(y_true != y_pred), False
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finish 30 - 40 rounds with changing bagging_fraction...')

# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
def test_param(lgbm_param):
    print('in test_param')
    gc.collect()
    global y_train_new, X_train_new, y_val_new, X_val_new

    start_t = time.time()
    lgbm = lgb.LGBMClassifier(**lgbm_param)
    learning_rate_func = lgb.reset_parameter(learning_rate=generate_learning_rate_list())
    print('start partial training')
    lgbm.fit(X_train_new, y_train_new,
             eval_set=[(X_train_new, y_train_new), (X_val_new, y_val_new)],
             callbacks=[learning_rate_func],
             eval_metric='logloss',
             verbose=100,
             early_stopping_rounds=300)
    print('partial fit cost time: ', time.time() - start_t)
    best_iteration = lgbm.best_iteration_
    logloss_val = round(lgbm.best_score_['valid_1']['binary_logloss'], 5)
    val_click_prob = lgbm.predict_proba(X_val_new)[:, 1]
    print('after managed_change logloss is ',
          log_loss(y_val_new, val_click_prob),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.01)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.02)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.05)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.08)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.10)))
    print('after managed_change auc is ',
          roc_auc_score(y_val_new, val_click_prob),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.01)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.02)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.05)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.08)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.10)))

    start_t = time.time()
    prediction_click_prob = lgbm.predict_proba(test_X)[:, 1]
    outcome_df['predicted_score'] = prediction_click_prob
    param_md5_str = convert_2_md5(lgbm_param)
    store_path = 'C:/D_Disk/data_competition/xunfei_ai_ctr/outcome/'
    partial_file_name = '_'.join(['submission_partial', str(logloss_val), param_md5_str]) + '.csv'
    full_file_name = '_'.join(['submission_full', str(logloss_val), param_md5_str]) + '.csv'
    outcome_df['predicted_score'].to_csv(store_path + partial_file_name,
                                         header=['predicted_score'])
    print('partial get predict outcome cost time: ', time.time() - start_t)

    del lgbm
    gc.collect()
    del X_train_new, X_val_new
    gc.collect()
    del y_train_new, y_val_new
    gc.collect()
    for i in range(5):
        gc.collect()

    start_t = time.time()
    lgbm_param['n_estimators'] = int(best_iteration * 1.1)
    print('extra full fit n_estimators is ', int(best_iteration * 1.1))
    lgbm = lgb.LGBMClassifier(**lgbm_param)
    learning_rate_func = lgb.reset_parameter(
        learning_rate=generate_learning_rate_list()[:lgbm_param['n_estimators']])
    lgbm.fit(train_X, train_y, callbacks=[learning_rate_func])
    print('extra full fit cost time: ', time.time() - start_t)

    start_t = time.time()
    prediction_click_prob = lgbm.predict_proba(test_X)[:, 1]
    outcome_df['predicted_score'] = prediction_click_prob
    outcome_df['predicted_score'].to_csv(store_path + full_file_name,
                                         header=['predicted_score'])
    print('extra full predict cost time: ', time.time() - start_t)

    write_to_log('-' * 25, ' md5 value: ', param_md5_str, '-' * 25)
    write_to_log('param: ', lgbm_param)
    write_to_log('best_iteration: ', best_iteration)
    write_to_log('valid logloss: ', logloss_val)
    write_to_log('-' * 80 + '\n')
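# `generate_learning_rate_list` is referenced in both halves of test_param but
# never shown; since its result is sliced to `n_estimators` entries, it must
# return one learning rate per boosting round. A hypothetical sketch (all
# constants are assumptions):
def generate_learning_rate_list(n_rounds=10000, base_lr=0.1, decay=0.995, floor=0.005):
    return [max(floor, base_lr * decay ** i) for i in range(n_rounds)]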
tune_params = {
    'n_estimators': [200, 500, 1000, 2500, 5000],
    'max_depth': sp_randint(4, 12),
    'colsample_bytree': sp_uniform(loc=0.8, scale=0.15),
    'min_child_samples': sp_randint(60, 120),
    'subsample': sp_uniform(loc=0.75, scale=0.25),
    'reg_lambda': [1e-3, 1e-2, 1e-1, 1]
}
fit_params = {
    'early_stopping_rounds': 40,
    'eval_metric': 'accuracy',
    'eval_set': [(X_train, y_train), (X_val, y_val)],
    'verbose': 20,
    'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_power)]
}
lgb_clf = lgb.LGBMClassifier(n_jobs=4, objective='binary', random_state=1)
gs = RandomizedSearchCV(estimator=lgb_clf, param_distributions=tune_params,
                        n_iter=40, scoring='f1', cv=5, refit=True,
                        random_state=1, verbose=True)
lgb_clf = lgb.LGBMClassifier(n_jobs=4, objective='multiclass', random_state=100)
opt_params = {
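# `learning_rate_power` above is assumed but not defined in this snippet; by
# analogy with the *_decay_power_* helpers elsewhere in this collection, a
# sketch could be (constants are assumptions):
def learning_rate_power(current_iter):
    lr = 0.1 * 0.995 ** current_iter
    return lr if lr > 1e-3 else 1e-3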
    reg_alpha=0.0,
    reg_lambda=100.0,
    scale_pos_weight=1.0,
    subsample=1.0,
    subsample_freq=1,
    random_state=n_fold)
clf.fit(
    trn_x, trn_y,
    eval_set=[(trn_x, trn_y), (val_x, val_y)],
    eval_metric='auc',
    verbose=1000,
    early_stopping_rounds=600,
    callbacks=[lgb.reset_parameter(learning_rate=[200 / (8000 + x) for x in range(10000)])],
    # categorical_feature=CATEGORICAL_COLUMNS
    categorical_feature=small_cat
)
oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
sub_preds += rankdata(clf.predict_proba(test[feats], num_iteration=clf.best_iteration_)[:, 1]) \
    / folds.n_splits / len(sub_preds)
fold_importance_df = pd.DataFrame()
fold_importance_df["feature"] = feats
fold_importance_df["importance"] = clf.feature_importances_
fold_importance_df["fold"] = n_fold + 1
def learning_rate_010_decay_power_0995(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate * np.power(.995, current_iter)
    return lr if lr > 1e-3 else 1e-3


def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3


# set up LightGBM fit parameters
fit_params = {
    "early_stopping_rounds": 30,
    "eval_metric": 'mae',
    "eval_set": [(X_test, y_test)],
    'eval_names': ['valid'],
    'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
    'verbose': -1,
    'categorical_feature': 'auto',
}

# starting parameters
param_test = {
    # 'num_leaves': sp_randint(100, 1000),
    # 'max_depth': sp_randint(1, 10),
    # 'min_data_in_leaf': sp_randint(1, 100),
    # 'min_child_samples': sp_randint(100, 1000),
    # 'min_child_weight': sp_uniform(loc=0, scale=1.0),  # [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]
    # 'subsample': sp_uniform(loc=0.2, scale=0.8),
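# `learning_rate_010_decay_power_099`, referenced in fit_params above, is not
# shown in this fragment; following the visible naming pattern (base 0.1,
# decay power .99), a minimal sketch would be:
def learning_rate_010_decay_power_099(current_iter):
    lr = 0.1 * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3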
    print(f'{i}:{attr} FOLD:{fold}')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    ys.append(y_valid)
    fit_params = {
        "early_stopping_rounds": 300,
        "eval_metric": evaluate_macroF1_lgb,
        "eval_set": [(X_valid, y_valid)],
        'eval_names': ['valid'],
        'verbose': False,
        'categorical_feature': 'auto'
    }
    fit_params['callbacks'] = [lgb.reset_parameter(learning_rate=learning_rate_power_0997)]
    opt_parameters = {
        'min_child_samples': 150,
        'num_leaves': 2,
        'class_weight': 'balanced',
        'lambda_l1': 1.79,
        'lambda_l2': 1.71,
        'num_trees': 2000
    }
    clf_final = lgb.LGBMClassifier(bagging_fraction=0.9957236684465528,
                                   boosting_type='gbdt',
                                   class_weight='balanced',
                                   colsample_bytree=0.7953949538181928,
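# `learning_rate_power_0997` is not defined in this fragment; following the
# naming convention of the other decay helpers (decay factor .997 per round),
# a sketch could be (base rate and floor are assumptions):
def learning_rate_power_0997(current_iter):
    lr = 0.1 * 0.997 ** current_iter
    return lr if lr > 1e-3 else 1e-3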
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)
print('Finished 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finished 30 - 40 rounds with changing bagging_fraction...')

# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
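# In the pre-4.0 lgb.train API, a custom objective like `loglikelihood` would
# be passed via the `fobj` argument; a sketch continuing the example above
# (assumes `gbm`, `params`, `lgb_train`, and `lgb_eval` from the preceding
# rounds):
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                valid_sets=lgb_eval)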
"random_state": 1337 } dtrain = lgb.Dataset(X_train, label=y_train) dvalid = lgb.Dataset(X_test, label=y_test) evals_result = {} clf_lgb = lgb.train( params, dtrain, valid_sets=[dtrain, dvalid], num_boost_round=500, early_stopping_rounds=50, verbose_eval=100, feature_name=df.iloc[:, 4:].columns.tolist(), callbacks=[ lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099) ], evals_result=evals_result) y_pred1 = clf_lgb.predict(X_train, num_iteration=clf_lgb.best_iteration) y_pred1 = np.argmax(y_pred1, axis=1) y_pred2 = clf_lgb.predict(X_test, num_iteration=clf_lgb.best_iteration) y_pred2 = np.argmax(y_pred2, axis=1) score1 = accuracy_score(y_train, y_pred1) * 100 score2 = accuracy_score(y_test, y_pred2) * 100 print("\nLGB Model Report") print("train {:.2f} | valid {:.2f}".format(float(score1), float(score2))) # hyperparameters tuning clf_lgb = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=100,
def test_cv(self):
    lgb_train, _ = template.test_template(return_data=True)
    lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
           metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
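# Across all of the snippets above, lgb.reset_parameter accepts, for each
# named parameter, either a list with one value per boosting round or a
# callable mapping the iteration index to a value. A minimal sketch of the
# two forms side by side (assumes a Dataset named `lgb_train` as above):
lgb.train({'verbose': -1}, lgb_train, num_boost_round=10,
          callbacks=[lgb.reset_parameter(learning_rate=[0.1] * 5 + [0.05] * 5)])
lgb.train({'verbose': -1}, lgb_train, num_boost_round=10,
          callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 * 0.99 ** i)])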