Example #1
 def test_cv(self):
     lgb_train, _ = template.test_template(return_data=True)
     lgb.cv({'verbose': -1},
            lgb_train,
            num_boost_round=20,
            nfold=5,
            shuffle=False,
            metrics='l1',
            verbose_eval=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
            ])
     lgb.cv({'verbose': -1},
            lgb_train,
            num_boost_round=20,
            nfold=5,
            shuffle=True,
            metrics='l1',
            verbose_eval=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
            ])
     tss = TimeSeriesSplit(3)
     lgb.cv(
         {'verbose': -1},
         lgb_train,
         num_boost_round=20,
         data_splitter=tss,
         nfold=5,  # test if wrong nfold is ignored
         metrics='l2',
         verbose_eval=False)
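Outside the test harness, the same pattern can be reproduced with a minimal, self-contained sketch (synthetic data assumed, not part of the original test); reset_parameter accepts either a function of the boosting-iteration index or a list with one value per round:

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 10))
y = X[:, 0] + rng.normal(scale=0.1, size=500)

cv_results = lgb.cv(
    {'objective': 'regression', 'metric': 'l1', 'verbose': -1},
    lgb.Dataset(X, label=y),
    num_boost_round=20,
    nfold=5,
    stratified=False,
    callbacks=[
        # learning rate shrinks linearly with the boosting iteration
        lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
    ])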
Example #2
def get_predict_w(model, data, label='label', feature=[], cate_feature=[], random_state=2018, n_splits=5,
                  model_type='lgb'):
    if 'sample_weight' not in data.keys():
        data['sample_weight'] = 1
    model.random_state = random_state
    predict_label = 'predict_' + label
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    data[predict_label] = 0
    test_index = (data[label].isnull()) | (data[label] == -1)    # find the rows to be predicted
    train_data = data[~test_index].reset_index(drop=True)     # split into training and prediction sets
    test_data = data[test_index]

    for train_idx, val_idx in kfold.split(train_data):
        model.random_state = model.random_state + 1

        train_x = train_data.loc[train_idx][feature]
        train_y = train_data.loc[train_idx][label]

        test_x = train_data.loc[val_idx][feature]
        test_y = train_data.loc[val_idx][label]
        if model_type == 'lgb':
            try:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=400,
                          eval_metric='mae',
                          callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
            except Exception:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=200,
                          eval_metric='mae',
                          callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)

        elif model_type == 'ctb':
            model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=200,
                      # eval_metric='mae',
                      # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                      cat_features=cate_feature,
                      sample_weight=train_data.loc[train_idx]['sample_weight'],
                      verbose=100)
        train_data.loc[val_idx, predict_label] = model.predict(test_x)
        if len(test_data) != 0:                  # predict on the rows held out for prediction
            test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])
    test_data[predict_label] = test_data[predict_label] / n_splits
    # print((train_data[label], train_data[predict_label]) * 5, train_data[predict_label].mean(),
    #       test_data[predict_label].mean())
    # print('########################################')

    return pd.concat([train_data, test_data], sort=True, ignore_index=True), predict_label
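Taken in isolation, the learning-rate schedule used in the lgb branch above is an exponential decay floored at 0.005; a quick check of its values (illustration only, not part of the original function):

decay = lambda it: max(0.005, 0.5 * (0.99 ** it))
print([round(decay(i), 4) for i in (0, 50, 100, 400)])
# -> [0.5, 0.3025, 0.183, 0.009]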
Example #3
def test_lightgbm_ranking():
    try:
        import lightgbm
    except ImportError:
        print("Skipping test_lightgbm_ranking!")
        return
    import shap
    import numpy as np

    # train lightgbm ranker model
    x_train, y_train, x_test, y_test, q_train, q_test = shap.datasets.rank()
    model = lightgbm.LGBMRanker()
    model.fit(
        x_train,
        y_train,
        group=q_train,
        eval_set=[(x_test, y_test)],
        eval_group=[q_test],
        eval_at=[1, 3],
        early_stopping_rounds=5,
        verbose=False,
        callbacks=[
            lightgbm.reset_parameter(learning_rate=lambda x: 0.95**x * 0.1)
        ])
    _validate_shap_values(model, x_test)
Example #4
 def test_lambdarank(self):
     X_train, y_train = load_svmlight_file(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train'))
     X_test, y_test = load_svmlight_file(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.test'))
     q_train = np.loadtxt(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train.query'))
     q_test = np.loadtxt(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.test.query'))
     gbm = lgb.LGBMRanker()
     gbm.fit(X_train,
             y_train,
             group=q_train,
             eval_set=[(X_test, y_test)],
             eval_group=[q_test],
             eval_at=[1, 3],
             early_stopping_rounds=10,
             verbose=False,
             callbacks=[
                 lgb.reset_parameter(
                     learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
             ])
     self.assertLessEqual(gbm.best_iteration_, 25)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6333)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6048)
Example #5
 def test_lambdarank(self):
     X_train, y_train = load_svmlight_file(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train'))
     X_test, y_test = load_svmlight_file(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.test'))
     q_train = np.loadtxt(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train.query'))
     q_test = np.loadtxt(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.test.query'))
     gbm = lgb.LGBMRanker()
     gbm.fit(X_train,
             y_train,
             group=q_train,
             eval_set=[(X_test, y_test)],
             eval_group=[q_test],
             eval_at=[1, 3],
             early_stopping_rounds=5,
             verbose=False,
             callbacks=[
                 lgb.reset_parameter(learning_rate=lambda x: 0.95**x * 0.1)
             ])
Example #6
def fit_lgb(x_tr, y_tr, x_va, y_va, cat_feats, args):
    import lightgbm as lgb
    import numpy as np
    from lightgbm import Dataset
    from sklearn.metrics import mean_squared_error

    if args.clip_target != -1:
        y_tr = y_tr.clip(upper=args.clip_target)

    tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False)
    if args.mode not in ['full', 'fold']:
        va_ds = Dataset(x_va, label=y_va, free_raw_data=False)
        valid_sets = [tr_ds, va_ds]
    else:
        valid_sets = [tr_ds]

    params = {
        'learning_rate': 0.02,
        'max_depth': -1,
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'is_training_metric': True,
        'num_leaves': args.num_leaves,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'lambda_l2': 0.7,
        'bagging_freq': 5,
        'seed': 42
    }

    kwargs = {
        'train_set': tr_ds,
        'categorical_feature': cat_feats,
        'verbose_eval': args.verbose_eval,
        'num_boost_round': args.num_boost_round,
    }

    if args.mode not in ['full', 'fold']:
        kwargs['early_stopping_rounds'] = 200
        kwargs['valid_sets'] = valid_sets

    if args.lr_decay:
        kwargs['callbacks'] = [
            lgb.reset_parameter(
                learning_rate=learning_rate_010_decay_power_0995)
        ]

    m = lgb.train(params, **kwargs)

    tr_pred = np.clip(m.predict(tr_ds.data), 0, 361)
    tr_score = np.sqrt(mean_squared_error(tr_pred, tr_ds.label))

    if args.mode not in ['full', 'fold']:
        va_pred = np.clip(m.predict(va_ds.data), 0, 361)
        va_score = np.sqrt(mean_squared_error(va_pred, va_ds.label))
    else:
        va_score = 0.

    return m, tr_score, va_score
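fit_lgb relies on a learning_rate_010_decay_power_0995 helper defined elsewhere in that repository. Going by the naming convention of the similar helpers further down this page (base rate 0.1, decay factor 0.995), a plausible definition would be:

def learning_rate_010_decay_power_0995(current_iter):
    # exponential decay from 0.1, floored at 1e-3
    lr = 0.1 * (0.995 ** current_iter)
    return max(lr, 1e-3)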
Example #7
 def test_lambdarank(self):
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test'))
     q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
     q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test.query'))
     gbm = lgb.LGBMRanker()
     gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
             eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
             callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
Example #8
 def test_cv(self):
     X, y = load_boston(True)
     X_train, _, y_train, _ = train_test_split(X,
                                               y,
                                               test_size=0.1,
                                               random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     lgb.cv(params_with_metric,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            stratified=False,
            shuffle=False,
            metrics='l1',
            verbose_eval=False)
     # shuffle = True, callbacks
     lgb.cv(params,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            stratified=False,
            shuffle=True,
            metrics='l1',
            verbose_eval=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
            ])
     # self defined folds
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
     lgb.cv(params_with_metric,
            lgb_train,
            num_boost_round=10,
            folds=folds,
            stratified=False,
            verbose_eval=False)
     # lambdarank
     X_train, y_train = load_svmlight_file(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     lgb.cv(params_lambdarank,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            stratified=False,
            metrics='l2',
            verbose_eval=False)
Example #9
 def test_cv(self):
     lgb_train, _ = template.test_template(return_data=True)
     lgb.cv({'verbose': -1},
            lgb_train,
            num_boost_round=20,
            nfold=5,
            metrics='l1',
            verbose_eval=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
            ])
Example #10
def test_reset_parameter_callback_is_picklable(serializer):
    params = {
        'bagging_fraction': [0.7] * 5 + [0.6] * 5,
        'feature_fraction': reset_feature_fraction
    }
    callback = lgb.reset_parameter(**params)
    callback_from_disk = pickle_and_unpickle_object(obj=callback,
                                                    serializer=serializer)
    assert callback_from_disk.order == 10
    assert callback_from_disk.before_iteration is True
    assert callback.kwargs == callback_from_disk.kwargs
    assert callback.kwargs == params
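The round-trip exercised above is ordinary pickling; a hedged sketch with plain pickle standing in for the suite's pickle_and_unpickle_object helper. Note that a lambda would not survive pickling, which is presumably why the test uses a list and a named function:

import pickle
import lightgbm as lgb

cb = lgb.reset_parameter(learning_rate=[0.1] * 5 + [0.05] * 5)
cb2 = pickle.loads(pickle.dumps(cb))
assert cb2.kwargs == cb.kwargs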
Example #11
 def test_lambdarank(self):
     X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
     X_test, y_test = load_svmlight_file('../../examples/lambdarank/rank.test')
     q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
     q_test = np.loadtxt('../../examples/lambdarank/rank.test.query')
     lgb_model = lgb.LGBMRanker().fit(X_train, y_train,
                                      group=q_train,
                                      eval_set=[(X_test, y_test)],
                                      eval_group=[q_test],
                                      eval_at=[1],
                                      verbose=False,
                                      callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
Example #12
def lightgbm_fitparams(**kwargs):
    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": 'auc',
        "eval_set": [(None, None)],
        'eval_names': ['valid'],
        'callbacks': [lgbm.reset_parameter(learning_rate=lr_decayp())],
        'verbose': 100,
        'categorical_feature': 'auto'
    }
    for key, value in kwargs.items():
        fit_params[key] = value
    return fit_params
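Any keyword passed to lightgbm_fitparams overrides the matching default, which is how the (None, None) eval_set placeholder is meant to be replaced. A hypothetical call (X_train, y_train, X_val, y_val and model are assumed names):

fit_params = lightgbm_fitparams(eval_set=[(X_val, y_val)], verbose=50)
model.fit(X_train, y_train, **fit_params)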
Example #13
 def test_cv(self):
     X, y = load_boston(True)
     X_train, _, y_train, _ = train_test_split(X,
                                               y,
                                               test_size=0.1,
                                               random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     lgb.cv(params_with_metric,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            shuffle=False,
            metrics='l1',
            verbose_eval=False)
     # shuffle = True, callbacks
     lgb.cv(params,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            shuffle=True,
            metrics='l1',
            verbose_eval=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
            ])
     # self defined data_splitter
     tss = TimeSeriesSplit(3)
     lgb.cv(
         params,
         lgb_train,
         num_boost_round=10,
         data_splitter=tss,
         nfold=5,  # test if wrong nfold is ignored
         metrics='l2',
         verbose_eval=False)
     # lambdarank
     X_train, y_train = load_svmlight_file(
         '../../examples/lambdarank/rank.train')
     q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     lgb.cv(params_lambdarank,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            metrics='l2',
            verbose_eval=False)
Example #14
 def test_xendcg(self):
     dir_path = os.path.dirname(os.path.realpath(__file__))
     X_train, y_train = load_svmlight_file(os.path.join(dir_path, '../../examples/xendcg/rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(dir_path, '../../examples/xendcg/rank.test'))
     q_train = np.loadtxt(os.path.join(dir_path, '../../examples/xendcg/rank.train.query'))
     q_test = np.loadtxt(os.path.join(dir_path, '../../examples/xendcg/rank.test.query'))
     gbm = lgb.LGBMRanker(n_estimators=50, objective='rank_xendcg', random_state=5, n_jobs=1)
     gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
             eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False,
             eval_metric='ndcg',
             callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
     self.assertLessEqual(gbm.best_iteration_, 24)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6579)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6421)
Example #15
 def test_cv(self):
     X, y = load_boston(True)
     X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                     nfold=3, stratified=False, shuffle=False,
                     metrics='l1', verbose_eval=False)
     self.assertIn('l1-mean', cv_res)
     self.assertNotIn('l2-mean', cv_res)
     self.assertEqual(len(cv_res['l1-mean']), 10)
     # shuffle = True, callbacks
     cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
                     metrics='l1', verbose_eval=False,
                     callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
     self.assertIn('l1-mean', cv_res)
     self.assertEqual(len(cv_res['l1-mean']), 10)
     # self defined folds
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
     cv_res_gen = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds,
                         verbose_eval=False)
     cv_res_obj = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=tss,
                         verbose_eval=False)
     np.testing.assert_almost_equal(cv_res_gen['l2-mean'], cv_res_obj['l2-mean'])
     # lambdarank
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                        '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                       '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     # ... with l2 metric
     cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
                            metrics='l2', verbose_eval=False)
     self.assertEqual(len(cv_res_lambda), 2)
     self.assertFalse(np.isnan(cv_res_lambda['l2-mean']).any())
     # ... with NDCG (default) metric
     cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
                            verbose_eval=False)
     self.assertEqual(len(cv_res_lambda), 2)
     self.assertFalse(np.isnan(cv_res_lambda['ndcg@3-mean']).any())
     # self defined folds with lambdarank
     cv_res_lambda_obj = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
                                folds=GroupKFold(n_splits=3),
                                verbose_eval=False)
     np.testing.assert_almost_equal(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean'])
Example #16
    def fit(self, X, Y, eval_set=[]):
        self.classes_ = unique_labels(Y)
        self.X_ = X
        self.y_ = Y
        self.gbdt.fit(X, Y,
            eval_set=eval_set,
            eval_metric=lambda y_true, y_pred: [
                self._focal_eval(y_true, y_pred),
                self._f1_score(y_true, y_pred)
            ],
            verbose=100,
            callbacks=[lgb.reset_parameter(learning_rate=self._lr_linear_cosine_decay)]
        )

        return self
Example #17
def get_model(train_x, train_y, valid_x, valid_y, num_class,
              best_params) -> t.Any:
    best_params = {
        'lambda_l1': 5.96,
        'lambda_l2': 1.1,
        'num_leaves': 12,
        'feature_fraction': 0.75,
        'bagging_fraction': 0.89,
        'bagging_freq': 7,
        #'min_child_sample': 100
    }
    # training dataset
    train_set = lgb.Dataset(train_x, train_y, free_raw_data=False)
    # validation dataset
    valid_set = lgb.Dataset(valid_x, valid_y, free_raw_data=False)
    evals_result = {}
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_class': num_class,
        **best_params
    }
    model = lgb.train(
        params=params,
        train_set=train_set,
        valid_sets=[valid_set, train_set],
        num_boost_round=1000,
        early_stopping_rounds=100,
        verbose_eval=10,
        # learning_rates=lambda iter: 0.1 * (0.99 ** iter),
        callbacks=[
            lgb.reset_parameter(learning_rate=[0.2] * 400 + [0.1] * 400 +
                                [0.05] * 200)
        ],
        evals_result=evals_result,
    )
    importance = pd.DataFrame(model.feature_importance(),
                              index=train_x.columns,
                              columns=['importance']).sort_values('importance', ascending=False)
    print(importance.head(50))
    return model, evals_result
Example #18
def cross_validate(param=dict(n_estimators=1000,
                              metric="map",
                              colsample_bytree=0.2,
                              max_depth=7,
                              importance_type="gain"),
                   n_folds=5,
                   target="satisfied"):
    train_users = big_table["user_id"].unique()
    folds = KFold(n_folds, shuffle=True, random_state=42)
    models = []
    test_pred = np.zeros(test_big_table.shape[0])
    scores = []
    for idx, (train_idx, valid_idx) in enumerate(folds.split(train_users)):
        t_user = train_users[train_idx]
        v_user = train_users[valid_idx]
        train_data = big_table[big_table["user_id"].isin(t_user)]
        valid_data = big_table[big_table["user_id"].isin(v_user)]
        train_group = train_data.groupby(
            "user_id", as_index=False).count()["satisfied"].values
        valid_group = valid_data.groupby(
            "user_id", as_index=False).count()["satisfied"].values
        test_group = test_big_table.groupby(
            "user_id", as_index=False).count()["jd_no"].values

        result = feature_select(target, train_data, valid_data, test_big_table)
        t_x, t_y = result[0]
        v_x, v_y = result[1]
        test_x, _ = result[2]
        model = lgb.LGBMRanker(**param)
        print("Fold", idx, "-" * 30)
        model.fit(
            t_x,
            t_y,
            group=train_group,
            eval_set=[(t_x, t_y), (v_x, v_y)],
            eval_group=[train_group, valid_group],
            early_stopping_rounds=100,
            verbose=10,
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.01)])
        models.append(model)
        test_pred += model.predict(test_x) / n_folds
        scores.append(model.best_score_["valid_1"]["ndcg@1"])
    print("mean score", np.mean(scores))
    return models, test_pred
Example #19
 def test_cv(self):
     lgb_train, _ = template.test_template(return_data=True)
     # shuffle = False
     lgb.cv({'verbose': -1},
            lgb_train,
            num_boost_round=10,
            nfold=3,
            shuffle=False,
            metrics='l1',
            verbose_eval=False)
     # shuffle = True, callbacks
     lgb.cv({'verbose': -1},
            lgb_train,
            num_boost_round=10,
            nfold=3,
            shuffle=True,
            metrics='l1',
            verbose_eval=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
            ])
     # self defined data_splitter
     tss = TimeSeriesSplit(3)
     lgb.cv(
         {'verbose': -1},
         lgb_train,
         num_boost_round=10,
         data_splitter=tss,
         nfold=5,  # test if wrong nfold is ignored
         metrics='l2',
         verbose_eval=False)
     # lambdarank
     X_train, y_train = load_svmlight_file(
         '../../examples/lambdarank/rank.train')
     q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
     params = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train, params=params)
     lgb.cv(params,
            lgb_train,
            num_boost_round=20,
            nfold=3,
            metrics='l2',
            verbose_eval=False)
Example #20
def train_lgb_rank_model(ranker, train_data, test_data):
    """Fit an LGBMRanker with early stopping and a decaying learning rate."""
    X_train, y_train, q_train = train_data
    X_test, y_test, q_test = test_data

    ranker.fit(
        X_train,
        y_train,
        group=q_train,
        eval_set=[(X_test, y_test)],
        eval_group=[q_test],
        eval_at=[1, 3],
        early_stopping_rounds=5,
        verbose=True,
        callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.9**x * 0.1)],
    )

    return ranker
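A hypothetical invocation (all names assumed), where each data argument is a (features, labels, query_groups) triple:

ranker = lgb.LGBMRanker(n_estimators=100)
ranker = train_lgb_rank_model(ranker,
                              (X_train, y_train, q_train),
                              (X_test, y_test, q_test))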
Example #21
    def train(self):
        global MODEL_PATH

        lgb_train, lgb_test = self.samples()

        def learning_rate(epoch, span=100):
            cycle = [0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.005]
            lr = cycle[(epoch // span) % len(cycle)]
            print(f"LEARN RATE = {lr}")
            return lr

        gbm = lgb.train(
            self.params,
            lgb_train,
            num_boost_round=4000,  # +oo
            valid_sets=lgb_test,
            # init_model=MODEL_PATH,
            early_stopping_rounds=500,  # 5000
            callbacks=[lgb.reset_parameter(learning_rate=learning_rate)],
        )
        gbm.save_model(MODEL_PATH)
        self.load()
Example #22
    def test_joblib(self):
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=10,
                                objective=custom_asymmetric_obj,
                                silent=True,
                                importance_type='split')
        gbm.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            eval_metric=mse,
            early_stopping_rounds=5,
            verbose=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))
            ])

        joblib.dump(gbm, 'lgb.pkl')  # test model with custom functions
        gbm_pickle = joblib.load('lgb.pkl')
        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
        np.testing.assert_array_equal(gbm.feature_importances_,
                                      gbm_pickle.feature_importances_)
        self.assertAlmostEqual(gbm_pickle.learning_rate, 0.1)
        self.assertTrue(callable(gbm_pickle.objective))

        for eval_set in gbm.evals_result_:
            for metric in gbm.evals_result_[eval_set]:
                np.testing.assert_allclose(
                    gbm.evals_result_[eval_set][metric],
                    gbm_pickle.evals_result_[eval_set][metric])
        pred_origin = gbm.predict(X_test)
        pred_pickle = gbm_pickle.predict(X_test)
        np.testing.assert_allclose(pred_origin, pred_pickle)
Example #23
 def test_cv(self):
     X, y = load_boston(True)
     X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False,
            metrics='l1', verbose_eval=False)
     # shuffle = True, callbacks
     lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
            metrics='l1', verbose_eval=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
     # self defined folds
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds, stratified=False, verbose_eval=False)
     # lambdarank
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, stratified=False, metrics='l2', verbose_eval=False)
Example #24
    for train_index, val_index in kf.split(dataset_values, dataset_target):

        i += 1
        val_target = dataset_target.iloc[val_index]
        val_val = dataset_values.iloc[val_index]

        train_target = dataset_target.iloc[train_index]
        train_val = dataset_values.iloc[train_index]

        # add columns that are missing from the validation set
        for col in train_val.columns:
            if col not in val_val.columns:
                val_val[col] = 0
        val_val = val_val[train_val.columns]

        clf = lgb.LGBMClassifier()
        clf.set_params(**params)
        clf.fit(train_val.values, train_target,
                callbacks=[lgb.reset_parameter(learning_rate=lr_decay)])
        y_pred = clf.predict(val_val)
        correct = len([i for i, j in zip(y_pred, val_target) if i == j])
        ratio = correct/len(val_target)*100
        print(f"Accuracy f-{i}: {ratio:.3f}")
        preds += array(clf.predict_proba(test_val))

    preds = argmax(preds, axis=1)
    submission = pd.read_csv(PATH_DATA + 'SubmissionFormat.csv')
    labels = ["non functional", "functional needs repair", "functional"]
    submission['status_group'] = list(map(lambda x: labels[x], preds))
    submission.to_csv(PATH_DATA + "submission.csv", index=False)
Example #25
from typing import Tuple

import lightgbm as lgb
import numpy as np

from optuna import structs
from optuna import study as study_module
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from optgbm.sklearn import OGBMClassifier
from optgbm.sklearn import OGBMRegressor
from optgbm.sklearn import _VotingBooster

n_estimators = 10
n_trials = 5
random_state = 0
callback = lgb.reset_parameter(
    learning_rate=lambda iteration: 0.05 * (0.99**iteration))
early_stopping_rounds = 3


def log_likelihood(y_true: np.ndarray,
                   y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    y_pred = 1.0 / (1.0 + np.exp(-y_pred))

    return y_pred - y_true, y_pred * (1.0 - y_pred)


def zero_one_loss(y_true: np.ndarray,
                  y_pred: np.ndarray) -> Tuple[str, np.number, bool]:
    return "zero_one_loss", np.mean(y_true != y_pred), False

Example #26
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finished 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished 30 - 40 rounds with changing bagging_fraction...')


# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
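In the pre-4.0 lgb.train API used throughout this script, such a function is passed via the fobj argument; a hedged sketch of the continuation:

gbm = lgb.train(params, lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,  # custom objective defined above
                valid_sets=lgb_eval)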

Example #27
def test_param(lgbm_param):
    print('in test_param')
    gc.collect()
    global y_train_new, X_train_new, y_val_new, X_val_new

    #    val_num = int(0.2*len(train_X))
    #    X_train_new = train_X.iloc[val_num:,:]
    #    y_train_new = train_y[val_num:]
    #    X_val_new = train_X.iloc[:val_num,:]
    #    y_val_new = train_y.iloc[:val_num]

    #    print('start train_test_split')
    #    X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(train_X,
    #                train_y, test_size=0.25, random_state=SEED)

    start_t = time.time()
    lgbm = lgb.LGBMClassifier(**lgbm_param)
    learning_rate_func = lgb.reset_parameter(
        learning_rate=generate_learning_rate_list())
    print('start partial training')
    lgbm.fit(
        X_train_new,
        y_train_new,
        eval_set=[(X_train_new, y_train_new), (X_val_new, y_val_new)],
        callbacks=[learning_rate_func],
        #            eval_metric=log_loss_def,
        eval_metric='logloss',
        #            eval_metric='auc',
        verbose=100,
        early_stopping_rounds=300)
    print('partial fit cost time: ', time.time() - start_t)

    best_iteration = lgbm.best_iteration_
    #    print('best score value is ', lgbm.best_score_)
    #    logloss_val = round(lgbm.best_score_['valid_1']['auc'], 5)
    logloss_val = round(lgbm.best_score_['valid_1']['binary_logloss'], 5)

    val_click_prob = lgbm.predict_proba(X_val_new)[:, 1]
    #    val_click_prob_new = managed_change(val_click_prob)
    print('after managed_change logloss is ',
          log_loss(y_val_new, val_click_prob),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.01)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.02)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.05)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.08)),
          log_loss(y_val_new, managed_change(val_click_prob, ratio=0.10)))

    print('after managed_change auc is ',
          roc_auc_score(y_val_new, val_click_prob),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.01)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.02)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.05)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.08)),
          roc_auc_score(y_val_new, managed_change(val_click_prob, ratio=0.10)))

    start_t = time.time()
    prediction_click_prob = lgbm.predict_proba(test_X)[:, 1]
    outcome_df['predicted_score'] = prediction_click_prob

    param_md5_str = convert_2_md5(lgbm_param)
    store_path = 'C:/D_Disk/data_competition/xunfei_ai_ctr/outcome/'
    partial_file_name = '_'.join(
        ['submission_partial',
         str(logloss_val), param_md5_str]) + '.csv'
    full_file_name = '_'.join(
        ['submission_full', str(logloss_val), param_md5_str]) + '.csv'

    outcome_df['predicted_score'].to_csv(store_path + partial_file_name,
                                         header=['predicted_score'])
    print('partial get predict outcome cost time: ', time.time() - start_t)

    del lgbm
    gc.collect()
    del X_train_new, X_val_new
    gc.collect()
    del y_train_new, y_val_new
    gc.collect()
    for i in range(5):
        gc.collect()


#    start_t = time.time()
#    lgbm_param['n_estimators'] = int(best_iteration*1.0)
#    print('normal full fit n_estimators is ', int(best_iteration*1.0))
#    lgbm = lgb.LGBMClassifier(**lgbm_param)
#    lgbm.fit(train_X, train_y)
#    print('normal full fit cost time: ', time.time()-start_t)
#
#    start_t = time.time()
#    prediction_click_prob = lgbm.predict_proba(test_X)[:,1]
#    outcome_df['predicted_score'] = prediction_click_prob
#    outcome_df['predicted_score'].to_csv(store_path+full_file_name,
#           header=['predicted_score'])
#    print('normal full predict cost time: ', time.time()-start_t)

    start_t = time.time()
    lgbm_param['n_estimators'] = int(best_iteration * 1.1)
    print('extra full fit n_estimators is ', int(best_iteration * 1.1))
    lgbm = lgb.LGBMClassifier(**lgbm_param)

    learning_rate_func = lgb.reset_parameter(
        learning_rate=generate_learning_rate_list()
        [:lgbm_param['n_estimators']])

    lgbm.fit(train_X, train_y, callbacks=[learning_rate_func])
    print('extra full fit cost time: ', time.time() - start_t)

    start_t = time.time()
    prediction_click_prob = lgbm.predict_proba(test_X)[:, 1]
    outcome_df['predicted_score'] = prediction_click_prob
    outcome_df['predicted_score'].to_csv(store_path + full_file_name,
                                         header=['predicted_score'])
    print('extra full predict cost time: ', time.time() - start_t)

    write_to_log('-' * 25, ' md5 value: ', param_md5_str, '-' * 25)
    write_to_log('param: ', lgbm_param)
    write_to_log('best_iteration: ', best_iteration)
    write_to_log('valid logloss: ', logloss_val)
    write_to_log('-' * 80 + '\n')
Example #28
tune_params = {
    'n_estimators': [200, 500, 1000, 2500, 5000],
    'max_depth': sp_randint(4, 12),
    'colsample_bytree': sp_uniform(loc=0.8, scale=0.15),
    'min_child_samples': sp_randint(60, 120),
    'subsample': sp_uniform(loc=0.75, scale=0.25),
    'reg_lambda': [1e-3, 1e-2, 1e-1, 1]
}

fit_params = {
    'early_stopping_rounds': 40,
    'eval_metric': 'accuracy',
    'eval_set': [(X_train, y_train), (X_val, y_val)],
    'verbose': 20,
    'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_power)]
}

lgb_clf = lgb.LGBMClassifier(n_jobs=4, objective='binary', random_state=1)
gs = RandomizedSearchCV(estimator=lgb_clf,
                        param_distributions=tune_params,
                        n_iter=40,
                        scoring='f1',
                        cv=5,
                        refit=True,
                        random_state=1,
                        verbose=True)
lgb_clf = lgb.LGBMClassifier(n_jobs=4,
                             objective='multiclass',
                             random_state=100)
opt_params = {
Example #29
        reg_alpha=0.0,
        reg_lambda=100.0,
        scale_pos_weight=1.0,
        subsample=1.0,
        subsample_freq=1,
        random_state=n_fold)

    clf.fit(
        trn_x,
        trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric='auc',
        verbose=1000,
        early_stopping_rounds=600,
        callbacks=[
            lgb.reset_parameter(
                learning_rate=[200 / (8000 + x) for x in range(10000)])
        ],
        #           categorical_feature=CATEGORICAL_COLUMNS #30
        categorical_feature=small_cat  #30   
    )

    oof_preds[val_idx] = clf.predict_proba(
        val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += rankdata(
        clf.predict_proba(test[feats], num_iteration=clf.best_iteration_)
        [:, 1]) / folds.n_splits / len(sub_preds)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
Example #30
def learning_rate_010_decay_power_0995(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate * np.power(.995, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3


# set up lgb fit parameters
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'mae', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': -1,
            'categorical_feature': 'auto',
            }

#starting parameters
param_test ={
# =============================================================================
#              'num_leaves': sp_randint(100, 1000),
#              'max_depth': sp_randint(1, 10),
#              'min_data_in_leaf': sp_randint(1, 100),
# =============================================================================
# =============================================================================
#              'min_child_samples': sp_randint(100, 1000), 
#              'min_child_weight': sp_uniform(loc=0, scale=1.0),#[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#              'subsample': sp_uniform(loc=0.2, scale=0.8), 
Example #31
    print(f'{i}:{attr} FOLD:{fold}')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    ys.append(y_valid)

    #print(X_train.shape, y_train.shape)
    
    fit_params={"early_stopping_rounds":300, 
                "eval_metric" : evaluate_macroF1_lgb, 
                "eval_set" : [(X_valid,y_valid)],
                'eval_names': ['valid'],
                #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
                'verbose': False,
                'categorical_feature': 'auto'}

    fit_params['callbacks'] = [lgb.reset_parameter(learning_rate=learning_rate_power_0997)]
    
    opt_parameters = {
                      #'colsample_bytree': 0.9221304051471293, 
                      'min_child_samples': 150, 
                      'num_leaves': 2, 
                      #'subsample': 0.9510118790770111, 
                      'class_weight': 'balanced', 
                      'lambda_l1': 1.79,
                      'lambda_l2': 1.71,
                      'num_trees': 2000
                      }
    #clf_final = lgb.LGBMClassifier(**clf.get_params())
    #clf_final.set_params(**opt_parameters)
    clf_final = lgb.LGBMClassifier(bagging_fraction=0.9957236684465528, boosting_type='gbdt',
        class_weight='balanced', colsample_bytree=0.7953949538181928,
Example #32
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finished 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished 30 - 40 rounds with changing bagging_fraction...')


# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess

Example #33
    "random_state": 1337
}

dtrain = lgb.Dataset(X_train, label=y_train)
dvalid = lgb.Dataset(X_test, label=y_test)
evals_result = {}
clf_lgb = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    num_boost_round=500,
    early_stopping_rounds=50,
    verbose_eval=100,
    feature_name=df.iloc[:, 4:].columns.tolist(),
    callbacks=[
        lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)
    ],
    evals_result=evals_result)

y_pred1 = clf_lgb.predict(X_train, num_iteration=clf_lgb.best_iteration)
y_pred1 = np.argmax(y_pred1, axis=1)
y_pred2 = clf_lgb.predict(X_test, num_iteration=clf_lgb.best_iteration)
y_pred2 = np.argmax(y_pred2, axis=1)
score1 = accuracy_score(y_train, y_pred1) * 100
score2 = accuracy_score(y_test, y_pred2) * 100
print("\nLGB Model Report")
print("train {:.2f} | valid {:.2f}".format(float(score1), float(score2)))

# hyperparameters tuning
clf_lgb = lgb.LGBMClassifier(learning_rate=0.1,
                             n_estimators=100,
Example #34
 def test_cv(self):
     lgb_train, _ = template.test_template(return_data=True)
     lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
            metrics='l1', verbose_eval=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])