Example #1
    def train(
        self,
        tr_x: pd.DataFrame,
        tr_y: pd.DataFrame,
        va_x: pd.DataFrame = None,
        va_y: pd.DataFrame = None,
        te_x: pd.DataFrame = None,
    ) -> None:

        # Set up the datasets
        validation = va_x is not None
        lgb_train = optuna_lgb.Dataset(
            tr_x,
            tr_y,
            categorical_feature=self.categorical_features,
            free_raw_data=False,
        )
        if validation:
            lgb_eval = optuna_lgb.Dataset(
                va_x,
                va_y,
                reference=lgb_train,
                categorical_feature=self.categorical_features,
                free_raw_data=False,
            )

        # Set up the hyperparameters
        params = dict(self.params)
        num_round = params.pop("num_boost_round")
        best_params: Dict[str, Any] = dict()
        tuning_history: List[Any] = list()

        # Train
        if validation:
            early_stopping_rounds = params.pop("early_stopping_rounds")
            self.model = optuna_lgb.train(
                params,
                lgb_train,
                num_round,
                valid_sets=[lgb_train, lgb_eval],
                verbose_eval=1000,
                early_stopping_rounds=early_stopping_rounds,
                best_params=best_params,
                tuning_history=tuning_history,
            )
        else:
            self.model = optuna_lgb.train(
                params,
                lgb_train,
                num_round,
                valid_sets=[lgb_train],
                verbose_eval=1000,
                best_params=best_params,
                tuning_history=tuning_history,
            )
        print("Best Params:", best_params)
        with open(f"../output/model/{self.run_fold_name}_best_params.json",
                  "w") as f:
            json.dump(best_params, f, indent=4, separators=(",", ": "))
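Note: examples like this one pass `best_params=` and `tuning_history=` to optuna's stepwise tuner; those keyword arguments existed only in early Optuna releases and were later removed, as were the `verbose_eval` and `early_stopping_rounds` arguments of `train` in LightGBM 4.x. A minimal sketch of the same flow on current versions (assuming Optuna >= 2.0 and LightGBM >= 4.0, with `params`, `lgb_train`, `lgb_eval`, and `num_round` as above) reads the tuned parameters back from the returned booster instead:

import json

import lightgbm as lgb
import optuna.integration.lightgbm as optuna_lgb

def tune_and_dump(params, lgb_train, lgb_eval, num_round, out_path):
    model = optuna_lgb.train(
        params,
        lgb_train,
        num_boost_round=num_round,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),  # replaces early_stopping_rounds=
            lgb.log_evaluation(period=1000),          # replaces verbose_eval=
        ],
    )
    # the stepwise tuner leaves the winning parameters on the booster
    best_params = model.params
    with open(out_path, "w") as f:
        json.dump(best_params, f, indent=4, separators=(",", ": "))
    return model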
Example #2
    def train(
        self,
        X_tr: pd.DataFrame,
        y_tr: pd.Series,
        X_val: Optional[pd.DataFrame] = None,
        y_val: Optional[pd.Series] = None,
        **kwargs,
    ) -> None:
        # Set up the datasets
        is_validation = X_val is not None
        lgb_train = optuna_lgb.Dataset(
            X_tr,
            y_tr,
            categorical_feature=self.categorical_features,
            free_raw_data=False,
        )
        if is_validation:
            lgb_eval = optuna_lgb.Dataset(
                X_val,
                y_val,
                reference=lgb_train,
                categorical_feature=self.categorical_features,
                free_raw_data=False,
            )

        # Set up the hyperparameters
        params = self.params.copy()
        if "num_boost_round" in params.keys():
            num_round = params.pop("num_boost_round")
        elif "n_estimators" in params.keys():
            num_round = params.pop("n_estimators")
        else:
            print(
                "[WARNING] num_round is set to 100: neither `num_boost_round` nor `n_estimators` is in the params"
            )
            num_round = 100

        # Train
        if is_validation:
            early_stopping_rounds = params.pop("early_stopping_rounds")
            self.model = optuna_lgb.train(  # type: ignore
                params,
                lgb_train,
                num_round,
                valid_sets=[lgb_train, lgb_eval],
                verbose_eval=1000,
                early_stopping_rounds=early_stopping_rounds,
                **kwargs,
            )
        else:
            self.model = optuna_lgb.train(  # type: ignore
                params,
                lgb_train,
                num_round,
                valid_sets=[lgb_train],
                verbose_eval=1000,
                **kwargs,
            )
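The trailing **kwargs in this variant is what lets a caller forward tuner-only options without changing the method signature. A hedged usage sketch, assuming an instance `model` of this class and the early Optuna API these examples target (where the tuner accepted a `time_budget` in seconds, as Example #24 below also uses):

model.train(X_tr, y_tr, X_val, y_val, time_budget=600)  # stop tuning after ~10 minutes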
Example #3
 def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
     # Set up the datasets
     validation = va_x is not None
     lgb_train = optuna_lgb.Dataset(
         tr_x,
         tr_y,
         categorical_feature=self.categorical_features,
         free_raw_data=False,
     )
     if validation:
         lgb_eval = optuna_lgb.Dataset(
             va_x,
             va_y,
             reference=lgb_train,
             categorical_feature=self.categorical_features,
             free_raw_data=False,
         )
     # Set up the hyperparameters
     params = dict(self.params)
     num_round = params.pop("num_boost_round")
     best_params, tuning_history = dict(), list()
     # Train
     if validation:
         early_stopping_rounds = params.pop("early_stopping_rounds")
         self.model = optuna_lgb.train(
             params,
             lgb_train,
             num_round,
             valid_sets=[lgb_train, lgb_eval],
             verbose_eval=1000,
             early_stopping_rounds=early_stopping_rounds,
             best_params=best_params,
             tuning_history=tuning_history,
         )
     else:
         self.model = optuna_lgb.train(
             params,
             lgb_train,
             num_round,
             valid_sets=[lgb_train],
             verbose_eval=1000,
             best_params=best_params,
             tuning_history=tuning_history,
         )
     print(f"Best Params: {best_params}")
     with open(
         f"{self.optuna_path}/{self.run_fold_name}_best_params.json", "w"
     ) as f:
         json.dump(best_params, f, indent=4, separators=(",", ": "))
Example #4
def lgb_cv_tune(_train, _test, _target, model_params, train_params, cat_idx,
                fold_schema):
    oof = np.zeros(len(_train))
    predictions = np.zeros(len(_test))

    for fold_idx, (trn_idx, val_idx) in enumerate(fold_schema.split(_train)):
        print('Fold {}/{}'.format(fold_idx + 1, fold_schema.n_splits))
        trn_data = lgb.Dataset(_train.iloc[trn_idx],
                               label=_target.iloc[trn_idx])
        val_data = lgb.Dataset(_train.iloc[val_idx],
                               label=_target.iloc[val_idx])

        # LightGBMTuner
        # Reference:
        # https://gist.github.com/smly/367c53e855cdaeea35736f32876b7416
        best_params = {}
        tuning_history = []

        optuna_lgb.train(model_params,
                         trn_data,
                         num_boost_round=10000,
                         valid_sets=[trn_data, val_data],
                         best_params=best_params,
                         tuning_history=tuning_history,
                         **train_params)

        pd.DataFrame(tuning_history).to_csv(
            dataset_path / 'tuning_history_{}.csv'.format(fold_idx + 1))

        best_params['learning_rate'] = 0.05

        # origin LightGBM Model
        model = lgb.train(best_params,
                          trn_data,
                          num_boost_round=20000,
                          valid_names=['train', 'valid'],
                          valid_sets=[trn_data, val_data],
                          **train_params)

        oof[val_idx] = model.predict(_train.iloc[val_idx],
                                     num_iteration=model.best_iteration)
        print(
            mean_absolute_error(np.expm1(_target.iloc[val_idx]),
                                np.expm1(oof[val_idx])))
        predictions += model.predict(
            _test, num_iteration=model.best_iteration) / fold_schema.n_splits
    print(mean_absolute_error(np.expm1(_target), np.expm1(oof)))

    return predictions
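Tuning inside every fold, as above, multiplies the tuning cost by the number of folds. Optuna also ships a cross-validating variant that tunes once over all folds; a hedged sketch (assumes an Optuna version that provides LightGBMTunerCV, added around 1.5.0; `_train`, `_target`, and `model_params` as in the function above):

import optuna.integration.lightgbm as optuna_lgb
from sklearn.model_selection import KFold

def tune_once_with_cv(_train, _target, model_params, n_splits=5):
    dtrain = optuna_lgb.Dataset(_train, label=_target)
    tuner = optuna_lgb.LightGBMTunerCV(
        model_params,
        dtrain,
        num_boost_round=10000,
        folds=KFold(n_splits=n_splits, shuffle=True, random_state=42),
    )
    tuner.run()  # runs the stepwise search with lgb.cv under the hood
    print("Best score:", tuner.best_score)
    return tuner.best_params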
Example #5
 def tune(self,
          train_features,
          valid_features,
          input_features,
          target,
          categorical_features,
          fit_kwargs=dict()):
     train_features_casted = self.cast_dataframe(train_features,
                                                 input_features, target,
                                                 categorical_features)
     valid_features_casted = (
         self.cast_dataframe(valid_features, input_features, target,
                             categorical_features)
         if valid_features is not None else None
     )
     model_params = dict(self.model_params)
     ntrees_param_key = fetch_param_key(model_params, NTREES_LGB_ALIASES)
     early_stop_param_key = fetch_param_key(model_params,
                                            EARLY_STOP_LGB_ALIASES)
     training_params = {"train_set": train_features_casted}
     if valid_features is not None:
         training_params["valid_sets"] = valid_features_casted
         if early_stop_param_key is not None:
             training_params["early_stopping_rounds"] = model_params.pop(
                 early_stop_param_key)
     elif early_stop_param_key is not None:
         del model_params[early_stop_param_key]
     if ntrees_param_key is not None:
         training_params["num_boost_round"] = model_params.pop(
             ntrees_param_key)
     training_params["params"] = model_params
     # model training
     self.model = olgb.train(**training_params, **fit_kwargs)
     self.best_iteration = (self.model.best_iteration
                            if self.model.best_iteration > 0
                            else self.model.num_trees())
     self.input_features = input_features
     self.target = target
     self.categorical_features = categorical_features
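`fetch_param_key` and the two alias tuples are defined elsewhere in that project; a minimal sketch consistent with how they are used above (the alias lists here are illustrative, drawn from LightGBM's documented parameter aliases):

NTREES_LGB_ALIASES = ("num_boost_round", "n_estimators", "num_iterations",
                      "num_trees", "num_rounds")
EARLY_STOP_LGB_ALIASES = ("early_stopping_rounds", "early_stopping_round",
                          "early_stopping")

def fetch_param_key(params, aliases):
    """Return the first alias present in params, or None if none match."""
    return next((key for key in aliases if key in params), None)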
Example #6
def train_and_predict(X_train, X_valid, y_train, y_valid, X_test, lgbm_params):

    # Create the datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    logging.debug(lgbm_params)

    # Create the logger
    logger = logging.getLogger('main')
    callbacks = [log_evaluation(logger, period=30)]

    best_params, history = {}, []
    # Train the model with the parameters above
    model = lgb.train(
        lgbm_params,
        lgb_train,
        # pass the validation data for model evaluation
        valid_sets=lgb_eval,
        # turn off display of the training progress
        verbose_eval=False,
        # train for at most 1000 rounds
        num_boost_round=1000,
        # stop training when the score has not improved for 10 rounds
        early_stopping_rounds=10,
        # capture the best parameters found
        best_params=best_params,
        tuning_history=history,
        # logging
        callbacks=callbacks)

    # Predict on the test data
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    return y_pred, model
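The `log_evaluation` used here is not LightGBM's built-in callback of the same name (which was added later and prints to stdout); it is a small factory that forwards periodic evaluation results to a `logging.Logger`. A minimal sketch of such a factory, assuming the standard callback protocol in which each tuple of `env.evaluation_result_list` is (dataset name, metric, value, is_higher_better):

def log_evaluation(logger, period=100):
    def _callback(env):
        if period > 0 and (env.iteration + 1) % period == 0:
            result = '\t'.join(
                "{}'s {}: {:g}".format(name, metric, value)
                for name, metric, value, _ in env.evaluation_result_list
            )
            logger.info('[{}]\t{}'.format(env.iteration + 1, result))
    _callback.order = 10  # run after LightGBM's internal callbacks
    return _callback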
Example #7
def objective_lgb(x, y):

    x_train_a, x_valid_a, y_train, y_valid = train_test_split(x, y, train_size=0.8, random_state=5, shuffle=True)

    # keep only the columns with nonzero variance
    keep_cols = x_train_a.std(axis=0, ddof=1) != 0
    x_train = x_train_a.loc[:, keep_cols]
    x_valid = x_valid_a.loc[:, keep_cols]

    autoscaled_x_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
    autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)
    autoscaled_x_valid = (x_valid - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
    autoscaled_y_valid = (y_valid - y_train.mean()) / y_train.std(ddof=1)

    trains = lightgbm.Dataset(autoscaled_x_train.values, autoscaled_y_train.values)
    valids = lightgbm.Dataset(autoscaled_x_valid.values, autoscaled_y_valid.values)

    params = {
        'objective':'mean_squared_error',
        'metric':'rmse'
    }
    best_params, history = {}, []
    model = lgb.train(
        params, trains, valid_sets=valids,
        verbose_eval=False,
        # num_boost_round=100,
        # early_stopping_rounds=5,
        # best_params=best_params,
        # tuning_history=history,
        # force_row_wise=True
    )
    best_params = model.params
    return best_params
Example #8
def run_lgb(train,valid,LOG):
    # lgb_params = {
    #             'n_estimators': 24000,
    #             'objective': 'binary',
    #             'boosting_type': 'gbdt',
    #             'metric': 'auc',
    #             'max_depth': 7,
    #             'learning_rate': 0.2,
    #             'seed': 127,
    #             'early_stopping_rounds': 50
    #         }
    lgb_params = {'objective': 'binary',
              'seed': 127,
              'boosting_type': 'gbdt',
              'metric': 'auc'
             }

    train_x,valid_x = train[USE_COLS],valid[USE_COLS]
    train_y,valid_y = train[TARGET],valid[TARGET]
    del train,valid
    gc.collect()

    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_eval = lgb.Dataset(valid_x, valid_y)
    LOG.info('start lgb train')
    t0 = time.time()

    booster = lgb.train(lgb_params, lgb_train, valid_sets=lgb_eval,verbose_eval=0)
    print(booster.params)
    LOG.info(booster.params)
    LOG.info(f'end lgb train : {time.time() - t0} s')

    with open('./models/optuna_lgb.pkl', 'wb') as f:
        pickle.dump(booster, f)
Example #9
    def learning_race_lgb(self, this_model_name, target):
        # Split the test data into evaluation and validation sets
        X_eval, X_valid, y_eval, y_valid = train_test_split(self.X_test,
                                                            self.y_test,
                                                            random_state=42)

        # Create the datasets
        lgb_train = lgb.Dataset(self.X_train, self.y_train)
        lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

        if self.test_flag:
            num_boost_round = 5
            early_stopping_rounds = 3
        else:
            num_boost_round = 1000
            early_stopping_rounds = 50

        # Train the model with the parameters above
        best_params, history = {}, []
        this_param = self.lgbm_params[target]
        model = lgb.train(this_param,
                          lgb_train,
                          valid_sets=lgb_eval,
                          verbose_eval=False,
                          num_boost_round=num_boost_round,
                          early_stopping_rounds=early_stopping_rounds,
                          best_params=best_params,
                          tuning_history=history)
        print("Bset Paramss:", best_params)
        print('Tuning history:', history)

        self._save_learning_model(model, this_model_name)
Example #10
    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):

        # Set up the datasets
        validation = va_x is not None
        lgb_train = optuna_lgb.Dataset(
            tr_x,
            tr_y,
            categorical_feature=self.categorical_features,
            free_raw_data=False)
        if validation:
            lgb_eval = optuna_lgb.Dataset(
                va_x,
                va_y,
                reference=lgb_train,
                categorical_feature=self.categorical_features,
                free_raw_data=False)

        # Set up the hyperparameters
        params = dict(self.params)
        num_round = params.pop('num_boost_round')
        best_params, tuning_history = dict(), list()

        # Train
        if validation:
            early_stopping_rounds = params.pop('early_stopping_rounds')
            self.model = optuna_lgb.train(
                params,
                lgb_train,
                num_round,
                valid_sets=[lgb_train, lgb_eval],
                verbose_eval=1000,
                early_stopping_rounds=early_stopping_rounds,
                best_params=best_params,
                tuning_history=tuning_history)
        else:
            self.model = optuna_lgb.train(params,
                                          lgb_train,
                                          num_round,
                                          valid_sets=[lgb_train],
                                          verbose_eval=1000,
                                          best_params=best_params,
                                          tuning_history=tuning_history)
        print('Best Params:', best_params)
        with open(f'../output/model/{self.run_fold_name}_best_params.json',
                  'w') as f:
            json.dump(best_params, f, indent=4, separators=(',', ': '))
Example #11
 def train(self, tr_x, tr_y, va_x=None, va_y=None):
     # Set up the datasets
     validation = va_x is not None
     lgb_train = optuna_lgb.Dataset(
         tr_x,
         tr_y,
         categorical_feature=self.categorical_features,
         free_raw_data=False,
     )
     lgb_eval = None
     if validation:
         lgb_eval = optuna_lgb.Dataset(
             va_x,
             va_y,
             reference=lgb_train,
             categorical_feature=self.categorical_features,
             free_raw_data=False,
         )
     # Set up the hyperparameters
     params = dataclasses.asdict(self.params)
     num_round = params.pop("num_boost_round")
     # Train
     if validation:
         early_stopping_rounds = params.pop("early_stopping_rounds")
         self.model = optuna_lgb.train(
             params,
             lgb_train,
             num_round,
             valid_sets=[lgb_train, lgb_eval],
             verbose_eval=500,
             early_stopping_rounds=early_stopping_rounds,
         )
     else:
         self.model = optuna_lgb.train(
             params,
             lgb_train,
             num_round,
             valid_sets=[lgb_train],
             verbose_eval=500,
         )
     best_params = self.model.params
     Logger().info(f"Optuna Best Params: {best_params}")
     with open(f"{ModelPath.optuna}/{self.run_fold_name}_best_params.json",
               "w") as f:
         json.dump(best_params, f, indent=4, separators=(",", ": "))
Example #12
def train(X_train, X_eval, y_train, y_eval) -> Booster:
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
    }
    model = lgb.train(lgb_params, lgb_train,
                      valid_sets=lgb_eval, early_stopping_rounds=20,
                      num_boost_round=300, verbose_eval=False)
    return model
Example #13
    def fit(self,
            tr_x,
            tr_y,
            va_x=None,
            va_y=None,
            cat_features=None,
            feval=None):
        if cat_features is not None:
            tr_x[cat_features] = tr_x[cat_features].astype('category')
            if va_x is not None:
                va_x[cat_features] = va_x[cat_features].astype('category')

        validation = va_x is not None
        lgb_train = lgb.Dataset(tr_x, tr_y, categorical_feature=cat_features)
        valid_sets = [lgb_train]
        if validation:
            lgb_eval = lgb.Dataset(va_x,
                                   va_y,
                                   reference=lgb_train,
                                   categorical_feature=cat_features)
            valid_sets.append(lgb_eval)

        callbacks = [self._log_evaluation(period=100)]

        if self.cfg.task_type in ['regression', 'classification']:
            self.model = lgb.train(
                self.params,
                lgb_train,
                num_boost_round=self.cfg.num_boost_round,
                valid_sets=valid_sets,
                verbose_eval=self.cfg.verbose_eval,
                early_stopping_rounds=self.cfg.early_stopping_rounds,
                callbacks=callbacks,
                feval=feval)
        elif self.cfg.task_type == 'optuna':
            self.model = lgb_tuner.train(
                self.params,
                lgb_train,
                num_boost_round=self.cfg.num_boost_round,
                valid_sets=valid_sets,
                best_params=self.best_params,
                tuning_history=self.tuning_history,
                verbose_eval=self.cfg.verbose_eval,
                early_stopping_rounds=self.cfg.early_stopping_rounds,
                callbacks=callbacks)

            print('Number of finished trials: {}'.format(
                len(self.tuning_history)))
            print('Best params:', self.best_params)
            print('  Params: ')
            for key, value in self.best_params.items():
                print('    {}: {}'.format(key, value))
Example #14
    def fit(self,
            df_train,
            splitter=SequenceSplitter(test_rate=0.1),
            categorical_columns=[],
            parameters=None):
        if parameters is None:
            self.parameters = get_lgbm_default_parameters(self.objective_var)
        else:
            self.parameters = parameters
        if self.metric is None:
            if self.objective_var == 'binary_class':
                self.parameters['metric'] = 'binary_logloss'
            elif self.objective_var == 'multi_class':
                self.parameters['metric'] = 'multi_logloss'
            elif self.objective_var == 'regression':
                self.parameters['metric'] = 'l2'
        else:
            self.parameters['metric'] = self.metric

        df_splitted_train, df_valid = splitter.split(df_train)
        X_train = (df_splitted_train.drop(self.target_column, axis=1)
                   if self.target_column in df_splitted_train.columns
                   else df_splitted_train)
        X_valid = (df_valid.drop(self.target_column, axis=1)
                   if self.target_column in df_valid.columns
                   else df_valid)
        X_train = X_train.fillna(-1)
        X_valid = X_valid.fillna(-1)

        train_data = lgb.Dataset(
            X_train,
            label=df_splitted_train[self.target_column].values.tolist(),
            categorical_feature=categorical_columns)

        val_data = lgb.Dataset(
            X_valid,
            label=df_valid[self.target_column].values.tolist(),
            categorical_feature=categorical_columns)
        if self.use_optuna:
            self.model = optuna_lgb.train(self.parameters,
                                          train_data,
                                          valid_sets=val_data,
                                          verbose_eval=0)
        else:
            self.model = lgb.train(self.parameters,
                                   train_data,
                                   valid_sets=val_data,
                                   verbose_eval=100,
                                   num_boost_round=10000,
                                   early_stopping_rounds=50)
Example #15
def train_with_lightgbm(X_train: pd.DataFrame,
                        y_train: pd.Series,
                        X_valid: pd.DataFrame,
                        y_valid: pd.Series,
                        params: Dict[str, Any],
                        tune: bool = False,
                        **kwargs) -> lgb.Booster:
    """
    Function to train lightgbm model.

    Args:
        X_train (pd.DataFrame): Training Data.
        y_train (pd.Series): Target for train.
        X_valid (pd.DataFrame): Validation Data.
        y_valid (pd.Series): Target for validation.
        params (Dict[str, Any]): LightGBM parameters.
        tune (bool, optional): Whether to run tuning. Defaults to False.

    Returns:
        lgb.Booster: Trained model.
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)

    if not tune:
        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train", "valid"],
            **kwargs,
        )
    else:
        model = lgb_tuner.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train", "valid"],
            **kwargs,
        )
    return model
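A hedged usage sketch for the function above, on scikit-learn's breast-cancer data (the parameter values are illustrative; extra keyword arguments are forwarded untouched to lgb.train or the tuner):

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

raw = load_breast_cancer()
X = pd.DataFrame(raw.data, columns=raw.feature_names)
X.columns = X.columns.str.replace(" ", "_")  # LightGBM warns on whitespace in names
y = pd.Series(raw.target)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

model = train_with_lightgbm(
    X_tr, y_tr, X_va, y_va,
    params={"objective": "binary", "metric": "binary_logloss", "verbosity": -1},
    tune=False,           # flip to True to route through the optuna tuner
    num_boost_round=100,  # forwarded via **kwargs
)
print(model.num_trees())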
Example #16
def hyper_tuning(cv_dict):
    """
    Cross-validation training of a LightGBM model for the purpose of hyperparameter tuning.

    :param cv_dict: dictionary
        Collected dictionary of X_train, y_train, X_test, y_test for the given week folds
    :return: lightgbm model
        Cross-validation trained LightGBM model
    """
    import optuna.integration.lightgbm as lightgb

    dtrain = lightgb.Dataset(cv_dict['X_train'][0],
                             label=cv_dict['y_train'][0])
    X_test0, y_test0 = downsample(cv_dict['X_test'][0], cv_dict['y_test'][0])
    dval0 = lightgb.Dataset(X_test0, label=y_test0)
    X_test1, y_test1 = downsample(cv_dict['X_test'][1], cv_dict['y_test'][1])
    dval1 = lightgb.Dataset(X_test1, label=y_test1)
    X_test2, y_test2 = downsample(cv_dict['X_test'][2], cv_dict['y_test'][2])
    dval2 = lightgb.Dataset(X_test2, label=y_test2)
    X_test3, y_test3 = downsample(cv_dict['X_test'][3], cv_dict['y_test'][3])
    dval3 = lightgb.Dataset(X_test3, label=y_test3)

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    # cross-validation of lightgb model
    lgb_clf = lightgb.train(params,
                            dtrain,
                            categorical_feature=[
                                'shopper', 'product', 'category', 'coupon',
                                'coupon_in_same_category'
                            ],
                            valid_sets=[dval0, dval1, dval2, dval3],  # dval3 is created above, so include it too
                            verbose_eval=100,
                            early_stopping_rounds=100)
    return lgb_clf
Example #17
    def learning_race_lgb(self, this_model_name):
        # Split the test data into evaluation and validation sets
        X_eval, X_valid, y_eval, y_valid = train_test_split(self.X_test,
                                                            self.y_test,
                                                            random_state=42)

        # Create the datasets
        lgb_train = lgb.Dataset(self.X_train, self.y_train)
        lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

        # Train the model with the parameters above
        model = lgb.train(
            self.lgbm_params,
            lgb_train,
            # pass the validation data for model evaluation
            valid_sets=lgb_eval,
            # train for at most 1000 rounds
            num_boost_round=1000,
            # stop training when the score has not improved for 10 rounds
            early_stopping_rounds=10)

        self._save_learning_model(model, this_model_name)
Example #18
def objective(trial):
    X, y = ember.read_vectorized_features('./sample/merge', 20000, 3154)

    train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.4, random_state=777)
    valid_x, test_x, valid_y, test_y = train_test_split(val_x, val_y, test_size=0.5, random_state=777)
    sc = StandardScaler()
    train_x = sc.fit_transform(train_x)
    valid_x = sc.transform(valid_x)
    test_x = sc.transform(test_x)

    train_data_set = lgb.Dataset(train_x, train_y)
    valid_data_sets = lgb.Dataset(valid_x, valid_y)

    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        "verbosity": -1,
        "boosting_type": "gbdt",
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        # 'num_leaves': 2048,  # total number of leaves in the tree, default is 31
        # 'max_depth': 16,  # maximum tree depth
        # 'min_data_in_leaf': 1000,  # minimum number of records per leaf; the default of 20 is optimal
        # 'num_iterations': 1000,  # 1000 -> 1500
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }

    gbm = lgb.train(param, train_data_set, valid_sets=[valid_data_sets], verbose_eval=False)
    pred_y = gbm.predict(test_x)
    y_pred = np.where(np.array(pred_y) > 0.7, 1, 0)
    accuracy = sklearn.metrics.accuracy_score(test_y, y_pred)
    return accuracy
Example #19
    # (fragment: the opening of this snippet was cut off; it assumes train/validation
    #  splits `train_x`/`train_y` and `val_x`/`val_y` created earlier)
    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(val_x, label=val_y)

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    best_params, tuning_history = dict(), list()

    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        best_params=best_params,
        tuning_history=tuning_history,
        verbose_eval=100,
        early_stopping_rounds=100,
    )

    prediction = np.rint(model.predict(val_x, num_iteration=model.best_iteration))
    accuracy = accuracy_score(val_y, prediction)

    print("Number of finished trials: {}".format(len(tuning_history)))
    print("Best params:", best_params)
    print("  Accuracy = {}".format(accuracy))
    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))
Example #20
def train(data: pd.DataFrame, cols_to_keep: Optional[Tuple[str, ...]] = None, filter_by_vif: bool = False) -> Tuple[
    linear_model.Lasso, LGBMRegressor, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray, np.ndarray
]:
    """
    Trains model(s) on feature data to predict user ratings.
    Currently used models:
        - Linear regression.
        - LightGBM
    :param data: Dataframe to predict.
    :param cols_to_keep: List of columns to use in model.
    :param filter_by_vif: Whether to filter columns by their VIF score.
    :return: Trained lasso estimator, trained LGBM estimator, dataframe with evaluation data, dataframe with selected
    features, test feature set, test labels, entire feature set, all labels.
    """

    target: str = "rating"
    data = data.drop(columns="separability_metric", errors="ignore")

    if filter_by_vif and len(data.columns) > 2:
        rating: pd.Series = data.rating
        data = filter_features_by_vif(data.drop(columns="rating"))
        data["rating"] = rating

    if cols_to_keep:
        data = data[[*[col for col in cols_to_keep if col in data.columns], "rating"]]
    metrics: List[Dict] = []

    # 1. With linear regression.
    print("=== Linear regression ===")
    n_splits: int = 100
    lasso_estimator: Optional[linear_model.Lasso] = None
    pbar: tqdm = tqdm(total=n_splits)
    for train_indices, test_indices in ShuffleSplit(n_splits=n_splits, test_size=.2).split(data):
        features: np.ndarray = data.drop(columns=target).values

        # Split in train and test set.
        train_feats: np.ndarray = features[train_indices, :]
        train_labels: np.ndarray = data[[target]].values[train_indices, :]
        test_feats: np.ndarray = features[test_indices, :]
        test_labels: np.ndarray = data[[target]].values[test_indices, :]

        # Normalize features.
        scaler: StandardScaler = StandardScaler()
        scaler.fit(train_feats)
        train_feats = scaler.transform(train_feats)
        test_feats = scaler.transform(test_feats)

        lasso_estimator = linear_model.Lasso(alpha=0.015, max_iter=2000)

        # Train model.
        lasso_estimator.fit(train_feats, train_labels)
        test_labels_predicted: np.ndarray = lasso_estimator.predict(test_feats)
        # print('Coefficients: \n', estimator.coef_)

        metrics.append({
            "model": "lasso",
            "mean_squared_error": mean_squared_error(test_labels, test_labels_predicted),
            "mean_absolute_error": mean_absolute_error(test_labels, test_labels_predicted),
            "median_absolute_error": median_absolute_error(test_labels, test_labels_predicted),
            "explained_variance": explained_variance_score(test_labels, test_labels_predicted)
        })

        pbar.update(1)
    pbar.close()

    # 2. With boosting (LightGBM).
    n_splits: int = 20
    print("=== Boosting ===")
    pbar: tqdm = tqdm(total=n_splits)
    best_params: dict = dict()
    tuning_history: list = list()
    lgbm_estimator: Optional[optuna_lgbm.Booster] = None
    test_feats: Optional[np.ndarray] = None
    test_labels: Optional[np.ndarray] = None
    cols: list = data.drop(columns=target).columns

    for train_indices, test_indices in ShuffleSplit(n_splits=n_splits, test_size=.2).split(data):
        features: np.ndarray = data.drop(columns=target).values

        # Split in train and test set.
        train_feats: np.ndarray = features[train_indices, :]
        train_labels: np.ndarray = data[[target]].values[train_indices, :]
        test_feats: np.ndarray = features[test_indices, :]
        test_labels: np.ndarray = data[[target]].values[test_indices, :]

        scaler: StandardScaler = StandardScaler()
        scaler.fit(train_feats)
        train_feats = scaler.transform(train_feats)
        test_feats = scaler.transform(test_feats)

        # `True or ...` forces retuning on every split; the cached-params branch
        # below is effectively disabled.
        if True or not len(best_params):
            # Split train set in train and validation set.
            train_feats, val_feats, train_labels, val_labels = train_test_split(
                train_feats, train_labels, test_size=0.2
            )

            dtrain: lgbm.Dataset = lgbm.Dataset(
                pd.DataFrame(train_feats, columns=cols),
                label=train_labels.ravel().tolist(),
                params={'verbose': -1}
            )
            dval: lgbm.Dataset = lgbm.Dataset(
                pd.DataFrame(val_feats, columns=cols),
                label=val_labels.ravel().tolist(),
                params={'verbose': -1}
            )

            params: dict = {
                "objective": "regression",
                "metric": "l2",
                "verbosity": -1,
                "verbose": -1,
                "silent": True,
                "boosting_type": "gbdt",
            }

            lgbm_estimator: optuna_lgbm.Booster = optuna_lgbm.train(
                params,
                dtrain,
                valid_sets=[dtrain, dval],
                early_stopping_rounds=100,
                verbosity=-1,
                verbose_eval=False,
                best_params=best_params,
                tuning_history=tuning_history
            )
            test_labels_predicted: np.ndarray = lgbm_estimator.predict(
                pd.DataFrame(test_feats, columns=cols), num_iteration=lgbm_estimator.best_iteration
            )
        else:
            lgbm_estimator: lgbm.LGBMRegressor = lgbm.LGBMRegressor(**best_params)
            lgbm_estimator.fit(train_feats, train_labels)
            test_labels_predicted: np.ndarray = lgbm_estimator.predict(test_feats)

        metrics.append({
            "model": "lgbm",
            "mean_squared_error": mean_squared_error(test_labels, test_labels_predicted),
            "mean_absolute_error": mean_absolute_error(test_labels, test_labels_predicted),
            "median_absolute_error": median_absolute_error(test_labels, test_labels_predicted),
            "explained_variance": explained_variance_score(test_labels, test_labels_predicted)
        })

        pbar.update(1)
    pbar.close()

    return lasso_estimator, lgbm_estimator, pd.DataFrame(metrics), data, test_feats, test_labels, \
           data.drop(columns=target).values, data[target].values
Example #21
target_column = ["target_ord"]
X_train, X_val, y_train, y_val = train_test_split(
    df[feature_columns], df[target_column],
    test_size=0.3, random_state=42,
    stratify=df[target_column]
)
dtrain = lgb_org.Dataset(X_train, y_train)
dval = lgb_org.Dataset(X_val, y_val)
params = dict(
    objective="multiclass",
    metric="multi_logloss",
    num_class=9,
    seed=42,
)

best_params, tuning_history = dict(), list()
booster = lgb.train(params, dtrain, valid_sets=dval,
                    verbose_eval=0,
                    best_params=best_params,
                    early_stopping_rounds=5,
                    tuning_history=tuning_history)
 
print("Best Params:", best_params)
print("Tuning history:", tuning_history)
df_test = read_csv(str(base_dir / "test.csv"))
pred = booster.predict(df_test[feature_columns])
for idx, col in order_to_class.items():
    df_test[col] = pred[:,idx]
df_test[["id"] + [f"Class_{i}" for i in range(1, 10)]].to_csv('submission.csv', index=False)

Example #22
def train_lgbm(df, cfg, retrain=False):
    # train_validation split

    cwd = hydra.utils.get_original_cwd()

    df_calendar = pd.read_csv(
        os.path.join(cwd, "../input/m5-forecasting-accuracy/calendar.csv"))
    df_prices = pd.read_csv(
        os.path.join(cwd, "../input/m5-forecasting-accuracy/sell_prices.csv"))

    df_sales = pd.read_csv(
        os.path.join(
            cwd,
            "../input/m5-forecasting-accuracy/sales_train_evaluation.csv"))

    cat_feats = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
    cat_feats.extend([
        "event_name_1", "event_name_2", "event_type_1", "event_type_2", "wday",
        "month", "year", "snap_flag"
    ])

    useless_cols = [
        "id", "date", "sales", "d", "wm_yr_wk", "weekday", "state_name",
        "snap_CA", "snap_TX", "snap_WI"
    ]

    if cfg.lgbm.optuna_tuning:
        import optuna.integration.lightgbm as lgb
    else:
        import lightgbm as lgb

    fold_val_scores = dict()
    """
    2016/3/24 ~ 2016/4/24 : public lb
    2016/2/24 ~ 2016/3/24 : fold1 validation set
    2016/1/24 ~ 2016/2/24 : fold2 validation set
    """

    for fold_idx in range(1, 1 + n_folds, 1):
        print("*" * 20)
        print(f"fold {fold_idx}...")
        print("*" * 20)

        val_firstdate = dev_lastdate - timedelta(days=val_days * fold_idx)
        val_lastdate = dev_lastdate - timedelta(days=val_days * (fold_idx - 1))
        train_lastdate = val_firstdate - timedelta(1)

        print("train period:", dev_firstdate.date(), "~",
              train_lastdate.date())
        print("validation period:", val_firstdate.date(), "~",
              val_lastdate.date())

        train_df = df.query("date < @val_firstdate")
        val_df = df.query("@val_lastdate >= date > @train_lastdate")

        val_df_wrmsse = df_sales.iloc[:, -28:]

        # wrmsse_evaluator = eval_metrics.WRMSSEEvaluator(df_sales.iloc[:, :-28],
        #                                                 val_df_wrmsse,
        #                                                 calendar=df_calendar,
        #                                                 prices=df_prices,
        #                                                 val_firstdate=date2d(val_firstdate),
        #                                                 val_lastdate=date2d(val_lastdate),
        #                                                 converted_val_df=val_df,
        #                                                 )

        # NOTE: `df` is queried again at the top of later folds, so it must not
        # be deleted inside the loop (`del df` here would raise a NameError on
        # the second fold).

        print(min(train_df["date"]), max(train_df["date"]))
        print(min(val_df["date"]), max(val_df["date"]))

        # NOTE: dropna(inplace=True) on a slice does not modify train_df itself
        train_df[:500].dropna(inplace=True)
        train_cols = train_df.columns[~train_df.columns.isin(useless_cols)]

        train_data = lgb.Dataset(train_df[train_cols],
                                 label=train_df["sales"],
                                 free_raw_data=False)
        val_data = lgb.Dataset(val_df[train_cols],
                               label=val_df["sales"],
                               free_raw_data=False)

        del train_df
        gc.collect()

        lgbm_params = {}
        for k, v in cfg.lgbm.model_params.items():
            if isinstance(v, ListConfig):
                lgbm_params[k] = list(v)
            else:
                lgbm_params[k] = v
        print(lgbm_params)

        if cfg.lgbm.optuna_tuning:
            best_params, tuning_hist = dict(), list()
            m_lgb = lgb.train(
                lgbm_params,
                train_data,
                valid_sets=[train_data, val_data],
                num_boost_round=cfg.lgbm.train_params.num_boost_round,
                early_stopping_rounds=cfg.lgbm.train_params.
                early_stopping_rounds,
                categorical_feature=cat_feats,
                verbose_eval=0,
                # feval=wrmsse,
                best_params=best_params,
                tuning_history=tuning_hist)

            print(best_params)
            print(tuning_hist)
        else:

            m_lgb = lgb.train(
                lgbm_params,
                train_data,
                # valid_sets=[train_data, val_data],
                valid_sets=[val_data],
                num_boost_round=cfg.lgbm.train_params.num_boost_round,
                early_stopping_rounds=cfg.lgbm.train_params.
                early_stopping_rounds,
                categorical_feature=cat_feats,
                verbose_eval=10,
            )
            # feval=wrmsse_evaluator.wrmsse_metric_lgbm)

            m_lgb.save_model(os.path.join(cwd,
                                          f"../result/fold{fold_idx}.lgb"))

            val_pred = m_lgb.predict(val_df[train_cols].values,
                                     num_iteration=m_lgb.best_iteration)

            # _, val_score, _ = wrmsse_evaluator.wrmsse_metric_lgbm(val_pred, val_df[train_cols])
            # print(f"VAL WRMSSE:{val_score}")

            # fold_val_scores[fold_idx] = val_score

            del val_df
            gc.collect()

    #     m_lgb.save_model(f"../result/targetencoding_fullmodel_fold{fold_idx}.lgb")
    # model_savepath = os.path.join(hydra.utils.get_original_cwd(), "../result/no_fe_fold{fold_idx}.lgb")
    # m_lgb.save_model(model_savepath)

    importance = pd.DataFrame(m_lgb.feature_importance(),
                              index=train_cols,
                              columns=['importance']).sort_values(
                                  "importance", ascending=False)

    # importance.to_csv("")

    return m_lgb, fold_val_scores, train_cols
Example #23
if __name__ == "__main__":
    data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, val_x, train_y, val_y = train_test_split(data,
                                                      target,
                                                      test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(val_x, label=val_y)
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    model = optuna_lgb.train(params,
                             dtrain,
                             valid_sets=[dtrain, dval],
                             verbose_eval=100,
                             early_stopping_rounds=100)

    prediction = np.rint(
        model.predict(val_x, num_iteration=model.best_iteration))
    accuracy = accuracy_score(val_y, prediction)

    best_params = model.params
    print("Best params:", best_params)
    print("  Accuracy = {}".format(accuracy))
    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))
Example #24
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
                             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                             groups: Optional[pd.Series] = None,
                             time_budget: Optional[int] = None,
                             type_of_target: str = 'auto') -> Dict:
    """
    Search hyperparameter for lightgbm using optuna.

    Args:
        base_param:
            Base parameters passed to lgb.train.
        X:
            Training data.
        y:
            Target variable.
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
        groups:
            Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
        time_budget:
            Time budget for tuning (in seconds).
        type_of_target:
            The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
            Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.

    Returns:
        The best parameters found
    """
    cv = check_cv(cv, y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    train_index, test_index = next(cv.split(X, y, groups))

    dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])

    params = copy.deepcopy(base_param)
    if 'early_stopping_rounds' not in params:
        params['early_stopping_rounds'] = 100

    if not any([p in params for p in ('num_iterations', 'num_iteration',
                                      'num_trees', 'num_tree',
                                      'num_rounds', 'num_round')]):
        params['num_iterations'] = params.get('n_estimators', 10000)

    if 'objective' not in params:
        tot_to_objective = {
            'binary': 'binary',
            'continuous': 'regression',
            'multiclass': 'multiclass'
        }
        params['objective'] = tot_to_objective[type_of_target]

    if 'metric' not in params and 'objective' in params:
        if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse', 'l2_root',
                                   'root_mean_squared_error', 'rmse']:
            params['metric'] = 'l2'
        if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
            params['metric'] = 'l1'
        if params['objective'] in ['binary']:
            params['metric'] = 'binary_logloss'
        if params['objective'] in ['multiclass']:
            params['metric'] = 'multi_logloss'

    if not any([p in params for p in ('verbose', 'verbosity')]):
        params['verbosity'] = -1

    best_params, tuning_history = dict(), list()
    optuna_lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=0,
                     best_params=best_params, tuning_history=tuning_history, time_budget=time_budget)

    result_param = copy.deepcopy(base_param)
    result_param.update(best_params)
    return result_param
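A hedged usage sketch for the helper above (it targets the early Optuna API that still accepted `best_params=`/`tuning_history=`; the cv and budget values are illustrative):

import pandas as pd
from sklearn.datasets import load_breast_cancer

raw = load_breast_cancer()
X = pd.DataFrame(raw.data, columns=raw.feature_names).rename(columns=lambda c: c.replace(" ", "_"))
y = pd.Series(raw.target)

tuned = find_best_lgbm_parameter(
    {"objective": "binary", "metric": "binary_logloss"},
    X, y,
    cv=5,             # 5-fold split; only the first fold is used for tuning
    time_budget=600,  # stop tuning after ~10 minutes
)
print(tuned)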
Example #25
# (fragment: the opening of the params dict was cut off in the source)
params = {
  'task' : 'train',
  'boosting_type' : 'gbdt',
  'objective' : 'binary',
  'metric' : 'binary_logloss',
#  'num_class' : 3,
#  'learning_rate' : 0.1,
#  'num_leaves' : 23,
#  'min_data_in_leaf': 1,
#  'num_iteration': 100,
#  'verbose': 1
}

gbm = lgb.train(
  params,
  lgb_train,
#  num_boost_round=50,
  valid_sets=lgb_eval,
#  early_stopping_rounds=10
  )
y_pred = gbm.predict(test_X, num_iteration=gbm.best_iteration)
print("guess:")
for i in y_pred:
  print("\t", i )
#y_pred = np.argmax(y_pred, axis=1)
#for i in y_pred:
#  print("\t", i)

print("score:", roc_auc_score(test_y, y_pred))


main_data = pd.get_dummies(main_data)
Example #26
X_tr, X_val, y_tr, y_val = train_test_split(X_train,
                                            y_train,
                                            test_size=0.2,
                                            shuffle=True,
                                            random_state=20)
train_data = lgb.Dataset(X_tr, label=y_tr['domain2_var1'])
val_data = lgb.Dataset(X_val, label=y_val['domain2_var1'])
params = {
    'objective': 'fair',
    'metric': 'l1',
    'boosting_type': 'gbdt',
    'learning_rate': 0.003,
    'tree_learner': 'feature_parallel',
    'num_threads': 4,
    'seed': 0
}

best_params, tuning_history = dict(), list()

model = lgb.train(params,
                  train_data,
                  num_boost_round=100,
                  early_stopping_rounds=20,
                  valid_sets=[train_data, val_data],
                  verbose_eval=20,
                  learning_rates=lambda it: 0.01 * (0.8**it),
                  best_params=best_params,
                  tuning_history=tuning_history)

print("Best Params", best_params)
Example #27
def run(cfg):
    cwd = Path(hydra.utils.get_original_cwd())

    if cfg.base.optuna:
        import optuna.integration.lightgbm as lgb
    else:
        import lightgbm as lgb

    data = [pd.read_pickle(cwd / f"../features/{f}.pkl") for f in cfg.features]
    data = pd.concat(data, axis=1)

    train = data[data["train"]].drop(columns="train")
    test = data[~data["train"]].drop(columns=["train", "target"])
    target = train["target"]
    train = train.drop(columns="target")

    del data
    gc.collect()
    kfold = KFold(n_splits=cfg.base.n_folds,
                  shuffle=True,
                  random_state=cfg.base.seed)

    pred = np.zeros(test.shape[0])
    score = 0

    experiment_name = f"{'optuna_' if cfg.base.optuna else ''}{rand}"
    print("file:///" + hydra.utils.get_original_cwd() + "mlruns")
    mlflow.set_tracking_uri("file://" + hydra.utils.get_original_cwd() +
                            "/mlruns")

    use_cols = pd.Series(train.columns)
    use_cols.to_csv("features.csv", index=False, header=False)

    mlflow.lightgbm.autolog()
    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train, target)):
        x_train, x_valid = train.loc[train_idx], train.loc[valid_idx]
        y_train, y_valid = target[train_idx], target[valid_idx]

        d_train = lgb.Dataset(x_train, label=y_train)
        d_valid = lgb.Dataset(x_valid, label=y_valid)
        del x_train
        del x_valid
        del y_train
        del y_valid
        gc.collect()
        mlflow.set_experiment(f"fold_{fold + 1}")

        with mlflow.start_run(run_name=f"{experiment_name}"):
            estimator = lgb.train(params=dict(cfg.parameters),
                                  train_set=d_train,
                                  num_boost_round=cfg.base.num_boost_round,
                                  valid_sets=[d_train, d_valid],
                                  verbose_eval=500,
                                  early_stopping_rounds=100)

            y_pred = estimator.predict(test)
            pred += y_pred / cfg.base.n_folds

            print(fold + 1, "done")

            score_ = estimator.best_score["valid_1"][cfg.base.metric]
            score += score_ / cfg.base.n_folds

            save_log({"score": score})
Example #28
# (fragment: the earlier numbered steps that created the Datasets `dtrain`/`dval`
#  and the raw validation arrays `val_x`/`val_y` were cut off in the source)
params = {                           # Specify params that are fixed
           "objective": "binary",
           "metric": "binary_logloss",
           "verbosity": -1,
           "boosting_type": "gbdt",
          }


# 3.1 Note that unlike in sklearn, here there is
#     no instantiation of LightGBM model
#     Start modeling as also tuning hyperparameters

model = lgb.train(
                   params,                     # Just fixed params only
                   dtrain,                     # Dataset
                   valid_sets=[dtrain, dval],  # Evaluate performance on these datasets
                   verbose_eval=100,
                   early_stopping_rounds=100
                  )

### Model is ready

# 4.0 Make prediction
prediction = np.rint(
                     model.predict(
                                    val_x,    # Note that it is not lightgbm dataset
                                    num_iteration = model.best_iteration
                                    )
                    )
# 4.1 Determine accuracy
accuracy = accuracy_score(val_y, prediction)
Example #29
# (fragment: the `if` that opened this tuning branch was cut off in the source;
#  a placeholder condition is reconstructed so the dangling `else:` below parses)
if run_optuna_tuning:  # hypothetical flag; the original condition was lost
    #            'random_state':33,'early_stopping_rounds':100,
    #            'min_data_per_group':5,'boosting_type':'gbdt','num_leaves':151,'max_dept':-1,
    #            'learning_rate':0.002, 'subsample_for_bin':200000,
    #            'min_split_gain':0.0, 'min_child_weight':0.001,
    #            'min_child_samples':20, 'subsample':1.0, 'subsample_freq':0,
    #            'colsample_bytree':.75, 'reg_alpha':1.3, 'reg_lambda':0.1,
    #            'n_jobs':- 1, 'cat_smooth':1.0,
    #            'silent':True, 'importance_type':'split','metric': 'auc'

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_test, label=y_test)

    boost = lgb.train(
        params,
        dtrain,
        valid_sets=[dval],
        verbose_eval=100,
        early_stopping_rounds=100,
    )
    params = boost.params
    params
else:
    params = {
        'random_state': 33,
        'min_data_per_group': 5,
        'boosting_type': 'gbdt',
        'num_leaves': 125,
        'max_dept': -1,
        'max_bin': 63,
        'learning_rate': 0.01,
        'subsample_for_bin': 200000,
        # ... (the remaining fixed params and the closing brace were cut off in the source)
    }
auc_list = []
y_list = []
pred_list = []

for fold_index, (train_index, test_index) in enumerate(skf.split(X, y)):

    lgb_train = lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    lgb_eval = lgb.Dataset(X.iloc[test_index],
                           y.iloc[test_index],
                           reference=lgb_train)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=500,
        early_stopping_rounds=20,
        verbose_eval=50,
    )

    y_list.append(y.iloc[test_index].values.tolist())
    y_pred_test_pro = model.predict(X.iloc[test_index])
    pred_list.append(y_pred_test_pro.tolist())
    y_pred_test = np.rint(y_pred_test_pro)

    table = sklearn.metrics.confusion_matrix(y.iloc[test_index], y_pred_test)
    tn, fp, fn, tp = table[0][0], table[0][1], table[1][0], table[1][1]

    ACC.append((tp + tn) / (tp + fp + fn + tn))
    pre = tp / (tp + fp)
    PRE.append(pre)