Example #1
def tune_params(
    base_param: Dict,
    X: pd.DataFrame,
    y: pd.Series,
    cv: BaseCrossValidator,
    time_budget: Optional[int] = None,
) -> Dict:
    train_index, test_index = next(cv.split(X, y))

    dtrain = lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = lgb.Dataset(X.iloc[test_index], y.iloc[test_index])

    params = copy.deepcopy(base_param)
    if "early_stopping_rounds" not in params:
        params["early_stopping_rounds"] = 100

    best_params, tuning_history = dict(), list()
    lightgbm_tuner.train(
        params,
        dtrain,
        valid_sets=[dvalid],
        verbose_eval=0,
        best_params=best_params,
        tuning_history=tuning_history,
        time_budget=time_budget,
    )

    result_param = copy.deepcopy(base_param)
    result_param.update(best_params)
    return result_param
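The excerpts on this page omit their imports. Below is a minimal, hedged sketch of how the function above could be wired up; the lightgbm_tuner import path corresponds to older Optuna releases (where the stepwise tuner still accepted best_params/tuning_history), and the dummy data and TimeSeriesSplit splitter are illustrative assumptions, not part of the original example.

# Usage sketch (assumed imports and dummy data; illustrative only)
import copy
from typing import Dict, Optional

import numpy as np
import pandas as pd
import lightgbm as lgb
from optuna.integration import lightgbm_tuner  # stepwise tuner module in older Optuna releases
from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit

# Dummy regression data purely for illustration.
X = pd.DataFrame(np.random.rand(500, 10), columns=[f"f{i}" for i in range(10)])
y = pd.Series(np.random.rand(500))

base_param = {"objective": "regression", "metric": "rmse", "verbosity": -1}
cv = TimeSeriesSplit(n_splits=5)

tuned = tune_params(base_param, X, y, cv, time_budget=600)
print(tuned)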
Example #2
    def run(self):
        feature = (self.load("feature").query(
            f"store_id == '{self.store_id}'").reset_index(level=1))

        train = feature.query("date < '2016-03-28'").drop(
            columns=["store_id", "date"])
        valid = feature.query("'2016-03-28' <= date < '2016-04-25'").drop(
            columns=["store_id", "date"])
        test = feature.query("'2016-04-25' <= date").copy()

        dataset_train = lgb.Dataset(train.drop(columns="demand"),
                                    train["demand"])
        dataset_valid = lgb.Dataset(valid.drop(columns="demand"),
                                    valid["demand"])

        params = {
            "objective": "regression",
            "seed": 110,
            "learning_rate": 0.01,
            "boosting_type": "gbdt",
            "metric": "rmse",
            "lambda_l1": 0.0,
            "lambda_l2": 0.0,
            "num_leaves": 131,
            "feature_fraction": 0.41600000000000004,
            "bagging_fraction": 1.0,
            "bagging_freq": 0,
            "min_data_in_leaf": 20,
            "min_child_samples": 25,
        }
        print("hoge", self.time_budget)
        if not self.time_budget:
            print("tuningしないよ")
            model = lgb.train(
                params,
                dataset_train,
                num_boost_round=100000,
                valid_sets=[dataset_train, dataset_valid],
                early_stopping_rounds=200,
                verbose_eval=100,
            )
        else:
            print("tuningするよ")
            model = lightgbm_tuner.train(
                params,
                dataset_train,
                num_boost_round=100000,
                valid_sets=[dataset_train, dataset_valid],
                early_stopping_rounds=200,
                verbose_eval=-1,
                time_budget=self.time_budget,
            )

        predict = model.predict(
            test.drop(columns=["date", "store_id", "demand"]))
        test["demand"] = predict
        test = test.reset_index().set_index(["id", "date"])[["demand"]]

        result = ModelResult(model, params, test)
        self.dump(result)
Example #3
def get_best_params(train_x: t.Any, train_y: t.Any, num_class: int) -> t.Any:
    tr_x, val_x, tr_y, val_y = train_test_split(train_x,
                                                train_y,
                                                test_size=0.2,
                                                random_state=1)
    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(val_x, val_y)
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "num_class": num_class,
    }
    best_params = {}
    tuning_history = []
    gbm = lightgbm_tuner.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=10000,
        early_stopping_rounds=20,
        verbose_eval=10,
        best_params=best_params,
        tuning_history=tuning_history,
    )
    return best_params
Example #4
def get_tuned_model(train_x, train_y, valid_x, valid_y, num_class) -> t.Any:
    # Training dataset
    train_set = lgb.Dataset(train_x, train_y)
    # Validation dataset
    valid_set = lgb.Dataset(valid_x, valid_y)
    # Holds the tuned hyperparameters
    best_params = {}
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt', 
        'num_class': num_class,
        'num_threads': 2
    }
    tuning_history = []
    gbm = lightgbm_tuner.train(
        params,
        train_set,
        valid_sets=[train_set, valid_set],
        num_boost_round=10000,
        early_stopping_rounds=20,
        verbose_eval=10,
        best_params=best_params,
        tuning_history=tuning_history
    )
    joblib.dump(gbm, f"{DATA_DIR}/lgb_model.pkl")
    importance = pd.DataFrame(
        gbm.feature_importance(),
        index=train_x.columns,
        columns=['importance'],
    ).sort_values('importance', ascending=False)
    print(importance)
    return gbm
Example #5
def train_lgb(bst_params, fit_params, X, y, cv, drop_when_train=None):
    models = []

    if drop_when_train is None:
        drop_when_train = []

    for idx_fold, (idx_trn, idx_val) in enumerate(cv.split(X, y)):
        print(f"\n---------- Fold: ({idx_fold + 1} / {cv.get_n_splits()}) ----------\n")

        X_trn, X_val = X.iloc[idx_trn], X.iloc[idx_val]
        y_trn, y_val = y.iloc[idx_trn], y.iloc[idx_val]
        train_set = lgb.Dataset(X_trn.drop(drop_when_train, axis=1), label=y_trn)
        val_set = lgb.Dataset(X_val.drop(drop_when_train, axis=1), label=y_val)
        
        best_params, tuning_history = dict(), list()
        print("start")
        model = lightgbm_tuner.train(
            bst_params,
            train_set,
            valid_sets=[train_set, val_set],
            valid_names=["train", "valid"],
            **fit_params,
            #fobj = custom_asymmetric_train, 
            #feval = custom_asymmetric_valid,
            best_params=best_params,
            tuning_history=tuning_history
        )
        models.append(model)
        print(best_params)
        print(tuning_history)

        del idx_trn, idx_val, X_trn, X_val, y_trn, y_val
        gc.collect()

    return models
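A hedged sketch of how train_lgb above might be invoked; the parameter dictionaries, dummy data, and KFold splitter are assumptions added for illustration, since the original excerpt does not show them.

# Hypothetical invocation of train_lgb (assumed imports and dummy data)
import gc  # the excerpt above calls gc.collect()
import numpy as np
import pandas as pd
import lightgbm as lgb
from optuna.integration import lightgbm_tuner
from sklearn.model_selection import KFold

# Dummy data purely for illustration.
X = pd.DataFrame(np.random.rand(300, 6), columns=[f"f{i}" for i in range(6)])
y = pd.Series(np.random.rand(300))

bst_params = {"objective": "regression", "metric": "rmse", "verbosity": -1}
fit_params = {
    "num_boost_round": 1000,
    "early_stopping_rounds": 50,
    "verbose_eval": 100,
}

models = train_lgb(bst_params, fit_params, X, y, cv=KFold(n_splits=3), drop_when_train=None)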
Example #6
def get_best_params(train_x, train_y):
    tr_x, val_x, tr_y, val_y = train_test_split(train_x,
                                                train_y,
                                                test_size=0.2,
                                                random_state=1)
    tr_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y, reference=tr_set)
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
    }
    best_params = {}
    tuning_history = []
    gbm = lightgbm_tuner.train(
        params,
        tr_set,
        valid_sets=val_set,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=10,
        best_params=best_params,
        tuning_history=tuning_history,
    )
    return best_params
Example #7
    def fit(self,
            tr_x,
            tr_y,
            va_x=None,
            va_y=None,
            cat_features=None,
            feval=None):

        validation = va_x is not None
        lgb_train = lgb.Dataset(tr_x, tr_y, categorical_feature=cat_features)
        if validation:
            lgb_eval = lgb.Dataset(va_x,
                                   va_y,
                                   reference=lgb_train,
                                   categorical_feature=cat_features)

        logger = logging.getLogger('main')

        best_params, tuning_history = dict(), list()
        if validation:
            self.model = lgb_tuner.train(self.params,
                                         lgb_train,
                                         num_boost_round=10000,
                                         valid_sets=[lgb_eval],
                                         verbose_eval=0,
                                         early_stopping_rounds=200,
                                         feval=feval,
                                         best_params=best_params,
                                         tuning_history=tuning_history)
        else:
            self.model = lgb_tuner.train(self.params,
                                         lgb_train,
                                         num_boost_round=10000,
                                         best_params=best_params,
                                         tuning_history=tuning_history)

        logger.debug('Best Params: %s', best_params)
        logger.debug('Tuning history: %s', tuning_history)
Example #8
def main():

    # load dataset
    boston = datasets.load_boston()
    X, y = boston.data, boston.target

    # split train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test)

    # parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse'  # RMSE (root mean squared error)
    }

    # stepwise tuning with Optuna's LightGBM Tuner

    best_params = {}
    tuning_history = []

    # training; train_set holds the explanatory variables
    model = lightgbm_tuner.train(params=params,
                                 train_set=lgb_train,
                                 valid_sets=(lgb_train, lgb_eval),
                                 early_stopping_rounds=100,
                                 num_boost_round=10000,
                                 verbose_eval=50,
                                 best_params=best_params,
                                 tuning_history=tuning_history)

    # prediction
    y_pred = model.predict(data=X_test, num_iteration=model.best_iteration)

    # calculate RMSE
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(rmse)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    lgb.plot_importance(model, height=0.5, ax=ax, figsize=(8, 10))
    plt.show()

    print(best_params)
    print(tuning_history)
Example #9
    def train(self):

        lgb_train = lgb.Dataset(self.X_train, self.y_train)
        lgb_eval = lgb.Dataset(self.X_test, self.y_test)

        params = {
            'objective': 'multiclass',
            'num_class': self.ans_len,
            'metric': 'multi_error'
        }

        best_params = {}
        tuning_history = []

        model = lightgbm_tuner.train(
            params,
            lgb_train,
            valid_sets=lgb_eval,
            num_boost_round=self.num_boost_round,
            early_stopping_rounds=self.early_stopping_round,
            verbose_eval=False,
            best_params=best_params,
            tuning_history=tuning_history,
        )

        self.y_pred_prob = model.predict(self.X_test,
                                         num_iteration=model.best_iteration)
        self.y_pred = np.argmax(self.y_pred_prob, axis=1)

        df_pred = pd.DataFrame({
            'target': self.y_test[0],
            'target_pred': self.y_pred
        })
        print(df_pred)

        # df_pred_prob = pd.DataFrame({'y':self.y_test[0],
        #                              'target0_prob':self.y_pred_prob[:,0], 'target1_prob':self.y_pred_prob[:,1], 'target2_prob':self.y_pred_prob[:,2],
        #                              'target3_prob':self.y_pred_prob[:,3], 'target4_prob':self.y_pred_prob[:,4], 'target5_prob':self.y_pred_prob[:,5],
        #                              'target6_prob':self.y_pred_prob[:,6], 'target7_prob':self.y_pred_prob[:,7], 'target8_prob':self.y_pred_prob[:,8],
        #                              'target9_prob':self.y_pred_prob[:,9], 'target10_prob':self.y_pred_prob[:,10], 'target11_prob':self.y_pred_prob[:,11],
        #                              'target12_prob':self.y_pred_prob[:,12]})

        acc = accuracy_score(self.y_test, self.y_pred)
        print('Acc :', acc)
Example #10
def get_best_params(train_x: t.Any, train_y: t.Any, valid_x: t.Any,
                    valid_y: t.Any) -> t.Any:
    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
    }
    best_params = {}
    tuning_history = []
    gbm = lightgbm_tuner.train(params,
                               lgb_train,
                               valid_sets=lgb_eval,
                               num_boost_round=10000,
                               early_stopping_rounds=20,
                               verbose_eval=10,
                               best_params=best_params,
                               tuning_history=tuning_history)
    return best_params
Example #11
def tune_lgb(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], seed: int,
):
    """lightgbmを用いてfoldごとのモデルを返す。

    Args:
        X (Union[pd.DataFrame, np.array]): 特徴行列
        y (Union[pd.Series, np.array]): 目的変数

    Returns:
        List[lgb.Booster]: Boosterのリスト
    """
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    best_params, tuning_history = dict(), list()

    train_x, val_x, train_y, val_y = train_test_split(
        X, y, test_size=0.25, random_state=seed)
    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(val_x, label=val_y)

    model = lightgbm_tuner.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        best_params=best_params,
        tuning_history=tuning_history,
        num_boost_round=1000000,
        verbose_eval=100,
        early_stopping_rounds=100,
    )

    prediction = model.predict(val_x, num_iteration=model.best_iteration)
    auc = roc_auc_score(val_y, prediction)
    print("AUC: ", auc)
    return {"best_params": best_params, "history": tuning_history}
Example #12
def get_best_params(train_x: t.Any, train_y: t.Any) -> t.Any:
    tr_x, val_x, tr_y, val_y = train_test_split(train_x,
                                                train_y,
                                                test_size=0.2,
                                                random_state=1)
    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(val_x, val_y)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
    }
    best_params = {}
    tuning_history = []
    gbm = lightgbm_tuner.train(params,
                               lgb_train,
                               valid_sets=lgb_eval,
                               num_boost_round=1000,
                               early_stopping_rounds=20,
                               verbose_eval=50,
                               best_params=best_params,
                               tuning_history=tuning_history)
    return best_params
Example #13
    def run(self):
        tuning = False
        train = self.load_data_frame("train")
        valid = self.load_data_frame("valid")
        test = self.load_data_frame("test")

        # map from the day number within the test period to d
        shift_day_map = {
            i + 1: d
            for (i, d) in enumerate(test.reset_index()["d"].unique())
        }
        # map from each d in the test set to its day of week
        dow_map = test.drop_duplicates("d").set_index(
            "d")["dayofweek"].to_dict()

        d = shift_day_map[self.target_day]
        dow = dow_map[d]
        valid = valid.query(f"dayofweek == {dow}")
        test = test.query(f"d == {d}")

        common_columns = [
            "sell_price",
            "lag_t28",
            "lag_t29",
            "lag_t30",
            "rolling_mean_t7",
            "rolling_mean_t30",
            "rolling_std_t30",
            "rolling_skew_t30",
            "rolling_kurt_t30",
            "rolling_mean_t60",
            "rolling_mean_t90",
            "rolling_std_t90",
            "rolling_mean_t180",
            "rolling_std_t180",
            "price_change_t1",
            "price_change_t365",
            "rolling_price_std_t7",
            "rolling_price_std_t30",
            "snap_CA",
            "snap_TX",
            "snap_WI",
            "wm_yr_wk",
            "dayofweek",
            # "ratio_by_store_t7",
            # "ratio_by_store_t30",
            # "ratio_by_item_t7",
            # "ratio_by_item_t30",
            "is_weekend",
            "is_US_holiday",
            "before_day_off",
            "after_day_off",
            # "lag_sales_mul_lag_price_dev_price",
            # "item_predict",
        ]

        id_columns = [
            "item_id",
            "dept_id",
            "cat_id",
            "store_id",
            "state_id",
            "event_name_1",
            "event_type_1",
            "event_name_2",
            "event_type_2",
        ]

        if not tuning:
            try:
                params: dict = json.load(
                    open(
                        f"./model_params/params_day_{str(self.target_day).zfill(2)}.json"
                    ))
            except FileNotFoundError as e:
                print("not found tuned parameter!", e)
                params = {
                    "boosting_type": "gbdt",
                    "metric": "rmse",
                    "objective": "poisson",
                    "n_jobs": -1,
                    "seed": 110,
                    "learning_rate": 0.05,
                    "bagging_fraction": 0.75,
                    "bagging_freq": 10,
                    "colsample_bytree": 0.75,
                }
        else:
            params = {
                "boosting_type": "gbdt",
                "metric": "rmse",
                "objective": "poisson",
                "n_jobs": -1,
                "seed": 110,
                "learning_rate": 0.05,
            }

        if self.target_day < TEST_DAYS:  # day 28 has no shift_columns
            feature_columns = train.columns
            shift_column_names = [
                f"shift_{day}" for day in range(self.target_day, TEST_DAYS)
            ]
            # using all of them would be too many, so keep the most recent one plus the weekly lags
            shift_column_names = list(
                set(shift_column_names)
                & set([shift_column_names[0]] +
                      ["shift_7", "shift_14", "shift_21"]))

            shift_columns = []
            for col in shift_column_names:
                shift_columns += feature_columns[
                    feature_columns.str.startswith(col)].tolist()
            use_columns = common_columns + id_columns + shift_columns
        elif self.target_day == TEST_DAYS:  # if it is day 28
            use_columns = common_columns + id_columns
        else:
            raise ValueError("something wrong when make use_columns")

        dataset_train = lgb.Dataset(train[use_columns], train[TARGET])
        dataset_valid = lgb.Dataset(valid[use_columns], valid[TARGET])

        if tuning:
            model = lightgbm_tuner.train(
                params,
                dataset_train,
                num_boost_round=1000,
                valid_sets=[dataset_train, dataset_valid],
                early_stopping_rounds=200,
                verbose_eval=100,
                time_budget=3600,
            )
            json.dump(
                model.params,
                open(
                    f"./model_params/params_day_{str(self.target_day).zfill(2)}.json",
                    "w",
                ),
                indent=4,
            )
        else:
            model = lgb.train(
                params,
                dataset_train,
                num_boost_round=1000,
                valid_sets=[dataset_train, dataset_valid],
                early_stopping_rounds=200,
                verbose_eval=100,
            )

        valid["pred"] = model.predict(valid[use_columns])
        test[TARGET] = model.predict(test[use_columns])

        importance = pd.DataFrame({
            "feature_name": model.feature_name(),
            "importance": model.feature_importance("gain"),
        })
        result = Result(valid[["d", TARGET, "pred"]], test[["d", TARGET]],
                        importance)

        self.dump(result)
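All of the snippets above pass the best_params and tuning_history keyword arguments, which only older Optuna releases accept. In newer releases the tuned values are read from the returned booster instead (as example #13 does via model.params), or the LightGBMTuner class is used directly. A rough sketch under that assumption; params, dataset_train and dataset_valid stand for the same objects built in the examples above.

# Sketch for newer Optuna releases (assumes optuna.integration.lightgbm.LightGBMTuner is available)
import optuna.integration.lightgbm as optuna_lgb

tuner = optuna_lgb.LightGBMTuner(
    params,                        # fixed parameters, as in the examples above
    dataset_train,
    valid_sets=[dataset_valid],
    num_boost_round=1000,
    time_budget=3600,              # optional wall-clock budget in seconds
)
tuner.run()
print(tuner.best_params)           # stepwise-tuned hyperparameters
model = tuner.get_best_booster()   # booster from the best trial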