import time

import catboost
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import ensemble, metrics

# conf, processing, LOGGER, K_FOLDS, CLF_PARAMS, DROP and ITERATIONS are
# module-level names defined elsewhere in this project, as are the stacking
# helpers load_oof(), load_sub() and add_stacking_feat().


def stack_catboost():
    """Стекинг catboost."""
    x_train = add_stacking_feat(load_oof())
    _, y_train = processing.train_set()
    x_test = add_stacking_feat(load_sub())
    x_test.columns = x_train.columns

    x_train.drop(DROP, axis=1, inplace=True)
    x_test.drop(DROP, axis=1, inplace=True)

    pool_test = catboost.Pool(data=x_test,
                              label=None,
                              cat_features=None,
                              weight=None)
    y_oof = pd.Series(0.0, index=x_train.index, name="oof_y")
    y_pred = pd.Series(0.0, index=x_test.index, name="time_to_failure")
    trees = []
    scores = []
    feat_importance = 0

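    # Out-of-fold scheme: each fold's model predicts its held-out rows once;
    # test-set predictions are averaged over the folds.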
    for index_train, index_valid in K_FOLDS.split(x_train):
        pool_train = catboost.Pool(data=x_train.iloc[index_train],
                                   label=y_train.iloc[index_train],
                                   cat_features=None,
                                   weight=None)
        pool_valid = catboost.Pool(data=x_train.iloc[index_valid],
                                   label=y_train.iloc[index_valid],
                                   cat_features=None,
                                   weight=None)
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation_0']['MAE'])
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        y_pred += clf.predict(pool_test) / K_FOLDS.get_n_splits()
        feat_importance += pd.DataFrame(
            clf.get_feature_importance(prettified=True),
            columns=["name", "value"],
        ).set_index("name") / K_FOLDS.get_n_splits()

    LOGGER.info(f"Tree counts: {sorted(trees)}")
    LOGGER.info(
        f"Mean tree count: {np.mean(trees):.0f} +/- {np.std(trees):.0f}"
    )
    LOGGER.info("Cross-validation MAE: " + str(np.round(sorted(scores), 5)))
    LOGGER.info(
        f"Mean MAE: {np.mean(scores):0.3f} +/- {np.std(scores):0.3f}")

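    # File stamp: timestamp, mean CV MAE, and the mean plus two standard
    # errors of the mean; the stamp names the output files.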
    stamp = (
        f"{time.strftime('%Y-%m-%d_%H-%M')}_"
        f"{np.mean(scores):0.3f}_"
        f"{np.mean(scores) + np.std(scores) * 2 / len(scores) ** 0.5:0.3f}_stk"
    )
    y_oof.to_csv(conf.DATA_PROCESSED + f"oof_{stamp}.csv", header=True)
    y_pred.to_csv(conf.DATA_PROCESSED + f"sub_{stamp}.csv", header=True)
    print(feat_importance.sort_values("value", ascending=False))
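

# Hypothetical sketch (not part of the original module): stack_catboost()
# relies on load_oof() and load_sub(), defined elsewhere. Assuming they gather
# the oof_*/sub_* CSVs written by the base models below, their contract might
# look like this:
#
#     import glob
#
#     def load_oof():
#         """Concatenate saved out-of-fold predictions column-wise."""
#         files = sorted(glob.glob(conf.DATA_PROCESSED + "oof_*.csv"))
#         return pd.concat([pd.read_csv(f, index_col=0) for f in files],
#                          axis=1)
#
# add_stacking_feat() then appends these predictions to the feature matrix.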


def train_light_gbm():
    """Train LightGBM RF."""
    x_train, y_train = processing.train_set()
    x_test = processing.test_set()

    x_train.drop(DROP, axis=1, inplace=True)
    x_test.drop(DROP, axis=1, inplace=True)

    y_oof = pd.Series(0.0, index=x_train.index, name="oof_lgbm")
    y_pred = pd.Series(0.0, index=x_test.index, name="time_to_failure")
    trees = []
    scores = []
    feat_importance = 0

    for index_train, index_valid in K_FOLDS.split(x_train):
        pool_train = lgb.Dataset(
            x_train.iloc[index_train],
            label=y_train.iloc[index_train],
        )
        pool_valid = lgb.Dataset(
            x_train.iloc[index_valid],
            label=y_train.iloc[index_valid],
        )
        clf = lgb.train(
            CLF_PARAMS,
            pool_train,
            ITERATIONS,
            valid_sets=[pool_train, pool_valid],
            verbose_eval=ITERATIONS // 100,
            early_stopping_rounds=ITERATIONS // 10
        )

        trees.append(clf.best_iteration)
        scores.append(clf.best_score["valid_1"]["l1"])

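        # Predict the held-out fold at the early-stopped best_iteration;
        # test-set predictions are averaged over the folds.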
        y_oof.iloc[index_valid] = clf.predict(
            x_train.iloc[index_valid], num_iteration=clf.best_iteration)
        y_pred += clf.predict(
            x_test, num_iteration=clf.best_iteration) / K_FOLDS.get_n_splits()

        feat_importance += clf.feature_importance("gain") / K_FOLDS.get_n_splits()
        print("\n")

    LOGGER.info(f"Tree counts: {sorted(trees)}")
    LOGGER.info(f"Mean tree count: {np.mean(trees):.0f} +/- {np.std(trees):.0f}")
    LOGGER.info("Cross-validation MAE: " + str(np.round(sorted(scores), 5)))
    LOGGER.info(f"Mean MAE: {np.mean(scores):0.3f} +/- {np.std(scores):0.3f}")

    stamp = (
        f"{time.strftime('%Y-%m-%d_%H-%M')}_"
        f"{np.mean(scores):0.3f}_"
        f"{np.mean(scores) + np.std(scores) * 2 / len(scores) ** 0.5:0.3f}_lgbm")
    y_oof.to_csv(conf.DATA_PROCESSED + f"oof_{stamp}.csv", header=True)
    y_pred.to_csv(conf.DATA_PROCESSED + f"sub_{stamp}.csv", header=True)
    print(
        pd.DataFrame(feat_importance, index=x_train.columns,
                     columns=["value"]).sort_values("value", ascending=False))


def train_ext():
    """Train ExtraTrees."""
    x_train, y_train = processing.train_set()
    x_test = processing.test_set()

    x_train.drop(DROP, axis=1, inplace=True)
    x_test.drop(DROP, axis=1, inplace=True)

    y_oof = pd.Series(0.0, index=x_train.index, name="oof_ext")
    y_pred = pd.Series(0.0, index=x_test.index, name="time_to_failure")
    feat_importance = 0
    scores = []

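    # ExtraTrees supports no early stopping, so each fold's model is fit once
    # and scored on its held-out fold with plain MAE.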
    for index_train, index_valid in K_FOLDS.split(x_train):

        clf = ensemble.ExtraTreesRegressor(**CLF_PARAMS)
        clf.fit(x_train.iloc[index_train], y_train.iloc[index_train])

        y_oof.iloc[index_valid] = clf.predict(x_train.iloc[index_valid])

        scores.append(
            metrics.mean_absolute_error(y_train.iloc[index_valid],
                                        y_oof.iloc[index_valid]))

        y_pred += clf.predict(x_test) / K_FOLDS.get_n_splits()

        feat_importance += clf.feature_importances_ / K_FOLDS.get_n_splits()
        print("\n")

    LOGGER.info("Cross-validation MAE: " + str(np.round(sorted(scores), 5)))
    LOGGER.info(
        f"Mean MAE: {np.mean(scores):0.3f} +/- {np.std(scores):0.3f}")

    stamp = (
        f"{time.strftime('%Y-%m-%d_%H-%M')}_"
        f"{np.mean(scores):0.3f}_"
        f"{np.mean(scores) + np.std(scores) * 2 / len(scores) ** 0.5:0.3f}_ext"
    )
    y_oof.to_csv(conf.DATA_PROCESSED + f"oof_{stamp}.csv", header=True)
    y_pred.to_csv(conf.DATA_PROCESSED + f"sub_{stamp}.csv", header=True)
    print(
        pd.DataFrame(feat_importance, index=x_train.columns,
                     columns=["value"]).sort_values("value", ascending=False))
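

# Assumed usage sketch (not from the original source). CLF_PARAMS and DROP
# are shared module globals, so they must be configured for the model being
# trained, and the base models must run before the stacker so that their
# oof_*/sub_* files exist for load_oof()/load_sub().
if __name__ == "__main__":
    train_light_gbm()  # or train_ext() / stack_catboost(), one per configured run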