Ejemplo n.º 1
0
def test_feature_importance_with_limit(tmpdir, limit):
    """Check that plotting importances with a ``limit`` yields a figure.

    ``tmpdir`` is requested but not used in the body; ``limit`` is supplied
    by the test runner (fixture or parametrization, not visible here) and is
    passed as the fourth positional argument to ``mplt.feature_importance``.
    """
    names = ["a", "b", "c"]
    scores = [1, 2, 3]
    figure = mplt.feature_importance(names, scores, "gain", limit)
    assert_is_figure(figure)
Ejemplo n.º 2
0
def test_feature_importance_with_limit(tmpdir: py.path.local,
                                       limit: int) -> None:
    """Plot three dummy "gain" importances with ``limit`` and expect a figure.

    ``tmpdir`` is accepted but unused in the body. ``limit`` comes from the
    test runner (its source is not visible here) and is forwarded
    positionally to ``mplt.feature_importance``.
    """
    # Positional arguments: feature names, importances, importance type, limit.
    plot_args = (["a", "b", "c"], [1, 2, 3], "gain", limit)
    assert_is_figure(mplt.feature_importance(*plot_args))
Ejemplo n.º 3
0
def test_feature_importance_with_normalize(tmpdir: py.path.local) -> None:
    """Check that plotting with ``normalize=True`` still yields a figure.

    ``tmpdir`` is accepted but unused in the body.
    """
    names, scores = ["a", "b", "c"], [1, 2, 3]
    figure = mplt.feature_importance(names, scores, "gain", normalize=True)
    assert_is_figure(figure)
Ejemplo n.º 4
0
def log_feature_importance(
    features: ArrayLike,
    importances: ArrayLike,
    importance_type: str,
    limit: Optional[int] = None,
    normalize: bool = False,
    path: str = "feature_importance.png",
) -> None:
    """
    Log feature importance as an artifact.

    The figure is built by ``mplt.feature_importance`` and then handed to
    ``log_figure`` together with ``path``.

    Parameters
    ----------
    features : array-like
        Feature names.
    importances : array-like
        Importance of each feature.
    importance_type : str
        Importance type (e.g. "gain").
    limit : int or None, default None
        Forwarded unchanged to ``mplt.feature_importance``. Presumably the
        maximum number of features to plot, with None meaning no limit;
        confirm against that function's documentation.
    normalize : bool, default False
        Forwarded unchanged to ``mplt.feature_importance``. Presumably
        requests normalized importances; confirm against that function's
        documentation.
    path : str, default "feature_importance.png"
        Path in the artifact store.

    Returns
    -------
    None
        None

    Examples
    --------
    >>> with mlflow.start_run() as run:
    ...     features = ['a', 'b', 'c']
    ...     importances = [1, 2, 3]
    ...     mlflow.log_feature_importance(features, importances, 'gain')
    >>> list_artifacts(run.info.run_id)
    ['feature_importance.png']

    """
    # Pass the optional settings by keyword so this call does not depend on
    # their positional order in the plotting function.
    fig = mplt.feature_importance(
        features,
        importances,
        importance_type,
        limit=limit,
        normalize=normalize,
    )
    log_figure(fig, path)
Ejemplo n.º 5
0
def main():
    """Run the forecasting pipeline end to end.

    Steps, in order: read the input tables, encode categorical columns,
    reshape and merge them into one frame, add demand / price / time
    features, build a time-series cross-validation splitter, train one
    LightGBM model per fold via ``train_lgb``, average the fold predictions
    and feature importances, log the parameters to MLflow, plot the feature
    importances and hand the predictions to ``make_submission``.

    Large intermediate frames are deleted and ``gc.collect()`` is called as
    soon as they are no longer needed, so the statement order matters.
    """
    calendar, prices, sales, submission = read_data()
    # NOTE(review): num_items is not used anywhere else in this function.
    num_items = sales.shape[0]
    # Number of submission columns minus one (28 according to the original note).
    pred_days = submission.shape[1] - 1  # 28

    # Encode the categorical columns of calendar, sales and prices, then pipe
    # each result through reduce_mem_usage.
    # NOTE(review): `verbose` is not defined inside this function; it must
    # exist at module level or these calls raise NameError - confirm.
    calendar = encoder_category(
        calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"
                   ]).pipe(reduce_mem_usage, verbose)
    sales = encoder_category(
        sales,
        ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
    ).pipe(reduce_mem_usage, verbose)
    prices = encoder_category(prices,
                              ["item_id", "store_id"]).pipe(reduce_mem_usage, verbose)

    # d_thresh evaluates to 1941 - 730 = 1211.
    data = reshape_sales(sales, submission, pred_days, d_thresh=1941 -
                         int(365 * 2), verbose=False)  # NOTE(review): rationale for this d_thresh is undocumented
    del sales

    # Presumably converts the "d" labels to plain numbers so they can be
    # compared numerically below - verify against extract_num.
    calendar["d"] = extract_num(calendar["d"])
    data = merge_calendar(data, calendar)
    data = merge_prices(data, prices)
    # The source tables are merged into `data`; drop them to free memory.
    del calendar, prices
    gc.collect()
    data = reduce_mem_usage(data)
    data = add_demand_features(data, pred_days).pipe(reduce_mem_usage)
    data = add_price_features(data).pipe(reduce_mem_usage)
    dt_col = "date"
    data = add_time_features(data, dt_col).pipe(reduce_mem_usage)
    data = data.sort_values("date")

    print("start date:", data[dt_col].min())
    print("end date:", data[dt_col].max())
    print("data shape:", data.shape)

    # Stage 2: set up the cross-validation splitter and inspect its folds.
    day_col = "d"
    cv_params = {"n_splits": 3, "train_days": int(
        365 * 1.5), "test_days": pred_days, "day_col": day_col, "pred_days": pred_days}
    cv = CustomTimeSeriesSpliter(**cv_params)
    # Every 1000th row is enough to display / plot the fold boundaries.
    sample = data.iloc[::1000][[day_col, dt_col]].reset_index(drop=True)
    show_cv_days(cv, sample, dt_col, day_col)
    plot_cv_indices(cv, sample, dt_col)
    del sample
    gc.collect()

    # Columns used as model inputs.
    features = [
        "item_id",
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        "event_name_1",
        "event_type_1",
        "event_name_2",
        "event_type_2",
        "snap_CA",
        "snap_TX",
        "snap_WI",
        "sell_price",
        # demand features
        "shift_t28",
        "shift_t29",
        "shift_t30",
        # std
        "rolling_std_t7",
        "rolling_std_t30",
        "rolling_std_t60",
        "rolling_std_t90",
        "rolling_std_t180",
        # mean
        "rolling_mean_t7",
        "rolling_mean_t30",
        "rolling_mean_t60",
        "rolling_mean_t90",
        "rolling_mean_t180",
        # min
        "rolling_min_t7",
        "rolling_min_t30",
        "rolling_min_t60",
        # max
        "rolling_max_t7",
        "rolling_max_t30",
        "rolling_max_t60",
        # others
        "rolling_skew_t30",
        "rolling_kurt_t30",
        # price features
        "price_change_t1",
        "price_change_t365",
        "rolling_price_std_t7",
        "rolling_price_std_t30",
        # time features
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
        "is_weekend",
    ]
    # Rows whose day number is below 1914 are used for training; the
    # remaining rows are the ones to predict.
    is_train = data["d"] < 1914

    # Keep the day column in x_train so the splitter can use it; it is passed
    # to train_lgb via drop_when_train so it can be excluded when fitting.
    x_train = data[is_train][[day_col] + features].reset_index(drop=True)
    y_train = data[is_train]["demand"].reset_index(drop=True)
    x_test = data[~is_train][features].reset_index(drop=True)
    # Keep these two columns to build the submission later.
    id_date = data[~is_train][["id", "date"]].reset_index(drop=True)
    del data
    gc.collect()

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    bst_params = {"boosting_type": "gbdt", "metric": "rmse",
                  "objective": "regression", "n_jobs": -1, "seed": 42, "learning_rate": 0.1,
                  "bagging_fraction": 0.75, "bagging_freq": 10, "colsample_bytree": 0.75}
    fit_params = {"num_boost_round": 100_000,
                  "early_stopping_rounds": 50, "verbose_eval": 100}
    models = train_lgb(bst_params, fit_params, x_train,
                       y_train, cv, drop_when_train=[day_col])
    del x_train, y_train
    gc.collect()

    # Accumulate predictions (one per test row) and importances (one per
    # x_test column) over all returned models, then average them.
    imp_type = "gain"
    importances = np.zeros(x_test.shape[1])
    preds = np.zeros(x_test.shape[0])
    for model in models:
        preds += model.predict(x_test)
        importances += model.feature_importance(imp_type)
    # NOTE(review): the divisor is cv.get_splits(); this assumes it returns
    # the number of folds and that it equals len(models) - confirm.
    preds = preds / cv.get_splits()
    importances = importances / cv.get_splits()

    # Only the parameter dictionaries are logged inside the MLflow run.
    with mlflow.start_run():
        mlflow.log_params_flatten(
            {"bst": bst_params, "fit": fit_params, "cv": cv_params})

    # NOTE(review): the figure is created after the run has been closed and
    # `fig` is never used afterwards; it is only displayed by plt.show().
    features = models[0].feature_name()
    fig = mplt.feature_importance(features, importances, imp_type, limit=30)
    plt.show()

    make_submission(id_date.assign(demand=preds), submission, pred_days)
Ejemplo n.º 6
0
# Turn the accumulated predictions and importances into averages by dividing
# by cv.get_n_splits().
# NOTE(review): preds, importances and cv are defined in cells that are not
# visible here; this assumes both arrays hold sums over that many models.
preds = preds / cv.get_n_splits()
importances = importances / cv.get_n_splits()

#%%
from mlflow_extend import mlflow, plotting as mplt

# Log the booster, fit and cross-validation parameter dictionaries in one
# MLflow run.
with mlflow.start_run():
    mlflow.log_params_flatten({
        "bst": bst_params,
        "fit": fit_params,
        "cv": cv_params
    })

# Plot the averaged importances using the feature names of the first model.
# The figure is created after the run has closed and its return value is
# discarded, so it is not logged to MLflow here.
features = models[0].feature_name()
_ = mplt.feature_importance(features, importances, imp_type, limit=30)

#%%


def make_submission(test, submission):
    preds = test[["id", "date", "demand"]]
    preds = preds.pivot(index="id", columns="date",
                        values="demand").reset_index()
    preds.columns = ["id"] + ["F" + str(d + 1) for d in range(DAYS_PRED)]

    evals = submission[submission["id"].str.endswith("evaluation")]
    vals = submission[["id"]].merge(preds, how="inner", on="id")
    final = pd.concat([vals, evals])

    assert final.drop("id", axis=1).isnull().sum().sum() == 0