def test_feature_importance_with_limit(tmpdir, limit):
    """Plot three dummy features with the given ``limit`` and check the result.

    The return value of ``mplt.feature_importance`` must pass
    ``assert_is_figure``.
    """
    names = ["a", "b", "c"]
    scores = [1, 2, 3]
    figure = mplt.feature_importance(names, scores, "gain", limit)
    assert_is_figure(figure)
def test_feature_importance_with_limit(tmpdir: py.path.local, limit: int) -> None:
    """Plot three dummy features with the given ``limit`` and check the result.

    The return value of ``mplt.feature_importance`` must pass
    ``assert_is_figure``.
    """
    names = ["a", "b", "c"]
    scores = [1, 2, 3]
    figure = mplt.feature_importance(names, scores, "gain", limit)
    assert_is_figure(figure)
def test_feature_importance_with_normalize(tmpdir: py.path.local) -> None:
    """Plot three dummy features with ``normalize=True`` and check the result.

    The return value of ``mplt.feature_importance`` must pass
    ``assert_is_figure``.
    """
    names = ["a", "b", "c"]
    scores = [1, 2, 3]
    figure = mplt.feature_importance(names, scores, "gain", normalize=True)
    assert_is_figure(figure)
def log_feature_importance(
    features: ArrayLike,
    importances: ArrayLike,
    importance_type: str,
    limit: Optional[int] = None,
    normalize: bool = False,
    path: str = "feature_importance.png",
) -> None:
    """
    Log feature importance as an artifact.

    The plot is built with ``mplt.feature_importance`` and then handed to
    ``log_figure`` together with ``path``.

    Parameters
    ----------
    features : array-like
        Feature names.
    importances : array-like
        Importance of each feature.
    importance_type : str
        Importance type (e.g. "gain").
    limit : int, default None
        Forwarded unchanged to the plotting function's ``limit`` argument
        (presumably the maximum number of features to show -- confirm
        against the plotting function).
    normalize : bool, default False
        Forwarded unchanged to the plotting function's ``normalize``
        argument.
    path : str, default "feature_importance.png"
        Path in the artifact store.

    Returns
    -------
    None
        None

    Examples
    --------
    >>> with mlflow.start_run() as run:
    ...     features = ['a', 'b', 'c']
    ...     importances = [1, 2, 3]
    ...     mlflow.log_feature_importance(features, importances, 'gain')

    >>> list_artifacts(run.info.run_id)
    ['feature_importance.png']

    """
    # Pass the optional settings by keyword so they cannot be silently
    # swapped if the plotting function's positional order ever changes.
    fig = mplt.feature_importance(
        features,
        importances,
        importance_type,
        limit=limit,
        normalize=normalize,
    )
    log_figure(fig, path)
def main():
    """
    Run the whole pipeline: load, build features, train, predict, submit.

    Steps, in order:

    1. Read the four input tables and encode their categorical columns.
    2. Reshape the sales table, merge calendar and prices into it and add
       demand, price and time features.
    3. Build the time-series cross-validation splitter and visualise it on
       a thinned-out sample of the data.
    4. Split rows into train (``d < 1914``) and test, train one LightGBM
       model per CV split via ``train_lgb``.
    5. Average the models' test predictions and feature importances, log
       the parameters to MLflow, plot the importances and write the
       submission.

    Large intermediate frames are deleted and ``gc.collect()`` is called as
    soon as they are no longer needed to keep peak memory down.
    """
    calendar, prices, sales, submission = read_data()

    pred_days = submission.shape[1] - 1  # 28

    # Encode the categorical columns of each table into numeric values.
    calendar = encoder_category(
        calendar,
        ["event_name_1", "event_type_1", "event_name_2", "event_type_2"],
    ).pipe(reduce_mem_usage, verbose)
    sales = encoder_category(
        sales,
        ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
    ).pipe(reduce_mem_usage, verbose)
    prices = encoder_category(prices, ["item_id", "store_id"]).pipe(
        reduce_mem_usage, verbose
    )

    # TODO(review): the original author questioned this d_thresh value
    # (1941 minus two years of days); the rationale is not documented here.
    data = reshape_sales(
        sales,
        submission,
        pred_days,
        d_thresh=1941 - int(365 * 2),
        verbose=False,
    )
    del sales

    calendar["d"] = extract_num(calendar["d"])
    data = merge_calendar(data, calendar)
    data = merge_prices(data, prices)
    del calendar, prices
    gc.collect()

    data = reduce_mem_usage(data)
    data = add_demand_features(data, pred_days).pipe(reduce_mem_usage)
    data = add_price_features(data).pipe(reduce_mem_usage)
    dt_col = "date"
    data = add_time_features(data, dt_col).pipe(reduce_mem_usage)
    data = data.sort_values(dt_col)

    print("start date:", data[dt_col].min())
    print("end date:", data[dt_col].max())
    print("data shape:", data.shape)

    # stage 2
    day_col = "d"
    cv_params = {
        "n_splits": 3,
        "train_days": int(365 * 1.5),
        "test_days": pred_days,
        "day_col": day_col,
        "pred_days": pred_days,
    }
    cv = CustomTimeSeriesSpliter(**cv_params)

    # Every 1000th row is enough to inspect and plot the CV windows.
    sample = data.iloc[::1000][[day_col, dt_col]].reset_index(drop=True)
    show_cv_days(cv, sample, dt_col, day_col)
    plot_cv_indices(cv, sample, dt_col)
    del sample
    gc.collect()

    features = [
        "item_id",
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        "event_name_1",
        "event_type_1",
        "event_name_2",
        "event_type_2",
        "snap_CA",
        "snap_TX",
        "snap_WI",
        "sell_price",
        # demand features
        "shift_t28",
        "shift_t29",
        "shift_t30",
        # std
        "rolling_std_t7",
        "rolling_std_t30",
        "rolling_std_t60",
        "rolling_std_t90",
        "rolling_std_t180",
        # mean
        "rolling_mean_t7",
        "rolling_mean_t30",
        "rolling_mean_t60",
        "rolling_mean_t90",
        "rolling_mean_t180",
        # min
        "rolling_min_t7",
        "rolling_min_t30",
        "rolling_min_t60",
        # max
        "rolling_max_t7",
        "rolling_max_t30",
        "rolling_max_t60",
        # others
        "rolling_skew_t30",
        "rolling_kurt_t30",
        # price features
        "price_change_t1",
        "price_change_t365",
        "rolling_price_std_t7",
        "rolling_price_std_t30",
        # time features
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
        "is_weekend",
    ]

    # Rows with d < 1914 are used for training; the rest are predicted.
    is_train = data[day_col] < 1914

    # Attach "d" to X_train for cross validation.
    x_train = data[is_train][[day_col] + features].reset_index(drop=True)
    y_train = data[is_train]["demand"].reset_index(drop=True)
    x_test = data[~is_train][features].reset_index(drop=True)

    # keep these two columns to use later.
    id_date = data[~is_train][["id", "date"]].reset_index(drop=True)

    del data
    gc.collect()

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)

    bst_params = {
        "boosting_type": "gbdt",
        "metric": "rmse",
        "objective": "regression",
        "n_jobs": -1,
        "seed": 42,
        "learning_rate": 0.1,
        "bagging_fraction": 0.75,
        "bagging_freq": 10,
        "colsample_bytree": 0.75,
    }
    fit_params = {
        "num_boost_round": 100_000,
        "early_stopping_rounds": 50,
        "verbose_eval": 100,
    }

    # The day column is only needed by the splitter, not by the models.
    models = train_lgb(
        bst_params, fit_params, x_train, y_train, cv, drop_when_train=[day_col]
    )
    del x_train, y_train
    gc.collect()

    imp_type = "gain"
    importances = np.zeros(x_test.shape[1])
    preds = np.zeros(x_test.shape[0])
    for model in models:
        preds += model.predict(x_test)
        importances += model.feature_importance(imp_type)

    # Divide by the number of models that were actually summed above, so the
    # result is a true mean regardless of how the splitter reports its
    # split count.
    n_models = len(models)
    preds = preds / n_models
    importances = importances / n_models

    with mlflow.start_run():
        mlflow.log_params_flatten(
            {"bst": bst_params, "fit": fit_params, "cv": cv_params}
        )
        # Use the feature names as seen by the trained model.
        features = models[0].feature_name()
        fig = mplt.feature_importance(features, importances, imp_type, limit=30)
        plt.show()

    make_submission(id_date.assign(demand=preds), submission, pred_days)
preds = preds / cv.get_n_splits() importances = importances / cv.get_n_splits() #%% from mlflow_extend import mlflow, plotting as mplt with mlflow.start_run(): mlflow.log_params_flatten({ "bst": bst_params, "fit": fit_params, "cv": cv_params }) features = models[0].feature_name() _ = mplt.feature_importance(features, importances, imp_type, limit=30) #%% def make_submission(test, submission): preds = test[["id", "date", "demand"]] preds = preds.pivot(index="id", columns="date", values="demand").reset_index() preds.columns = ["id"] + ["F" + str(d + 1) for d in range(DAYS_PRED)] evals = submission[submission["id"].str.endswith("evaluation")] vals = submission[["id"]].merge(preds, how="inner", on="id") final = pd.concat([vals, evals]) assert final.drop("id", axis=1).isnull().sum().sum() == 0