Example #1
    def run(self):
        d = RawData()

        with timer("load calendar.csv"):
            d.calendar = pd.read_csv("./m5-forecasting-accuracy/calendar.csv").pipe(
                reduce_mem_usage
            )

        with timer("load sales_train_validation.csv"):
            d.sales_train_validation = pd.read_csv(
                "./m5-forecasting-accuracy/sales_train_evaluation.csv"
            ).pipe(reduce_mem_usage)

        # with timer("convert christmas data to rmean"):
        #     for d_str in d.calendar[d.calendar["date"].isin(events.christmas_dates)][
        #         "d"
        #     ]:
        #         d_int = int(d_str.replace("d_", ""))
        #         d.sales_train_validation[d_str] = d.sales_train_validation[
        #             [f"d_{i}" for i in range(d_int - 15, d_int + 15) if i != d_int]
        #         ].apply(lambda row: row.mean(), axis=1)

        with timer("load sample_submission.csv"):
            d.sample_submission = pd.read_csv(
                "./m5-forecasting-accuracy/sample_submission.csv"
            ).pipe(reduce_mem_usage)

        with timer("load sell_prices.csv"):
            d.sell_prices = pd.read_csv(
                "./m5-forecasting-accuracy/sell_prices.csv"
            ).pipe(reduce_mem_usage)

        self.dump(d)
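
Example #1 and most of the tasks below lean on two project utilities that are never shown here: timer (a logging context manager, sometimes called with mlflow_on=True) and reduce_mem_usage (a dtype-downcasting helper). The following is only a minimal sketch of what they might look like, assuming typical Kaggle-style implementations; the repository's actual versions may differ.

# Hedged sketch only -- assumed implementations, not the repository's code.
import time
from contextlib import contextmanager

import pandas as pd


@contextmanager
def timer(name: str, mlflow_on: bool = False):
    # Print how long the wrapped block took; mlflow_on is accepted to match
    # the call sites above but is ignored in this sketch.
    start = time.time()
    yield
    print(f"[{name}] done in {time.time() - start:.1f} s")


def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    # Downcast integer and float columns to smaller dtypes where possible.
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast="integer")
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast="float")
    return df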
 def run(self):
     with timer("combine features"):
         with timer("concat features"):
             data: pd.DataFrame = pd.concat(
                 [
                     self.load("data"),
                     self.load("fe_price_rolling"),
                     self.load("fe_price_change"),
                     self.load("fe_price_basic"),
                     self.load("fe_shift"),
                     self.load("fe_rolling_mean"),
                     self.load("fe_rolling_dw_mean"),
                     self.load("fe_rolling_group_mean"),
                     self.load("fe_rolling_group_std"),
                     self.load("fe_rolling_std"),
                     self.load("fe_rolling_skew"),
                     self.load("fe_rolling_kurt"),
                     self.load("te_data"),
                     self.load("fe_catch22_pca"),
                     self.load("fe_weather"),
                     self.load("fe_unemployment"),
                     self.load("fe_stock"),
                     self.load("fe_event"),
                     self.load("fe_event_strength"),
                 ],
                 axis=1,
             )
         sp_idx: SplitIndex = self.load("sp_idx")
         sp: Split = Split()
         sp.train = data.iloc[sp_idx.train, :]
         sp.test = data.iloc[sp_idx.test, :]
         print(sp.train.info())
     self.dump(sp)
Example #3
 def run(self):
     config = Config()
     data: pd.DataFrame = pd.concat(
         [self.load("data"), self.load("fe_event")], axis=1)
     train_df: pd.DataFrame = data[(data.d > config.START_DAY)
                                   & (data.d <= 1913)]
     # train_df = train_df.sample(int(len(train_df) * 0.15))
     with timer("create grouped df"):
         grouped: List[Tuple[List[str],
                             pd.DataFrame]] = target_encoding(train_df)
     with timer("merge into data"):
         for group_key, grouped_df in tqdm(grouped):
             data = data.merge(grouped_df, on=group_key, how="left")
         df = reduce_mem_usage(data.filter(like="fe_te_"))
         print(df.info())
     self.dump(df)
Example #4
    def run(self):
        raw: RawData = self.load("raw")
        data: pd.DataFrame = self.load("data")

        raw.calendar["d"] = raw.calendar["d"].map(
            lambda d: int(d.replace("d_", "")))
        raw.calendar["date_time"] = raw.calendar["date"]

        weather = read_weather_data()
        weather = weather[[
            "date_time",
            "state_id",
            "fe_weather_mintempC",
            "fe_weather_maxtempC",
            "fe_weather_humidity",
            "fe_weather_sunHour",
            "fe_weather_cloudcover",
        ]]

        with timer("merge data"):
            data = data.merge(raw.calendar[["d", "date_time"]],
                              on="d",
                              how="left")
            data = data.merge(weather,
                              on=["date_time", "state_id"],
                              how="left")

        df = data.filter(like="fe_weather_")
        print(df.info())

        self.dump(df)
Example #5
def read_weather_data(
        external_data_path: str = "./external_data") -> pd.DataFrame:
    files: Dict[str, int] = {
        "californiaw.csv": 0,
        "texasw.csv": 1,
        "wisconsinw.csv": 2,
    }

    weather = pd.DataFrame()
    with timer("load weather data"):
        if os.path.exists(f"{external_data_path}/weather"):
            for file_name, state_id in files.items():
                _tmp_weather = pd.read_csv(
                    f"{external_data_path}/weather/{file_name}")
                _tmp_weather["state_id"] = state_id
                _tmp_weather["date_time"] = pd.to_datetime(
                    _tmp_weather["date_time"]).dt.strftime("%Y-%m-%d")
                weather = pd.concat([weather, _tmp_weather], axis=0)
                del _tmp_weather
            weather.columns = [
                f"fe_weather_{col}"
                if col not in ["date_time", "state_id"] else col
                for col in weather.columns
            ]
            print(weather.columns)
    return weather
Example #6
    def run(self):
        data: pd.DataFrame = self.load()

        with timer("make rolling_price_std_t7"):
            data["fe_rolling_price_std_t7"] = (data.groupby([
                "id"
            ])["sell_price"].transform(lambda x: x.rolling(7).std()).astype(
                np.float16))

        with timer("make rolling_price_std_t30"):
            data["fe_rolling_price_std_t30"] = (data.groupby([
                "id"
            ])["sell_price"].transform(lambda x: x.rolling(30).std()).astype(
                np.float16))

        df = data.filter(like="fe_rolling_price")
        print(df.info())
        self.dump(df)
 def run(self):
     with timer("combine val features"):
         with timer("concat features"):
             data: pd.DataFrame = pd.concat(
                 [
                     self.load("data"),
                     self.load("fe_price_rolling"),
                     self.load("fe_price_change"),
                     self.load("fe_price_basic"),
                     self.load("fe_shift"),
                     self.load("fe_rolling_mean"),
                     self.load("fe_rolling_dw_mean"),
                     self.load("fe_rolling_group_mean"),
                     self.load("fe_rolling_group_std"),
                     self.load("fe_rolling_std"),
                     self.load("fe_rolling_skew"),
                     self.load("fe_rolling_kurt"),
                     self.load("fe_weather"),
                     self.load("fe_unemployment"),
                     self.load("fe_stock"),
                     self.load("fe_event"),
                     self.load("fe_event_strength"),
                     self.load("fe_catch22_pca"),
                 ],
                 axis=1,
             )
         with timer("merge target features"):
             config = Config()
             te_val_data: List[pd.DataFrame] = self.load("te_val_data")
             splits: List[Split] = []
             sp_idxs: List[SplitIndex] = self.load("sp_idxs")
             for i in tqdm(range(len(sp_idxs))):
                 sp: Split = Split()
                 data = pd.concat([data, te_val_data[i]], axis=1)
                 sp.train = data.iloc[sp_idxs[i].train, :]
                 sp.test = data.iloc[sp_idxs[i].test, :]
                 if config.CV_SAMPLE_RATE != 1:
                     sp.train = sp.train.sample(
                         int(len(sp.train) * config.CV_SAMPLE_RATE))
                 splits.append(sp)
                 print(sp.train.info())
                 data = data.drop(list(data.filter(like="fe_te_").columns),
                                  axis=1)
     self.dump(splits)
Example #8
 def run(self):
     data: pd.DataFrame = self.load()
     with timer("make shift features"):
         for days in tqdm(list(range(5, 9)) + list(range(28, 43))):
             data[f"shift_t{days}"] = (
                 data.groupby(["id"])["sales"]
                 .transform(lambda x: x.shift(days))
                 .astype(np.float16)
             )
     df = data.filter(like="shift_t")
     print(df.info())
     self.dump(df)
Example #9
 def run(self):
     config = Config()
     data: pd.DataFrame = pd.concat(
         [self.load("data"), self.load("fe_event")], axis=1)
     dfs: List[pd.DataFrame] = []
     for end_day in config.CV_START_DAYS:
         with timer("create grouped df"):
             # train_df: pd.DataFrame = data[
             #     (data.d > config.START_DAY) & (data.d < end_day)
             # ]
             train_df: pd.DataFrame = data[data.d < end_day]
             grouped: List[Tuple[List[str],
                                 pd.DataFrame]] = target_encoding(train_df)
         with timer("merge into data"):
             df = data.copy()
             for group_key, grouped_df in tqdm(grouped):
                 df = df.merge(grouped_df, on=group_key, how="left")
             df = reduce_mem_usage(df.filter(like="fe_te_"))
             print(df.info())
             dfs.append(df)
     self.dump(dfs)
Example #10
def target_encoding(train_df: pd.DataFrame) -> List[Tuple[List[str], pd.DataFrame]]:
    group_keys = [
        ["item_id"],
        ["item_id", "tm_w"],
        ["item_id", "tm_dw"],
        ["dept_id", "tm_w"],
        ["cat_id", "tm_w"],
        ["store_id", "dept_id"],
        ["store_id", "dept_id", "tm_w"],
        ["store_id", "dept_id", "tm_m"],
        ["store_id", "tm_w"],
        ["store_id", "tm_m"],
        ["store_id", "tm_d"],
        ["store_id", "snap"],
        ["store_id", "snap", "tm_dw"],
        ["state_id", "item_id"],
        ["state_id", "item_id", "tm_dw"],
        ["state_id", "item_id", "tm_w"],
        ["state_id", "item_id", "tm_m"],
        ["state_id", "item_id", "snap"],
        ["state_id", "item_id", "snap", "tm_dw"],
        ["state_id", "item_id", "fe_event"],
        ["state_id", "item_id", "fe_event_dw"],
        ["store_id", "item_id"],
        ["store_id", "item_id", "tm_dw"],
        ["store_id", "item_id", "tm_w"],
        ["store_id", "item_id", "tm_m"],
        ["store_id", "item_id", "tm_d"],
        ["store_id", "item_id", "snap"],
        ["store_id", "item_id", "snap", "tm_dw"],
        ["store_id", "item_id", "fe_event"],
        ["store_id", "item_id", "fe_event_dw"],
    ]

    result: List[Tuple[List[str], pd.DataFrame]] = []
    methods = ["mean", "std"]
    with timer("target encoding"):
        for group_key in tqdm(group_keys):
            columns = []
            columns += group_key
            columns += [
                "fe_te_{}_{}".format("_".join(group_key), method)
                for method in methods
            ]
            tmp_df = (
                train_df[group_key + ["sales"]]
                .groupby(group_key)
                .agg({"sales": methods})
                .reset_index()
            )
            tmp_df.columns = columns
            tmp_df.reset_index(inplace=True, drop=True)
            result.append((group_key, tmp_df))
    return result
Example #11
 def run(self):
     data: pd.DataFrame = self.load()
     with timer("make rolling features"):
         for lag in [28]:
             for w_size in tqdm([30]):
                 data[f"fe_rolling_kurt_t{lag}_{w_size}"] = (
                     data.groupby(["id"])["sales"]
                     .transform(lambda x: x.shift(lag).rolling(w_size).kurt())
                     .astype(np.float16)
                 )
     df = data.filter(like="fe_rolling_kurt")
     print(df.info())
     self.dump(df)
Example #12
 def run(self):
     data: pd.DataFrame = self.load()
     with timer("make rolling mean"):
         lag_wsize = []
         for lag in [1, 14, 7, 28]:
             for w_size in [7, 30, 60, 90, 180]:
                 lag_wsize.append([data[["id", "d", "sales"]], lag, w_size, "mean"])
         data = pd.concat(
             [data, df_parallelize_run(make_lag_roll, lag_wsize)], axis=1
         )
     df = data.filter(like="fe_rolling_mean")
     print(df.info())
     self.dump(df)
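
df_parallelize_run is used above but not defined in these examples. A minimal sketch, assuming it simply fans the argument list out over a multiprocessing pool and concatenates the returned single-column frames side by side (which matches how make_lag_roll's output is used); the repository's actual helper may differ.

# Hedged sketch only -- pool size, chunking, etc. are assumptions.
from multiprocessing import Pool, cpu_count
from typing import Any, Callable, List

import pandas as pd


def df_parallelize_run(func: Callable, arg_list: List[Any]) -> pd.DataFrame:
    # Run func over every argument tuple in a process pool and stack the
    # resulting one-column DataFrames along the column axis.
    with Pool(cpu_count()) as pool:
        dfs = pool.map(func, arg_list)
    return pd.concat(dfs, axis=1)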
Example #13
    def run(self):
        data: pd.DataFrame = self.load()

        with timer("make price lag_1 features"):
            data["lag_price_t1"] = data.groupby(
                ["id"])["sell_price"].transform(lambda x: x.shift(1))
            data["fe_price_change_t1"] = (data["lag_price_t1"] -
                                          data["sell_price"]) / (
                                              data["lag_price_t1"])
            data.drop("lag_price_t1", axis=1, inplace=True)

        with timer("make price lag_365 features"):
            data["rolling_price_max_t365"] = data.groupby([
                "id"
            ])["sell_price"].transform(lambda x: x.shift(1).rolling(365).max())
            data["fe_price_change_t365"] = (data["rolling_price_max_t365"] -
                                            data["sell_price"]) / (
                                                data["rolling_price_max_t365"])
            data.drop("rolling_price_max_t365", axis=1, inplace=True)

        df = data.filter(like="fe_price_change")
        print(df.info())
        self.dump(df)
Example #14
def cls_postprocessing(cv_num: int, test_pred: pd.DataFrame) -> pd.DataFrame:
    with timer("cls_postprocessing"):
        config = Config()
        df_val: pd.DataFrame = pickle.load(
            open(f"./output/cv_cls/{config.CLS_TIMESTAMP}/0/df_val.pkl", "rb")
        )
        test_pred["tmp_id"] = (
            test_pred["id"].astype(str) + "_" + test_pred["d"].astype(str)
        )
        df_val = df_val[df_val["sales_is_zero_pred"] >= config.CLS_THRESHOLD]
        tmp_ids = df_val["id"].astype(str) + "_" + df_val["d"].astype(str)
        test_pred.loc[test_pred["tmp_id"].isin(tmp_ids), "sales"] = 0
        test_pred.drop(["tmp_id"], axis=1, inplace=True)
    return test_pred
    def run(self):
        raw: RawData = self.load("raw")
        data: pd.DataFrame = self.load("data")

        raw.calendar["d"] = raw.calendar["d"].map(lambda d: int(d.replace("d_", "")))

        unemployment = read_unemployment_data(date_range=raw.calendar[["date"]])

        with timer("merge data"):
            data = data.merge(raw.calendar[["d", "date"]], on="d", how="left").merge(
                unemployment, on=["date", "state_id"], how="left"
            )

        df = data.filter(like="fe_unemployment")
        print(df.info())

        self.dump(df)
Example #16
def make_lag_roll(LAG_WSIZE: List[Any]):
    df: pd.DataFrame = LAG_WSIZE[0]
    lag = LAG_WSIZE[1]
    w_size = LAG_WSIZE[2]
    method: str = LAG_WSIZE[3]
    # group_ids: List[str] = df.drop(["id", "d", "sales"]).columns.tolist()
    print(lag, w_size, method)

    col_name: str = ""
    if method == "group_mean":
        pass
        # col_name = "fe_rolling_{}_mean_{}_{}".format("_".join(group_ids), lag, w_size)
        # with timer("create {}".format(col_name)):
        #     _tmp = df.groupby(["d"] + group_ids)["sales"].mean().reset_index()
        #     _tmp[col_name] = _tmp.groupby(group_ids)["sales"].transform(
        #         lambda x: x.shift(lag).rolling(w_size).mean()
        #     )
        #     _tmp.drop("sales", axis=1, inplace=True)
        #     df = df.merge(_tmp, on=["d"] + group_ids, how="left")

    else:
        col_name = f"fe_rolling_{method}_t{lag}_{w_size}"
        with timer("create {}".format(col_name)):
            if method == "mean":
                df[col_name] = (
                    df.groupby("id")["sales"]
                    .transform(lambda x: x.shift(lag).rolling(w_size).mean())
                    .astype(np.float16)
                )
            if method == "std":
                df[col_name] = (
                    df.groupby("id")["sales"]["sales"]
                    .transform(lambda x: x.shift(lag).rolling(w_size).std())
                    .astype(np.float16)
                )
            if method == "dw_mean":
                df[col_name] = (
                    df.groupby(["id", "tm_dw"])["sales"]
                    .transform(lambda x: x.shift(lag).rolling(w_size).mean())
                    .astype(np.float16)
                )

    return df[[col_name]]
def read_unemployment_data(
    date_range: pd.DataFrame, external_data_path: str = "./external_data"
) -> pd.DataFrame:
    files: Dict[str, int] = {
        "CA.csv": 0,
        "TX.csv": 1,
        "WI.csv": 2,
    }

    unemployment: pd.DataFrame = pd.DataFrame()
    with timer("load unemployment data"):
        if os.path.exists(f"{external_data_path}/unemployment"):
            for file_name, state_id in files.items():
                _tmp_unemployment = pd.read_csv(
                    f"{external_data_path}/unemployment/{file_name}"
                )
                _tmp_unemployment["date"] = pd.to_datetime(
                    _tmp_unemployment["DATE"]
                ).dt.strftime("%Y-%m-%d")
                _tmp_unemployment.drop("DATE", axis=1, inplace=True)
                _tmp_unemployment.rename(
                    {"{}UR".format(file_name.replace(".csv", "")): "fe_unemployment"},
                    axis=1,
                    inplace=True,
                )
                _tmp_unemployment = date_range.merge(
                    _tmp_unemployment, on="date", how="left"
                )
                _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[
                    "fe_unemployment"
                ].interpolate()
                _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[
                    "fe_unemployment"
                ].fillna(method="bfill")
                _tmp_unemployment["state_id"] = state_id
                unemployment = pd.concat([unemployment, _tmp_unemployment], axis=0)
                del _tmp_unemployment
    return unemployment
Example #18
def train(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> lgb.Booster:
    config = Config()
    timer_name: str = f"train CV_{cv_num}"
    if model_number:
        timer_name += f"_{model_number}"
    with timer(timer_name, mlflow_on=True):
        model = lgb.train(
            params,
            train_set,
            num_boost_round=config.num_boost_round,
            verbose_eval=verbose_eval,
            # early_stopping_rounds=early_stopping_rounds,
            valid_sets=valid_sets,
        )
    return model
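
For context, a hedged sketch of how this train wrapper might be invoked from a CV loop. The Split fields (sp.train, sp.test) appear in the split-building tasks above, but the feature list, target name, and Config attributes used here (config.features, config.TARGET, config.lgbm_params) are illustrative assumptions, not confirmed names.

# Hedged usage sketch only; the Config attribute names are assumptions.
train_set = lgb.Dataset(sp.train[config.features], label=sp.train[config.TARGET])
valid_set = lgb.Dataset(sp.test[config.features], label=sp.test[config.TARGET])
model = train(
    cv_num=0,
    params=config.lgbm_params,
    train_set=train_set,
    valid_sets=[valid_set],
    verbose_eval=100,
)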
Example #19
 def run(self):
     data: pd.DataFrame = self.load()
     groups = [
         ["item_id"],
         ["store_id"],
         ["state_id", "item_id"],
     ]
     with timer("make group std"):
         for group in tqdm(groups):
             for lag in tqdm([28]):
                 for w_size in tqdm([7, 30, 180]):
                     col_name = "fe_rolling_{}_std_{}_{}".format(
                         "_".join(group), lag, w_size
                     )
                     _tmp = data.groupby(["d"] + group)["sales"].mean().reset_index()
                     _tmp[col_name] = _tmp.groupby(group)["sales"].transform(
                         lambda x: x.shift(lag).rolling(w_size).std()
                     )
                     _tmp.drop("sales", axis=1, inplace=True)
                     data = data.merge(_tmp, on=["d"] + group, how="left")
     df = data.filter(like="fe_rolling_")
     print(df.info())
     self.dump(df)
    def run(self):
        raw: RawData = self.load("raw")
        data: pd.DataFrame = self.load("data")

        raw.calendar["d"] = raw.calendar["d"].map(
            lambda d: int(d.replace("d_", "")))

        df = pd.DataFrame()
        if os.path.isfile("./external_data/stock.csv"):
            stock = pd.read_csv("./external_data/stock.csv")
            stock.columns = [
                "date", "close_last", "volume", "open", "high", "low"
            ]
            stock["date"] = pd.to_datetime(
                stock["date"]).dt.strftime("%Y-%m-%d")
            for col in ["close_last", "open", "high", "low"]:
                stock[col] = stock[col].map(
                    lambda x: float(x.replace("$", "")))
            stock = stock[["date", "close_last", "volume"]]
            stock.columns = ["date", "fe_stock_price", "fe_stock_volume"]
            stock = raw.calendar[["date"]].merge(stock, on="date", how="left")
            stock["fe_stock_price"] = (stock["fe_stock_price"].fillna(
                method="ffill").fillna(method="bfill"))
            stock["fe_stock_volume"] = (stock["fe_stock_volume"].fillna(
                method="ffill").fillna(method="bfill"))

            with timer("merge data"):
                data = data.merge(raw.calendar[["d", "date"]],
                                  on="d",
                                  how="left").merge(stock,
                                                    on="date",
                                                    how="left")

            df = data.filter(like="fe_stock")
            print(df.info())

        self.dump(df)
Example #21
def train_cls(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> LGBMClassifier:
    config = Config()
    timer_name: str = f"train CV_{cv_num}"
    if model_number:
        timer_name += f"_{model_number}"
    with timer(timer_name, mlflow_on=True):
        model = LGBMClassifier(**config.lgbm_cls_params)
        model.fit(
            train_set.data,
            train_set.label,
            categorical_feature=config.lgbm_cat_features,
            eval_set=[(dataset.data, dataset.label) for dataset in valid_sets],
            eval_metric="logloss,auc,cross_entropy",
            verbose=10,
        )
    return model
import sklearn.preprocessing
import sklearn.cluster

import sys
import os

import pandas as pd
from thunderbolt import Thunderbolt

sys.path.append(os.getcwd() + "/../..")
from kaggle_m5_forecasting.utils import timer

tb = Thunderbolt("./../../resource")
data: pd.DataFrame = tb.get_data("MakeData")
data = data[data.d < 1942]

# %%

with timer("calc grouped aggregates"):
    grouped = data.groupby(["id"])["sales"].agg(
        mean=lambda x: x.dropna().values.mean(),
        percentile25=lambda x: x.dropna().sort_values()[:int(len(x) * 0.25)].mean(),
        percentile50=lambda x: x.dropna().sort_values()[int(len(x) * 0.25):int(len(x) * 0.5)].mean(),
        percentile75=lambda x: x.dropna().sort_values()[int(len(x) * 0.5):int(len(x) * 0.75)].mean(),
        percentile100=lambda x: x.dropna().sort_values()[int(len(x) * 0.75):].mean(),
        std=lambda x: x.dropna().values.std(),
        # (snippet truncated here; the full set of catch22 aggregations appears
        # in the later task that builds catch22_df)
    )
Example #23
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from thunderbolt import Thunderbolt
import sys
import os

sys.path.append(os.getcwd() + "/../..")
from kaggle_m5_forecasting.utils import timer

tb = Thunderbolt("./../../resource")
data: pd.DataFrame = tb.get_data("MakeData")

# %%

with timer("calc rolling_store_id_cat_id_mean"):
    lag = 28
    w_size = 30
    data["fe_rolling_store_id_cat_id_mean"] = data.groupby([
        "store_id", "cat_id"
    ])["sales"].transform(lambda x: x.shift(lag).rolling(w_size).mean())

# %%
tb = Thunderbolt("./../../resource")
tb.get_data("FERollingGroupMean")
Example #24
    def run(self):
        raw: RawData = self.load()

        id_vars = [
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ]

        with timer("melt sales_train_validation"):
            data: pd.DataFrame = pd.melt(
                raw.sales_train_validation,
                id_vars=id_vars,
                var_name="d",
                value_name="sales",
            )
            print_mem_usage(data)

        with timer("add test data"):
            add_df = pd.DataFrame()
            for i in tqdm(range(1, 29)):
                tmp_df = raw.sales_train_validation[id_vars].drop_duplicates()
                tmp_df["d"] = f"d_{1941+i}"
                tmp_df["sales"] = np.nan
                add_df = pd.concat([add_df, tmp_df])
            data = pd.concat([data, add_df]).reset_index(drop=True)
            del add_df
            print_mem_usage(data)

        with timer("str to category"):
            for col in tqdm(id_vars):
                data[col] = data[col].astype("category")
            print_mem_usage(data)

        with timer("merge release"):
            data = merge_by_concat(
                data,
                raw.sell_prices.groupby(
                    ["store_id",
                     "item_id"])["wm_yr_wk"].agg(release=np.min).reset_index(),
                ["store_id", "item_id"],
            )
            print_mem_usage(data)

        with timer("merge wm_yr_wk"):
            data = merge_by_concat(data, raw.calendar[["wm_yr_wk", "d"]],
                                   ["d"])
            print_mem_usage(data)

        with timer("cutoff data before release"):
            data = data[data["wm_yr_wk"] >= data["release"]].reset_index(
                drop=True)
            print_mem_usage(data)

        reduce_mem_usage(data)

        with timer("make calendar events"):
            raw.calendar["cal_blackfriday"] = (
                raw.calendar["date"].str[5:].isin([
                    "2011-11-25",
                    "2012-11-23",
                    "2013-11-29",
                    "2014-11-28",
                    "2015-11-27",
                ])).astype(np.int8)
            raw.calendar.loc[raw.calendar["cal_blackfriday"] == 1,
                             "event_name_1"] = "BlackFriday"
            raw.calendar.loc[raw.calendar["cal_blackfriday"] == 1,
                             "event_type_1"] = "other"

        with timer("merge calendar"):
            icols = [
                "event_name_1",
                "event_type_1",
                "event_name_2",
                "event_type_2",
                "snap_CA",
                "snap_TX",
                "snap_WI",
            ]
            data = data.merge(
                raw.calendar.drop(
                    [
                        "wm_yr_wk", "weekday", "wday", "month", "year",
                        "cal_blackfriday"
                    ],
                    axis=1,
                ),
                on=["d"],
                how="left",
            )
            for col in tqdm(icols):
                data[col].fillna("unknown", inplace=True)
                data[col] = data[col].astype("category")
            data["date"] = pd.to_datetime(data["date"])
            print_mem_usage(data)

        with timer("make snap"):
            data["snap"] = 0
            data.loc[(data.snap_CA == 1) & (data.state_id == "CA"), "snap"] = 1
            data.loc[(data.snap_TX == 1) & (data.state_id == "TX"), "snap"] = 1
            data.loc[(data.snap_WI == 1) & (data.state_id == "WI"), "snap"] = 1

        with timer("make some features from date"):
            data["tm_d"] = data["date"].dt.day.astype(np.int8)
            data["tm_w"] = data["date"].dt.week.astype(np.int8)
            data["tm_m"] = data["date"].dt.month.astype(np.int8)
            data["tm_y"] = data["date"].dt.year
            data["tm_quarter"] = data["date"].dt.quarter.astype(np.int8)
            data["tm_y"] = (data["tm_y"] - data["tm_y"].min()).astype(np.int8)
            data["tm_wm"] = data["tm_d"].apply(
                lambda x: np.ceil(x / 7)).astype(np.int8)
            data["tm_dw"] = data["date"].dt.dayofweek.astype(np.int8)
            data["tm_w_end"] = (data["tm_dw"] >= 5).astype(np.int8)
            # data["tm_moon_phase"] = (
            #     data["date"].map(lambda d: get_moon_phase(d)).astype(np.int8)
            # )
            data.loc[data["event_type_1"] == "National", "tm_w_end"] = 1
            del data["date"]
            print_mem_usage(data)

        with timer("merge sell_prices"):
            data = data.merge(raw.sell_prices,
                              on=["store_id", "item_id", "wm_yr_wk"],
                              how="left")

        with timer("convert 'd' to int"):
            data["d"] = data["d"].apply(lambda x: x[2:]).astype(np.int16)
            data["sales_is_zero"] = (data["sales"] == 0).astype(np.int8)
            print_mem_usage(data)

        with timer("label encoding"):
            cat_encoders: Dict[str, sklearn.preprocessing.LabelEncoder] = {}
            cat_features: List[str] = [
                "item_id",
                "dept_id",
                "cat_id",
                "store_id",
                "state_id",
                "event_name_1",
                "event_type_1",
                "event_name_2",
                "event_type_2",
            ]
            for feature in tqdm(cat_features):
                encoder = sklearn.preprocessing.LabelEncoder()
                encoder.fit(data[feature])
                data[feature] = encoder.transform(data[feature])
                cat_encoders[feature] = encoder
            pickle.dump(cat_encoders, open("./cat_encoders.pkl", "wb"))

        print(data.info())

        self.dump(data)
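
merge_by_concat and print_mem_usage are used in this task but not shown. Minimal sketches follow, assuming merge_by_concat merges only the key columns and then concatenates the new columns back (the usual trick for avoiding the dtype upcasting a plain left merge can cause) and print_mem_usage simply reports the frame's footprint; the repository's actual helpers may differ.

# Hedged sketches only -- assumed implementations of the helpers used above.
from typing import List

import pandas as pd


def merge_by_concat(df1: pd.DataFrame, df2: pd.DataFrame,
                    merge_on: List[str]) -> pd.DataFrame:
    # Left-merge df2 using only the key columns of df1, then bolt the new
    # columns onto df1 so the existing columns keep their dtypes.
    merged = df1[merge_on].merge(df2, on=merge_on, how="left")
    new_columns = [col for col in merged.columns if col not in merge_on]
    return pd.concat([df1, merged[new_columns]], axis=1)


def print_mem_usage(df: pd.DataFrame) -> None:
    # Report the DataFrame's deep memory usage in megabytes.
    print(f"memory usage: {df.memory_usage(deep=True).sum() / 1024 ** 2:.1f} MB")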
Example #25
    def run(self):
        data: pd.DataFrame = self.load()
        data = data[data.d < 1942]

        with timer("calc grouped aggregates"):
            catch22_df = data.groupby(["id"])["sales"].agg(
                mean=lambda x: x.dropna().values.mean(),
                percentile25=lambda x: x.dropna().sort_values()[:int(len(x) * 0.25)].mean(),
                percentile50=lambda x: x.dropna().sort_values()[int(len(x) * 0.25):int(len(x) * 0.5)].mean(),
                percentile75=lambda x: x.dropna().sort_values()[int(len(x) * 0.5):int(len(x) * 0.75)].mean(),
                percentile100=lambda x: x.dropna().sort_values()[int(len(x) * 0.75):].mean(),
                std=lambda x: x.dropna().values.std(),
                CO_Embed2_Dist_tau_d_expfit_meandiff=lambda x: catch22.CO_Embed2_Dist_tau_d_expfit_meandiff(x.dropna().tolist()),
                CO_f1ecac=lambda x: catch22.CO_f1ecac(x.dropna().tolist()),
                CO_FirstMin_ac=lambda x: catch22.CO_FirstMin_ac(x.dropna().tolist()),
                CO_HistogramAMI_even_2_5=lambda x: catch22.CO_HistogramAMI_even_2_5(x.dropna().tolist()),
                CO_trev_1_num=lambda x: catch22.CO_trev_1_num(x.dropna().tolist()),
                DN_HistogramMode_10=lambda x: catch22.DN_HistogramMode_10(x.dropna().tolist()),
                DN_HistogramMode_5=lambda x: catch22.DN_HistogramMode_5(x.dropna().tolist()),
                DN_OutlierInclude_n_001_mdrmd=lambda x: catch22.DN_OutlierInclude_n_001_mdrmd(x.dropna().tolist()),
                DN_OutlierInclude_p_001_mdrmd=lambda x: catch22.DN_OutlierInclude_p_001_mdrmd(x.dropna().tolist()),
                FC_LocalSimple_mean1_tauresrat=lambda x: catch22.FC_LocalSimple_mean1_tauresrat(x.dropna().tolist()),
                FC_LocalSimple_mean3_stderr=lambda x: catch22.FC_LocalSimple_mean3_stderr(x.dropna().tolist()),
                IN_AutoMutualInfoStats_40_gaussian_fmmi=lambda x: catch22.IN_AutoMutualInfoStats_40_gaussian_fmmi(x.dropna().tolist()),
                MD_hrv_classic_pnn40=lambda x: catch22.MD_hrv_classic_pnn40(x.dropna().tolist()),
                PD_PeriodicityWang_th0_01=lambda x: catch22.PD_PeriodicityWang_th0_01(x.dropna().tolist()),
                SB_BinaryStats_diff_longstretch0=lambda x: catch22.SB_BinaryStats_diff_longstretch0(x.dropna().tolist()),
                SB_BinaryStats_mean_longstretch1=lambda x: catch22.SB_BinaryStats_mean_longstretch1(x.dropna().tolist()),
                SB_MotifThree_quantile_hh=lambda x: catch22.SB_MotifThree_quantile_hh(x.dropna().tolist()),
                SB_TransitionMatrix_3ac_sumdiagcov=lambda x: catch22.SB_TransitionMatrix_3ac_sumdiagcov(x.dropna().tolist()),
                SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1(x.dropna().tolist()),
                SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1(x.dropna().tolist()),
                SP_Summaries_welch_rect_area_5_1=lambda x: catch22.SP_Summaries_welch_rect_area_5_1(x.dropna().tolist()),
                SP_Summaries_welch_rect_centroid=lambda x: catch22.SP_Summaries_welch_rect_centroid(x.dropna().tolist()),
            )

        print(catch22_df.info())
        self.dump(catch22_df)
    def run(self):
        data: pd.DataFrame = self.load("data")
        raw: RawData = self.load("raw")

        prices_df: pd.DataFrame = raw.sell_prices.copy()
        with timer("basic price aggregations"):
            prices_df["fe_price_max"] = prices_df.groupby(
                ["store_id", "item_id"])["sell_price"].transform(np.max)
            prices_df["fe_price_min"] = prices_df.groupby(
                ["store_id", "item_id"])["sell_price"].transform(np.min)
            prices_df["fe_price_std"] = prices_df.groupby(
                ["store_id", "item_id"])["sell_price"].transform(np.std)
            prices_df["fe_price_mean"] = prices_df.groupby(
                ["store_id", "item_id"])["sell_price"].transform(np.mean)
            prices_df["fe_price_discount"] = (prices_df["fe_price_mean"] -
                                              prices_df["sell_price"])
            prices_df["fe_price_discount_rate"] = (
                prices_df["fe_price_discount"] / prices_df["fe_price_mean"])
            prices_df["fe_price_skew"] = prices_df.groupby(
                ["store_id",
                 "item_id"])["sell_price"].transform(lambda x: x.skew())
            prices_df["fe_price_kurt"] = prices_df.groupby(
                ["store_id",
                 "item_id"])["sell_price"].transform(lambda x: x.kurt())
            prices_df["fe_price_norm"] = (prices_df["sell_price"] /
                                          prices_df["fe_price_max"])
            prices_df["fe_price_nunique"] = prices_df.groupby(
                ["store_id", "item_id"])["sell_price"].transform("nunique")
            prices_df["fe_price_item_nunique"] = prices_df.groupby(
                ["store_id", "sell_price"])["item_id"].transform("nunique")
            prices_df = prices_df.merge(
                raw.calendar[["wm_yr_wk", "month",
                              "year"]].drop_duplicates(subset=["wm_yr_wk"]),
                on=["wm_yr_wk"],
                how="left",
            )

        with timer("calc price momentum"):
            prices_df["fe_price_momentum"] = prices_df[
                "sell_price"] / prices_df.groupby([
                    "store_id", "item_id"
                ])["sell_price"].transform(lambda x: x.shift(1))
            prices_df["fe_price_momentum_m"] = prices_df[
                "sell_price"] / prices_df.groupby([
                    "store_id", "item_id", "month"
                ])["sell_price"].transform("mean")
            prices_df["fe_price_momentum_y"] = prices_df[
                "sell_price"] / prices_df.groupby([
                    "store_id", "item_id", "year"
                ])["sell_price"].transform("mean")
            del prices_df["month"], prices_df["year"]

        with timer("merge prices_df"):
            cat_encoders: Dict[
                str, sklearn.preprocessing.LabelEncoder] = pickle.load(
                    open("./cat_encoders.pkl", "rb"))
            for col in ["store_id", "item_id"]:
                prices_df[col] = cat_encoders[col].transform(prices_df[col])
            data = data.merge(prices_df,
                              on=["store_id", "item_id", "wm_yr_wk"],
                              how="left")

        df = data.filter(like="fe_price")
        df = reduce_mem_usage(df)
        print(df.info())
        self.dump(df)
Example #27
def target_encoding_catch22(train_df: pd.DataFrame) -> List[Tuple[List[str], pd.DataFrame]]:
    group_keys = [
        ["store_id", "item_id"],
    ]
    result: List[Tuple[List[str], pd.DataFrame]] = []
    with timer("target encoding"):
        for group_key in tqdm(group_keys):
            with timer("{} te".format(str(group_key))):
                tmp_df = train_df.groupby(group_key)["sales"].agg(
                    fe_te_CO_Embed2_Dist_tau_d_expfit_meandiff=lambda x: catch22.CO_Embed2_Dist_tau_d_expfit_meandiff(x.tolist()),
                    fe_te_CO_f1ecac=lambda x: catch22.CO_f1ecac(x.tolist()),
                    fe_te_CO_FirstMin_ac=lambda x: catch22.CO_FirstMin_ac(x.tolist()),
                    fe_te_CO_HistogramAMI_even_2_5=lambda x: catch22.CO_HistogramAMI_even_2_5(x.tolist()),
                    fe_te_CO_trev_1_num=lambda x: catch22.CO_trev_1_num(x.tolist()),
                    fe_te_DN_HistogramMode_10=lambda x: catch22.DN_HistogramMode_10(x.tolist()),
                    fe_te_DN_HistogramMode_5=lambda x: catch22.DN_HistogramMode_5(x.tolist()),
                    fe_te_DN_OutlierInclude_n_001_mdrmd=lambda x: catch22.DN_OutlierInclude_n_001_mdrmd(x.tolist()),
                    fe_te_DN_OutlierInclude_p_001_mdrmd=lambda x: catch22.DN_OutlierInclude_p_001_mdrmd(x.tolist()),
                    fe_te_FC_LocalSimple_mean1_tauresrat=lambda x: catch22.FC_LocalSimple_mean1_tauresrat(x.tolist()),
                    fe_te_FC_LocalSimple_mean3_stderr=lambda x: catch22.FC_LocalSimple_mean3_stderr(x.tolist()),
                    fe_te_IN_AutoMutualInfoStats_40_gaussian_fmmi=lambda x: catch22.IN_AutoMutualInfoStats_40_gaussian_fmmi(x.tolist()),
                    fe_te_MD_hrv_classic_pnn40=lambda x: catch22.MD_hrv_classic_pnn40(x.tolist()),
                    fe_te_PD_PeriodicityWang_th0_01=lambda x: catch22.PD_PeriodicityWang_th0_01(x.tolist()),
                    fe_te_SB_BinaryStats_diff_longstretch0=lambda x: catch22.SB_BinaryStats_diff_longstretch0(x.tolist()),
                    fe_te_SB_BinaryStats_mean_longstretch1=lambda x: catch22.SB_BinaryStats_mean_longstretch1(x.tolist()),
                    fe_te_SB_MotifThree_quantile_hh=lambda x: catch22.SB_MotifThree_quantile_hh(x.tolist()),
                    fe_te_SB_TransitionMatrix_3ac_sumdiagcov=lambda x: catch22.SB_TransitionMatrix_3ac_sumdiagcov(x.tolist()),
                    fe_te_SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1(x.tolist()),
                    fe_te_SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1(x.tolist()),
                    fe_te_SP_Summaries_welch_rect_area_5_1=lambda x: catch22.SP_Summaries_welch_rect_area_5_1(x.tolist()),
                    fe_te_SP_Summaries_welch_rect_centroid=lambda x: catch22.SP_Summaries_welch_rect_centroid(x.tolist()),
                )
                # keep the group keys as columns so the result can be merged back on group_key
                tmp_df.reset_index(inplace=True)
                result.append((group_key, tmp_df))
    return result