Example #1
 def run(self):
     required_columns = {self.index_columns, self.target_column}
     dataset = self.load_data_frame(required_columns=required_columns,
                                    drop_columns=True)
     dataset = dataset.set_index(self.index_columns)
     dataset = reduce_mem_usage(dataset)
     self.dump(dataset)
Example #2
    def run(self):
        required_columns = {self.index_columns, self.target_column, "target"}
        dataset = self.load_data_frame(required_columns=required_columns,
                                       drop_columns=True)
        dataset = dataset.set_index(self.index_columns)

        train = dataset.loc[dataset[self.predict_column].notna(),
                            self.target_column]
        test = dataset.loc[dataset[self.predict_column].isna(),
                           self.target_column]

        categories = train.dropna().unique()

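        # One-hot encode with the category set fixed to the categories seen in
        # train, so train and test get identical columns; dummy_na adds an
        # explicit NaN indicator column.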
        train_dummied = pd.get_dummies(
            pd.Categorical(train, categories),
            prefix="OHE_" + self.target_column,
            dummy_na=True,
        )
        train_dummied.index = train.index

        test_dummied = pd.get_dummies(
            pd.Categorical(test, categories),
            prefix="OHE_" + self.target_column,
            dummy_na=True,
        )
        test_dummied.index = test.index
        result = reduce_mem_usage(
            pd.concat([train_dummied, test_dummied]).sort_index())
        self.dump(result)
Example #3
 def run(self):
     required_columns = {self.index_columns, "month"}
     dataset = self.load_data_frame(required_columns=required_columns,
                                    drop_columns=True)
     dataset = dataset.set_index(self.index_columns)
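     # Cyclical encoding: project the month onto sin/cos components so that
     # December and January end up close together in feature space.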
     dataset["month_sin"] = np.sin(2 * np.pi * dataset["month"] / 12)
     dataset["month_cos"] = np.cos(2 * np.pi * dataset["month"] / 12)
     dataset = dataset[["month_sin", "month_cos"]].fillna(-10)
     dataset = reduce_mem_usage(dataset)
     self.dump(dataset)
Example #4
    def run(self):
        required_columns = {self.index_columns, "ord_0"}
        dataset = self.load_data_frame(required_columns=required_columns,
                                       drop_columns=True)
        dataset = dataset.set_index(self.index_columns)
        dataset = dataset.fillna(0)
        dataset["ord_0"] = dataset["ord_0"].astype(int)
        dataset = dataset.rename(columns={"ord_0": "Ordinary_ord_0"})

        dataset = reduce_mem_usage(dataset)
        self.dump(dataset)
Example #5
 def run(self):
     required_columns = {self.index_columns, self.target_column}
     dataset = self.load_data_frame(required_columns=required_columns,
                                    drop_columns=True)
     dataset = dataset.set_index(self.index_columns)
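     # Ordinal-encode the column via self.ordinary_map; values not in the map
     # become NaN and are filled with 0 below.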
     dataset[self.target_column] = dataset[self.target_column].map(
         self.ordinary_map)
     dataset = dataset.fillna(0)
     dataset[self.target_column] = dataset[self.target_column].astype(int)
     dataset = dataset.rename(
         columns={self.target_column: "Ordinary_" + self.target_column})
     dataset = reduce_mem_usage(dataset)
     self.dump(dataset)
Example #6
    def set_index(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Set the index on rows after to_history_date and return the memory-reduced DataFrame.

        Args:
            data (pd.DataFrame): feature DataFrame

        Returns:
            pd.DataFrame:
        """
        data = data.query(f"d > {self.to_history_date}")
        data = data.set_index(self.index_columns)
        data = reduce_mem_usage(data)
        return data
Example #7
    def run(self):
        required_columns = {self.index_columns, "ord_5"}
        dataset = self.load_data_frame(required_columns=required_columns,
                                       drop_columns=True)
        dataset = dataset.set_index(self.index_columns)
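        # Map each distinct ord_5 value to its 1-based rank in sorted order
        # (ordinal encoding); values left unmapped or missing become 0 below.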
        map_ord5 = {
            key: value + 1
            for value, key in enumerate(
                sorted(dataset["ord_5"].dropna().unique()))
        }
        dataset["Ordinary_ord_5"] = dataset["ord_5"].map(map_ord5)
        dataset = dataset[["Ordinary_ord_5"]]
        dataset = dataset.fillna(0).astype(int)

        dataset = reduce_mem_usage(dataset)
        self.dump(dataset)
Example #8
    def run(self):
        required_columns = {self.index_columns, self.target_column}
        dataset = self.load_data_frame(required_columns=required_columns,
                                       drop_columns=True)
        dataset = dataset.set_index(self.index_columns)
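        # Cast to string so missing values become the literal "nan" and can be
        # label-encoded like any other category.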
        dataset[self.target_column] = (
            dataset[self.target_column].astype(str).fillna("nan"))

        encoder = LabelEncoder()
        dataset[self.target_column] = encoder.fit_transform(
            dataset[self.target_column])

        dataset = dataset.rename(
            columns={
                self.target_column: "BinaryCategorical_" + self.target_column
            })

        dataset = reduce_mem_usage(dataset)
        self.dump(dataset)
Example #9
    def run(self):
        required_columns = {self.index_columns, self.target_column, "target"}
        dataset = self.load_data_frame(required_columns=required_columns,
                                       drop_columns=True)
        dataset = dataset.set_index(self.index_columns)

        train = dataset[dataset[self.predict_column].notna()]

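        # Count encoding: replace each category with its frequency among the
        # training rows; categories absent from train (and NaN) end up as -10.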
        encoding = (
            train[self.target_column].value_counts().reset_index().rename(
                columns={
                    self.target_column: "count_encode_" + self.target_column,
                    "index": self.target_column,
                }))
        result = ((dataset[[self.target_column]].reset_index().merge(
            encoding, on=self.target_column, how="left")).fillna(-10).drop(
                columns=self.target_column).set_index(self.index_columns))
        result = reduce_mem_usage(result)

        self.dump(result)
Example #10
    def run(self):
        self.target_columns = self.target_columns.split(",")
        encoder = self.get_encoder()
        encoder_for_test = deepcopy(encoder)

        dataset: pd.DataFrame = self.load_data_frame("dataset").set_index(
            self.index_columns)
        fold = self.load("fold")

        train = dataset[dataset[self.predict_column].notna()]
        train_y = train[self.predict_column]
        test = dataset[dataset[self.predict_column].isna()]

        encoded_train: pd.DataFrame = pd.DataFrame()

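        # Out-of-fold encoding: fit the encoder on each training fold and
        # transform only the held-out fold, so encoded training rows never see
        # their own target values.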
        for trn_idx, val_idx in fold.split(train, train_y):
            encoder.fit(train.iloc[trn_idx], train_y.iloc[trn_idx])
            encoded_train = pd.concat([
                encoded_train,
                encoder.transform(train.iloc[val_idx])[self.target_columns],
            ])

        encoder_for_test.fit(train, train_y)
        encoded_test = encoder_for_test.transform(test)

        encoded_dataset = pd.concat([encoded_train, encoded_test
                                     ])[self.target_columns].sort_index()

        rename_map = {
            col: encoder.__class__.__name__ + "_" + col
            for col in self.target_columns
        }
        encoded_dataset = encoded_dataset.rename(columns=rename_map)
        encoded_dataset = reduce_mem_usage(encoded_dataset)

        self.dump(encoded_dataset)
Example #11
    def __init__(self, load=False, debug=True):
        def encode_categorical(df, cols):
            for col in cols:
                # Encode NaN as the string 'nan' so it gets its own label.
                le = LabelEncoder()
                df[col] = df[col].fillna("nan")
                df[col] = pd.Series(le.fit_transform(df[col]), index=df.index)
            return df

        def weight_calc(weight_mat_csr, data, product):

            # calculate the denominator of RMSSE, and the weight based on sales amount

            sales_train_val = pd.read_csv(
                "../input/m5-forecasting-accuracy/sales_train_validation.csv")

            d_name = ["d_" + str(i + 1) for i in range(1913)]

            sales_train_val = weight_mat_csr * sales_train_val[d_name].values

            # calculate the start position (first date with non-zero demand) for each item
            # over days 1-1913: set days with no sales to 0, replace 0 with 9999, then take the minimum
            df_tmp = (sales_train_val > 0) * np.tile(np.arange(
                1, 1914), (weight_mat_csr.shape[0], 1))

            start_no = np.min(np.where(df_tmp == 0, 9999, df_tmp), axis=1) - 1

            flag = (np.dot(
                np.diag(1 / (start_no + 1)),
                np.tile(np.arange(1, 1914), (weight_mat_csr.shape[0], 1)),
            ) < 1)

            sales_train_val = np.where(flag, np.nan, sales_train_val)

            # denominator of RMSSE
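            # weight1[i] = sum_t (y[i, t] - y[i, t-1])**2 / (1913 - start_no[i]),
            # i.e. the mean squared day-to-day change after the series' first sale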
            weight1 = np.nansum(np.diff(sales_train_val, axis=1)**2,
                                axis=1) / (1913 - start_no)

            # calculate the sales amount for each item/level
            df_tmp = data[(data["date"] > "2016-03-27")
                          & (data["date"] <= "2016-04-24")]
            df_tmp["amount"] = df_tmp["demand"] * df_tmp["sell_price"]
            df_tmp = df_tmp.groupby(["id"])["amount"].apply(np.sum)
            df_tmp = df_tmp[product.id].values

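            # aggregate the last-28-day sales amounts to every level via the
            # weight matrix, then normalise so the weights sum to 1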
            weight2 = weight_mat_csr * df_tmp

            weight2 = weight2 / np.sum(weight2)

            del sales_train_val
            gc.collect()

            return weight1, weight2

        if load:
            print("loading saved evaluator data")
            self.data = pd.read_pickle("./evaluator_data.pkl")
            self.weight_mat_csr = sparse.load_npz(
                "./evaluator_weight_mat_csr.npz")
            self.weight1 = np.load("./evaluator_weight1.npy",
                                   allow_pickle=True)
            self.weight2 = np.load("./evaluator_weight2.npy",
                                   allow_pickle=True)
        else:
            print("reading csv files")
            calendar = pd.read_csv(
                "../input/m5-forecasting-accuracy/calendar.csv")
            sell_prices = pd.read_csv(
                "../input/m5-forecasting-accuracy/sell_prices.csv")
            sales_train_val = pd.read_csv(
                "../input/m5-forecasting-accuracy/sales_train_validation.csv")
            submission = pd.read_csv(
                "../input/m5-forecasting-accuracy/sample_submission.csv")

            print("encoding")
            # encode for memory
            calendar = encode_categorical(
                calendar,
                [
                    "event_name_1", "event_type_1", "event_name_2",
                    "event_type_2"
                ],
            ).pipe(reduce_mem_usage)

            sales_train_val = encode_categorical(
                sales_train_val,
                ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
            ).pipe(reduce_mem_usage)

            sell_prices = encode_categorical(
                sell_prices, ["item_id", "store_id"]).pipe(reduce_mem_usage)

            product = sales_train_val[[
                "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
            ]].drop_duplicates()

            # to remove data before first non-zero demand date, replace these demand as np.nan.
            d_name = ["d_" + str(i + 1) for i in range(1913)]
            sales_train_val_values = sales_train_val[d_name].values

            # calculate the start position (first date with non-zero demand) for each item
            # over days 1-1913: set days with no sales to 0, replace 0 with 9999, then take the minimum
            tmp = np.tile(np.arange(1, 1914),
                          (sales_train_val_values.shape[0], 1))
            df_tmp = (sales_train_val_values > 0) * tmp
            start_no = np.min(np.where(df_tmp == 0, 9999, df_tmp), axis=1) - 1
            flag = np.dot(np.diag(1 / (start_no + 1)), tmp) < 1
            sales_train_val_values = np.where(flag, np.nan,
                                              sales_train_val_values)

            sales_train_val[d_name] = sales_train_val_values
            del tmp, sales_train_val_values

            sales_train_val = pd.melt(
                sales_train_val,
                id_vars=[
                    "id", "item_id", "dept_id", "cat_id", "store_id",
                    "state_id"
                ],
                var_name="day",
                value_name="demand",
            )

            if debug:
                nrows = 365 * 2 * NUM_ITEMS
                sales_train_val = sales_train_val.iloc[-nrows:, :]

            print("building data")
            sales_train_val = sales_train_val[~sales_train_val["demand"].
                                              isnull()]

            # get the validation and evaluation ids from the submission file
            test1_rows = [
                row for row in submission["id"] if "validation" in row
            ]
            test2_rows = [
                row for row in submission["id"] if "evaluation" in row
            ]

            # take the validation part of the submission file as test1 and the evaluation part as test2
            test1 = submission[submission["id"].isin(test1_rows)]
            test2 = submission[submission["id"].isin(test2_rows)]

            # rename the "F_X" columns of test1 and test2 to the "d_XXXX" format
            test1.columns = ["id"] + [
                f"d_{d}" for d in range(1914, 1914 + DAYS_PRED)
            ]
            test2.columns = ["id"] + [
                f"d_{d}" for d in range(1942, 1942 + DAYS_PRED)
            ]

            # replace '_evaluation' in the test2 ids with '_validation'
            test2["id"] = test2["id"].str.replace("_evaluation", "_validation")

            # merge the id details onto test1 and test2, keyed on id
            test1 = test1.merge(product, how="left", on="id")
            test2 = test2.merge(product, how="left", on="id")

            # melt test1 and test2 into long format (demand is 0 for these rows)
            test1 = pd.melt(
                test1,
                id_vars=[
                    "id", "item_id", "dept_id", "cat_id", "store_id",
                    "state_id"
                ],
                var_name="day",
                value_name="demand",
            )

            test2 = pd.melt(
                test2,
                id_vars=[
                    "id", "item_id", "dept_id", "cat_id", "store_id",
                    "state_id"
                ],
                var_name="day",
                value_name="demand",
            )

            # add a "part" column labelling the rows as train / test1 (validation) / test2 (evaluation)
            sales_train_val["part"] = "train"
            test1["part"] = "test1"
            test2["part"] = "test2"

            # vertically concatenate sales_train_val, test1, and test2
            data = pd.concat([sales_train_val, test1, test2], axis=0)

            # free memory
            del sales_train_val, test1, test2

            # delete test2 for now (only the validation part is submitted before June 1)
            data = data[data["part"] != "test2"]

            # drop some calendar features (weekday, wday, month, year can be rebuilt later from the datetime column)
            calendar.drop(["weekday", "wday", "month", "year"],
                          inplace=True,
                          axis=1)
            # the notebook crashes with the entire dataset (maybe use tensorflow, dask, or pyspark); join calendar onto data with day == d
            data = pd.merge(data,
                            calendar,
                            how="left",
                            left_on=["day"],
                            right_on=["d"])
            data.drop(["d", "day"], inplace=True, axis=1)
            # free memory
            del calendar

            # join the sell price data
            # get the sell price data (this feature should be very important)
            data = data.merge(sell_prices,
                              on=["store_id", "item_id", "wm_yr_wk"],
                              how="left")
            print(
                "Our final dataset to train has {} rows and {} columns".format(
                    data.shape[0], data.shape[1]))
            # free memory
            del sell_prices

            self.data = reduce_mem_usage(data)
            self.data.to_pickle("evaluator_data.pkl")

            print("computing weights")
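            # weight_mat has one block of indicator rows per aggregation level
            # (all items, state, store, category, dept, their pairings, item,
            # item x state, and the identity for individual series); multiplying
            # it by the item-level sales rolls them up to the WRMSSE levels.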
            weight_mat = np.c_[
                np.ones([NUM_ITEMS, 1]).astype(np.int8),  # level 1
                pd.get_dummies(product.state_id.astype(str), drop_first=False
                               ).astype("int8").values,
                pd.get_dummies(product.store_id.astype(str), drop_first=False
                               ).astype("int8").values,
                pd.get_dummies(product.cat_id.astype(str), drop_first=False
                               ).astype("int8").values,
                pd.get_dummies(product.dept_id.astype(str), drop_first=False
                               ).astype("int8").values,
                pd.get_dummies(
                    product.state_id.astype(str) + product.cat_id.astype(str),
                    drop_first=False,
                ).astype("int8").values,
                pd.get_dummies(
                    product.state_id.astype(str) + product.dept_id.astype(str),
                    drop_first=False,
                ).astype("int8").values,
                pd.get_dummies(
                    product.store_id.astype(str) + product.cat_id.astype(str),
                    drop_first=False,
                ).astype("int8").values,
                pd.get_dummies(
                    product.store_id.astype(str) + product.dept_id.astype(str),
                    drop_first=False,
                ).astype("int8").values,
                pd.get_dummies(product.item_id.astype(str), drop_first=False
                               ).astype("int8").values,
                pd.get_dummies(
                    product.state_id.astype(str) + product.item_id.astype(str),
                    drop_first=False,
                ).astype("int8").values,
                np.identity(NUM_ITEMS).astype(np.int8),  # item :level 12
            ].T

            self.weight_mat_csr = sparse.csr_matrix(weight_mat)
            sparse.save_npz("evaluator_weight_mat_csr", self.weight_mat_csr)
            del weight_mat

            self.weight1, self.weight2 = weight_calc(self.weight_mat_csr,
                                                     self.data, product)
            np.save("evaluator_weight1", self.weight1)
            np.save("evaluator_weight2", self.weight2)
Example #12
# ===============
# Main
# ===============
s = time.time()

with timer('load data', logging):
    with open('./data/else/col2path.pkl', 'rb') as f:
        col2path = pickle.load(f)
    X_train_all, X_test = load_datasets(FEATURES, col2path)
    y_train_all = load_target(TARGET_NAME)
    logging.debug(f'feature num: {len(X_train_all.columns)}')

with timer('reduce_mem_usage', logging):
    if REDUCE:
        X_train_all = reduce_mem_usage(X_train_all)
        X_test = reduce_mem_usage(X_test)

with timer(f'load {FOLD_PATH.split("/")[-1]}', logging):
    folds = pd.read_feather(FOLD_PATH)
    n_splits = folds['fold_id'].max() + 1

# with timer('concat oof', logging):
#     path = './features/lgbm_{data}.feather'
#     lgbm_pred_train = pd.read_feather(path.format(data='train'))
#     lgbm_pred_test = pd.read_feather(path.format(data='test'))

# with timer('concat diff', logging):
#     path = './features/lgbm_diff_{data}.feather'
#     lgbm_pred_train = pd.read_feather(path.format(data='train'))
#     lgbm_pred_test = pd.read_feather(path.format(data='test'))
Example #13
    def run(self):
        if self.nrows == 0:
            self.nrows = None

        sales_train_validation = pd.read_csv(
            "../input/m5-forecasting-accuracy/sales_train_validation.csv",
            nrows=self.nrows,
        )

        # Christmas days are outliers, so drop them
        print("dropping Christmas days")
        calendar = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv",
                               usecols=["date", "d"])
        christmas_day = calendar.loc[calendar["date"].str.contains("12-25"),
                                     "d"].tolist()
        sales_train_validation = sales_train_validation.drop(
            columns=christmas_day)
        del calendar, christmas_day

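        # Melt the wide d_XXXX sales columns into long (id, day, sales) rows.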
        sales_train_validation = pd.melt(
            sales_train_validation,
            id_vars=[
                "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
            ],
            var_name="day",
            value_name="sales",
        )
        print(
            "Melted sales train validation has {} rows and {} columns".format(
                sales_train_validation.shape[0],
                sales_train_validation.shape[1]))
        sales_train_validation = reduce_mem_usage(sales_train_validation)

        submission = pd.read_csv(
            "../input/m5-forecasting-accuracy/sample_submission.csv")
        # separate test dataframes
        test1_rows = [row for row in submission["id"] if "validation" in row]
        test2_rows = [row for row in submission["id"] if "evaluation" in row]
        test1 = submission[submission["id"].isin(test1_rows)]
        test2 = submission[submission["id"].isin(test2_rows)]

        # change column names
        test1.columns = [
            "id",
            "d_1914",
            "d_1915",
            "d_1916",
            "d_1917",
            "d_1918",
            "d_1919",
            "d_1920",
            "d_1921",
            "d_1922",
            "d_1923",
            "d_1924",
            "d_1925",
            "d_1926",
            "d_1927",
            "d_1928",
            "d_1929",
            "d_1930",
            "d_1931",
            "d_1932",
            "d_1933",
            "d_1934",
            "d_1935",
            "d_1936",
            "d_1937",
            "d_1938",
            "d_1939",
            "d_1940",
            "d_1941",
        ]
        test2.columns = [
            "id",
            "d_1942",
            "d_1943",
            "d_1944",
            "d_1945",
            "d_1946",
            "d_1947",
            "d_1948",
            "d_1949",
            "d_1950",
            "d_1951",
            "d_1952",
            "d_1953",
            "d_1954",
            "d_1955",
            "d_1956",
            "d_1957",
            "d_1958",
            "d_1959",
            "d_1960",
            "d_1961",
            "d_1962",
            "d_1963",
            "d_1964",
            "d_1965",
            "d_1966",
            "d_1967",
            "d_1968",
            "d_1969",
        ]
        product = sales_train_validation[[
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ]].drop_duplicates()

        # merge with product table
        test2["id"] = test2["id"].str.replace("_evaluation", "_validation")
        test1 = test1.merge(product, how="left", on="id")
        test2 = test2.merge(product, how="left", on="id")
        test2["id"] = test2["id"].str.replace("_validation", "_evaluation")

        # melt test1 and test2 into long format
        test1 = pd.melt(
            test1,
            id_vars=[
                "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
            ],
            var_name="day",
            value_name="sales",
        )
        test2 = pd.melt(
            test2,
            id_vars=[
                "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
            ],
            var_name="day",
            value_name="sales",
        )

        sales_train_validation["part"] = "train"
        test1["part"] = "test1"
        test2["part"] = "test2"

        data = pd.concat([sales_train_validation, test1, test2], axis=0)

        del sales_train_validation, test1, test2

        data = data.loc[40500000:]

        # drop some calendar features
        calendar = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv")
        calendar = reduce_mem_usage(calendar)
        calendar.drop(["weekday", "wday", "month", "year"],
                      inplace=True,
                      axis=1)

        # delete test2 for now
        data = data[data["part"] != "test2"]

        data = pd.merge(data,
                        calendar,
                        how="left",
                        left_on=["day"],
                        right_on=["d"])
        data.drop(["day"], inplace=True, axis=1)
        data["d"] = data["d"].map(lambda x: int(x.split("_")[1]))
        # get the sell price data (this feature should be very important)
        sell_prices = pd.read_csv(
            "../input/m5-forecasting-accuracy/sell_prices.csv")
        sell_prices = reduce_mem_usage(sell_prices)
        data = data.merge(sell_prices,
                          on=["store_id", "item_id", "wm_yr_wk"],
                          how="left")
        print("Our final dataset to train has {} rows and {} columns".format(
            data.shape[0], data.shape[1]))

        self.dump(data)