Example #1
def target():
    # Build the regression target: sales de-trended by a leakage-safe
    # 28-day-shifted, 365-day rolling mean of each series.
    ID = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "id.pkl")
    sales = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "sales.pkl")
    df = pd.DataFrame({"id": ID, "sales": sales})

    df = reduce_mem_usage(df)

    # shift(28) keeps the rolling statistic strictly outside the 28-day
    # forecast horizon; rolling(365).mean() captures the yearly trend
    df["shift_28_roll_365"] = df.groupby(
        ["id"])["sales"].transform(lambda x: x.shift(28).rolling(365).mean())
    df["sales_residual_diff_28_roll_365"] = df["sales"] - df[
        "shift_28_roll_365"]

    df["shift_28_roll_365"].to_pickle(jsn['FEATURE_TARGET_DIR'] +
                                      "shift_28_roll_365.pkl")
    df["sales_residual_diff_28_roll_365"].to_pickle(
        jsn['FEATURE_TARGET_DIR'] + "sales_residual_diff_28_roll_365.pkl")

    del df, ID, sales
    gc.collect()
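
All of these snippets lean on module-level setup that the excerpts omit: the imports, a `jsn` dict of directory paths loaded from a JSON config, and a `reduce_mem_usage` helper that downcasts numeric columns. A minimal sketch of that context, assuming only the config keys the snippets actually index into; the downcasting logic is the common Kaggle pattern, not necessarily this repo's exact implementation:

import decimal
import gc
import json
import math
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# hypothetical config file holding the directory paths the snippets use
with open("config.json") as f:
    jsn = json.load(f)


def reduce_mem_usage(df):
    # downcast each numeric column to the smallest dtype that holds its range
    for col in df.columns:
        if np.issubdtype(df[col].dtype, np.integer):
            df[col] = pd.to_numeric(df[col], downcast="integer")
        elif np.issubdtype(df[col].dtype, np.floating):
            df[col] = pd.to_numeric(df[col], downcast="float")
    return df
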
Example #2
def make_lag_df():
    '''
    Create data frames of lag features:
    28 dataframes, one per forecast day (day 1 to day 28)
    '''
    ################################# Make DataFrame
    #################################################################################
    for SHIFT_DAY in range(1, 29):

        # LAG_DAY: SHIFT_DAY rounded up to the next multiple of 7
        LAG_DAY = math.ceil(SHIFT_DAY / 7) * 7

        ##Basic
        #folder select (only the lag-feature folder is used here)
        folder4 = jsn["FEATURE_LAG_DIR"]

        df = pd.DataFrame([])

        ######file select

        print(f"make Lag-dataframe for day{SHIFT_DAY}")
        if SHIFT_DAY in [7, 14, 21, 28]:
            # here SHIFT_DAY == LAG_DAY, so the shift lists need no
            # separate SHIFT_DAY entry
            flist4 = []
            for shift in [LAG_DAY, LAG_DAY + 7]:
                for roll in [2, 3, 4, 8, 12]:
                    flist4.append(
                        f"multi_7_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_mean.pkl"
                    )

            for shift in [LAG_DAY, LAG_DAY + 7]:
                for roll in [4, 8]:
                    for func in ["max", "min"]:
                        flist4.append(
                            f"multi_7_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_{func}.pkl"
                        )

            for multi in [2, 3, 5]:
                for roll in [3, 6, 10]:
                    flist4.append(
                        f"multi_{multi}_sales_residual_diff_28_roll_365_shift_{SHIFT_DAY}_roll_{roll}_mean.pkl"
                    )

            for shift in [LAG_DAY, LAG_DAY + 7]:
                for roll in [7, 14, 30, 60]:
                    flist4.append(
                        f"multi_1_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_mean.pkl"
                    )

            for func in ["std", "median", "max", "min"]:
                for shift in [LAG_DAY, LAG_DAY + 7]:
                    for roll in [7, 30]:
                        flist4.append(
                            f"multi_1_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_{func}.pkl"
                        )

            for shift in range(SHIFT_DAY, SHIFT_DAY + 14):
                flist4.append(
                    f"sales_residual_diff_28_roll_365_shift_{shift}.pkl")
        else:
            flist4 = []
            for shift in [SHIFT_DAY, LAG_DAY, LAG_DAY + 7]:
                for roll in [2, 3, 4, 8, 12]:
                    flist4.append(
                        f"multi_7_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_mean.pkl"
                    )

            for shift in [SHIFT_DAY, LAG_DAY, LAG_DAY + 7]:
                for roll in [4, 8]:
                    for func in ["max", "min"]:
                        flist4.append(
                            f"multi_7_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_{func}.pkl"
                        )

            for multi in [2, 3, 5]:
                for roll in [3, 6, 10]:
                    flist4.append(
                        f"multi_{multi}_sales_residual_diff_28_roll_365_shift_{SHIFT_DAY}_roll_{roll}_mean.pkl"
                    )

            for shift in [SHIFT_DAY, LAG_DAY, LAG_DAY + 7]:
                for roll in [7, 14, 30, 60]:
                    flist4.append(
                        f"multi_1_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_mean.pkl"
                    )

            for func in ["std", "median", "max", "min"]:
                for shift in [SHIFT_DAY, LAG_DAY, LAG_DAY + 7]:
                    for roll in [7, 30]:
                        flist4.append(
                            f"multi_1_sales_residual_diff_28_roll_365_shift_{shift}_roll_{roll}_{func}.pkl"
                        )

            for shift in range(SHIFT_DAY, SHIFT_DAY + 14):
                flist4.append(
                    f"sales_residual_diff_28_roll_365_shift_{shift}.pkl")

        folders = [folder4]
        flists = [flist4]

        #make DATAFRAME: each pickled Series becomes one column
        for folder, flist in zip(folders, flists):
            for filename in flist:
                col = pd.read_pickle(folder + filename)
                df[filename[:-4]] = col  # strip the ".pkl" extension
            print(folder)

        df = reduce_mem_usage(df)

        df.to_pickle(jsn["DATAFRAME_DIR"] + f"data_day{SHIFT_DAY}_df.pkl")

        print(f"Lag_DataFrame_day{SHIFT_DAY}")
Example #3
def make_base_df():
    '''
    Create the basic data frame for training, one per prediction term
    '''
    ################################# Make DataFrame
    #################################################################################
    for term in ["private", "public", "validation", "semival"]:

        print(f"make Basic-dataframe for {term}")
        ##Basic
        #folder select
        folder1 = jsn["FEATURE_BASIC_DIR"]
        folder2 = jsn["FEATURE_ENCODING_DIR"]
        folder4 = jsn["FEATURE_LAG_DIR"]

        df = pd.DataFrame([])

        ######file select
        ## BASIC
        flist1 = [
            'month.pkl',
            'day.pkl',
            'price_momentum_y_item_store.pkl',
            #              'w_serial.pkl',
            'item_id.pkl',
            #              'd_serial.pkl',
            #              'snap_TX.pkl',
            'last_sales.pkl',
            #              'snap_CA.pkl',
            'is_weekend.pkl',
            #              'sales.pkl',
            'sell_start_log.pkl',
            'sell_price_minority12.pkl',
            'week_of_year.pkl',
            'price_momentum_m_item_state.pkl',
            'Mothers_day.pkl',
            'year.pkl',
            'IndependenceDay.pkl',
            'olympic_president_elec_year.pkl',
            'event_name_1.pkl',
            'dept_id.pkl',
            'event_name_2.pkl',
            'price_unique_item_state.pkl',
            'date.pkl',
            'id.pkl',
            'moon.pkl',
            'sell_price.pkl',
            'national_holiday.pkl',
            'state_id.pkl',
            #              'event_type_1.pkl',
            'price_momentum_y_item_state.pkl',
            'event_type_statecat_labelenc.pkl',
            'event_type_2.pkl',
            'store_id.pkl',
            'snap_total.pkl',
            'cat_id.pkl',
            #              'snap_WI.pkl',
            'price_unique_item_store.pkl',
            'sell_start.pkl',
            #              'weekday.pkl',
            'week_of_month.pkl',
            'Easter.pkl',
            'wday.pkl',
            'price_momentum_m_item_store.pkl',
            'OrthodoxEaster.pkl',
            'id_serial.pkl',
            'Ramadan_Starts.pkl',
            'NBA_finals.pkl',
            #              'wm_yr_wk.pkl',
            #              'd.pkl',
        ]

        # Encoding
        flist2 = []
        for level in [f"LEVEL{i}" for i in range(2, 13)]:
            flist2.append(
                f"{term}_sales_residual_diff_28_roll_365_enc_{level}_mean.pkl")
            flist2.append(
                f"{term}_sales_residual_diff_28_roll_365_enc_{level}_std.pkl")
        for diff in [28]:
            for level in [f"LEVEL{i}" for i in range(2, 13)]:
                for wd in ["week", "day"]:
                    for func in ["mean", "std"]:
                        flist2.append(
                            f"{term}_sales_residual_diff_{diff}_roll_365_enc_{wd}_{level}_{func}.pkl"
                        )

        folders = [folder1, folder2]
        flists = [flist1, flist2]

        #make DATAFRAME: each pickled Series becomes one column
        for folder, flist in zip(folders, flists):
            for filename in flist:
                col = pd.read_pickle(folder + filename)
                df[filename[:-4]] = col  # strip the ".pkl" extension
            print(folder)

        sales = pd.read_pickle(jsn["FEATURE_TARGET_DIR"] +
                               "sales_residual_diff_28_roll_365.pkl")
        df["sales"] = sales

        df = reduce_mem_usage(df)

        order = [
            'id_serial',
            'id',
            'is_weekend',
            'sell_start',
            'date',
            'snap_total',
            'day',
            'sell_price',
            'event_name_1',
            'week_of_month',
            'wday',
            'week_of_year',
            'sell_start_log',
            'Mothers_day',
            'national_holiday',
            'NBA_finals',
            'sell_price_minority12',
            'year',
            'month',
            'olympic_president_elec_year',
            'OrthodoxEaster',
            'store_id',
            'moon',
            'cat_id',
            'Ramadan_Starts',
            'IndependenceDay',
            'last_sales',
            'event_name_2',
            'event_type_2',
            'event_type_statecat_labelenc',
            'Easter',
            'item_id',
            'dept_id',
            'state_id',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL2_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL2_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL3_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL3_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL4_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL4_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL5_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL5_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL6_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL6_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL7_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL7_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL8_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL8_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL9_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL9_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL10_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL10_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL11_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL11_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL12_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_LEVEL12_std',
            'price_unique_item_state',
            'price_momentum_m_item_state',
            'price_momentum_y_item_state',
            'price_unique_item_store',
            'price_momentum_m_item_store',
            'price_momentum_y_item_store',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL2_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL2_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL2_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL2_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL3_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL3_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL3_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL3_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL4_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL4_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL4_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL4_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL5_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL5_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL5_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL5_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL6_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL6_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL6_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL6_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL7_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL7_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL7_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL7_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL8_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL8_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL8_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL8_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL9_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL9_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL9_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL9_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL10_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL10_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL10_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL10_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL11_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL11_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL11_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL11_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL12_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_week_LEVEL12_std',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL12_mean',
            f'{term}_sales_residual_diff_28_roll_365_enc_day_LEVEL12_std',
            'sales',
        ]

        df = df[order]

        df.to_pickle(jsn["DATAFRAME_DIR"] + f"data_base_{term}_df.pkl")

        print(f"Base_DataFrame_{term}")
Example #4
def lag():
    ###### LAG Rolling
    #############################

    ID = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "id.pkl")
    d_serial = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "d_serial.pkl")
    sales = pd.read_pickle(jsn['FEATURE_TARGET_DIR'] +
                           "sales_residual_diff_28_roll_365.pkl")
    df = pd.DataFrame({"id": ID, "d_serial": d_serial, "sales": sales})
    df = reduce_mem_usage(df)

    #### shift 1-56: save each leakage-safe lag as its own Series pickle
    for shift_day in range(1, 57):
        df.groupby(
            ["id"])["sales"].transform(lambda x: x.shift(shift_day)).to_pickle(
                jsn["FEATURE_LAG_DIR"] +
                f"sales_residual_diff_28_roll_365_shift_{shift_day}.pkl")

    #### rolling
    ####################
    # multi_k splits days into k interleaved strides, so a rolling window
    # inside ('id', f'multi_{k}') aggregates every k-th day
    for i in [1, 2, 3, 5, 7]:
        df[f"multi_{i}"] = df["d_serial"] % i

    for shift_day in range(1, 36):
        target = f"sales_residual_diff_28_roll_365_shift_{shift_day}"

        df[target] = pd.read_pickle(
            jsn["FEATURE_LAG_DIR"] +
            f"sales_residual_diff_28_roll_365_shift_{shift_day}.pkl")
        #### multi 2, 3, 5 rolling
        for roll_wind in [3, 6, 10]:
            for m in [2, 3, 5]:
                df.groupby(['id', f'multi_{m}'])[target].transform(
                    lambda x: x.rolling(roll_wind).mean()).to_pickle(
                        jsn["FEATURE_LAG_DIR"] +
                        f"multi_{m}_sales_residual_diff_28_roll_365_shift_{shift_day}_roll_{roll_wind}_mean.pkl"
                    )

        #### multi 7 rolling (max/min only for windows 4 and 8)
        for roll_wind in [2, 3, 4, 8, 12]:
            funcs = ["mean", "max", "min"] if roll_wind in [4, 8] else ["mean"]
            for func in funcs:
                df.groupby(['id', 'multi_7'])[target].transform(
                    lambda x: x.rolling(roll_wind).agg(func)).to_pickle(
                        jsn["FEATURE_LAG_DIR"] +
                        f"multi_7_sales_residual_diff_28_roll_365_shift_{shift_day}_roll_{roll_wind}_{func}.pkl"
                    )

        #### multi 1 rolling (extra statistics only for windows 7 and 30)
        for roll_wind in [7, 14, 30, 60]:
            funcs = (["mean", "std", "max", "min", "median"]
                     if roll_wind in [7, 30] else ["mean"])
            for func in funcs:
                df.groupby(['id', 'multi_1'])[target].transform(
                    lambda x: x.rolling(roll_wind).agg(func)).to_pickle(
                        jsn["FEATURE_LAG_DIR"] +
                        f"multi_1_sales_residual_diff_28_roll_365_shift_{shift_day}_roll_{roll_wind}_{func}.pkl"
                    )

        df = df.drop(target, axis=1)
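
The `multi_k` columns implement a stride trick: grouping by `('id', 'multi_k')` makes `rolling(w)` aggregate every k-th day, so `multi_7` with `roll_4` is a four-week same-weekday average. A toy check of that behaviour on synthetic data:

toy = pd.DataFrame({
    "id": ["A"] * 28,
    "d_serial": range(28),
    "sales": np.arange(28, dtype=float),
})
toy["multi_7"] = toy["d_serial"] % 7
weekly = toy.groupby(["id", "multi_7"])["sales"].transform(
    lambda x: x.rolling(4).mean())
# row 27 averages days 27, 20, 13, 6 -- its last four same-weekday values
assert weekly.iloc[27] == np.mean([27.0, 20.0, 13.0, 6.0])
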
Example #5
def encoding():

    ID = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "id.pkl")
    item_id = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "item_id.pkl")
    store_id = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "store_id.pkl")
    dept_id = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "dept_id.pkl")
    state_id = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "state_id.pkl")
    cat_id = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "cat_id.pkl")
    d_serial = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "d_serial.pkl")
    sell_price = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "sell_price.pkl")
    wday = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "wday.pkl")
    day = pd.read_pickle(jsn['FEATURE_BASIC_DIR'] + "day.pkl")
    sales = pd.read_pickle(jsn['FEATURE_TARGET_DIR'] +
                           "sales_residual_diff_28_roll_365.pkl")

    df = pd.DataFrame({
        "id": ID,
        "item_id": item_id,
        "store_id": store_id,
        "dept_id": dept_id,
        "state_id": state_id,
        "cat_id": cat_id,
        "d_serial": d_serial,
        "sell_price": sell_price,
        "wday": wday,
        "day": day,
        "sales": sales
    })

    df = reduce_mem_usage(df)

    df["flag"] = 0
    df.loc[df["sell_price"] >= 0, "flag"] = 1
    df.loc[df["flag"] == 0, "sales"] = np.nan

    pred_terms = ["private", "public", "validation", "semival"]

    # aggregation levels 2-12 of the M5 hierarchy (level 1, the grand
    # total, has no grouping key)
    LEVEL = {
        "LEVEL2": ["state_id"],
        "LEVEL3": ["store_id"],
        "LEVEL4": ["cat_id"],
        "LEVEL5": ["dept_id"],
        "LEVEL6": ["state_id", "cat_id"],
        "LEVEL7": ["state_id", "dept_id"],
        "LEVEL8": ["store_id", "cat_id"],
        "LEVEL9": ["store_id", "dept_id"],
        "LEVEL10": ["item_id"],
        "LEVEL11": ["state_id", "item_id"],
        "LEVEL12": ["store_id", "item_id"],
    }

    for i, term in enumerate(pred_terms):

        # mask the term's evaluation horizon; the masking is cumulative,
        # so each later term also hides the horizons of the earlier terms
        df.loc[df["d_serial"] >= 1942 - i * 28, "sales"] = np.nan

        for key, value in LEVEL.items():
            # the string aggregations are the cython ops pandas maps
            # np.mean / np.std to (sample std, ddof=1)
            for by, tag in [(value + ["wday"], f"week_{key}"),
                            (value + ["day"], f"day_{key}"),
                            (value, key)]:
                for func in ["mean", "std"]:
                    df.groupby(by)["sales"].transform(func).to_pickle(
                        jsn["FEATURE_ENCODING_DIR"] +
                        f"{term}_sales_residual_diff_28_roll_365_enc_{tag}_{func}.pkl"
                    )

    del df, ID, item_id, store_id, dept_id, state_id, cat_id, d_serial, sell_price, wday, day, sales
    gc.collect()
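
The masking pattern above (set the term's evaluation horizon to NaN before grouping) is what keeps these encodings leakage-free: `transform("mean")` ignores NaNs when computing each group statistic but still broadcasts the result to every row, held-out rows included. A toy illustration:

toy = pd.DataFrame({"store": ["S1", "S1", "S1", "S2", "S2", "S2"],
                    "sales": [1.0, 3.0, np.nan, 10.0, 20.0, np.nan]})
# NaN rows (the "future") do not contribute, yet still receive the group mean
enc = toy.groupby("store")["sales"].transform("mean")
print(enc.tolist())  # [2.0, 2.0, 2.0, 15.0, 15.0, 15.0]
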
Example #6
def basic():

    train_val_df = pd.read_csv(jsn["TRAIN_DATA_PATH"])
    sell_price_df = pd.read_csv(jsn["SELL_PRICES_PATH"])
    calendar_df = pd.read_csv(jsn["CALENDAR_PATH"])

    ### Basic Category ID
    # add empty columns for the 28 evaluation days (d_1942 .. d_1969)
    for i in range(1942, 1970):
        train_val_df[f"d_{i}"] = np.nan

    train_val_df["id_serial"] = list(range(30490))

    melt_sales = pd.melt(train_val_df,
                         id_vars=[
                             'id_serial', 'id', 'item_id', 'dept_id', 'cat_id',
                             'store_id', 'state_id'
                         ],
                         var_name="d",
                         value_name="sales")

    ####### Calendar feature
    #event2
    calendar_df.loc[calendar_df["event_name_2"] == "Cinco De Mayo",
                    "event_name_2"] = 0
    calendar_df.loc[calendar_df["event_name_2"] == "Easter",
                    "event_name_2"] = 1
    calendar_df.loc[calendar_df["event_name_2"] == "Father's day",
                    "event_name_2"] = 2
    calendar_df.loc[calendar_df["event_name_2"] == "OrthodoxEaster",
                    "event_name_2"] = 3
    calendar_df.loc[calendar_df["event_type_2"] == "Cultural",
                    "event_type_2"] = 0
    calendar_df.loc[calendar_df["event_type_2"] == "Religious",
                    "event_type_2"] = 1
    calendar_df["event_name_2"] = calendar_df["event_name_2"].astype(
        np.float16)
    calendar_df["event_type_2"] = calendar_df["event_type_2"].astype(
        np.float16)

    #weekend
    calendar_df["is_weekend"] = calendar_df["weekday"].isin(
        ["Saturday", "Sunday"]).astype(int)

    # d_serial
    calendar_df["d_serial"] = calendar_df["d"].apply(lambda x: int(x[2:]))

    # w_serial: running week counter (rows 0-6 get 2, rows 7-13 get 3, ...)
    calendar_df["w_serial"] = np.arange(len(calendar_df)) // 7 + 2

    # date
    calendar_df['date'] = pd.to_datetime(calendar_df['date'],
                                         format='%Y-%m-%d')

    # day, month, year
    calendar_df["day"] = calendar_df["date"].dt.day
    calendar_df["month"] = calendar_df["date"].dt.month
    calendar_df["year"] = calendar_df["date"].dt.year

    # Moon
    dec = decimal.Decimal

    def get_moon_phase(d):  # 0=new, 4=full; 4 days/phase
        diff = d - datetime(2001, 1, 1)
        days = dec(diff.days) + (dec(diff.seconds) / dec(86400))
        lunations = dec("0.20439731") + (days * dec("0.03386319269"))
        phase_index = math.floor((lunations % dec(1) * dec(8)) + dec('0.5'))
        return int(phase_index) % 8

    calendar_df['moon'] = calendar_df.date.apply(get_moon_phase)

    # week of month, year
    calendar_df[
        "week_of_month"] = calendar_df["w_serial"] - calendar_df.groupby(
            ["year", "month"])["w_serial"].transform("min") + 1
    calendar_df[
        "week_of_year"] = calendar_df["w_serial"] - calendar_df.groupby(
            ["year"])["w_serial"].transform("min") + 1

    # olympic_president_elec_year (2012 and 2016: Olympic / US-election years)
    calendar_df["olympic_president_elec_year"] = calendar_df["year"].isin(
        [2012, 2016]).astype(int)

    # NBA_Finals
    calendar_df["NBA_finals"] = 0
    for day in [
            "d_123", "d_124", "d_125", "d_126", "d_127", "d_128", "d_129",
            "d_130", "d_131", "d_132", "d_133", "d_134", "d_135", "d_501",
            "d_502", "d_503", "d_504", "d_505", "d_506", "d_507", "d_508",
            "d_509", "d_510", "d_860", "d_861", "d_862", "d_863", "d_864",
            "d_865", "d_866", "d_867", "d_868", "d_869", "d_870", "d_871",
            "d_872", "d_873", "d_874", "d_1224", "d_1225", "d_1226", "d_1227",
            "d_1228", "d_1229", "d_1230", "d_1231", "d_1232", "d_1233",
            "d_1234", "d_1588", "d_1589", "d_1590", "d_1591", "d_1592",
            "d_1593", "d_1594", "d_1595", "d_1596", "d_1597", "d_1598",
            "d_1599", "d_1600", "d_1952", "d_1953", "d_1954", "d_1955",
            "d_1956", "d_1957", "d_1958", "d_1959", "d_1960", "d_1961",
            "d_1962", "d_1963", "d_1964", "d_1965", "d_1966", "d_1967",
            "d_1968", "d_1969"
    ]:
        calendar_df.loc[calendar_df["d"] == day, "NBA_finals"] = 1

    # Ramadan start
    day_list = []
    for i in range(30):
        day_list.append("d_{}".format(i + 185))
        day_list.append("d_{}".format(i + 539))
        day_list.append("d_{}".format(i + 893))
        day_list.append("d_{}".format(i + 1248))
        day_list.append("d_{}".format(i + 1602))
    for i in range(13):
        day_list.append("d_{}".format(i + 1957))

    calendar_df["Ramadan_Starts"] = 0
    for day in day_list:
        calendar_df.loc[calendar_df["d"] == day, "Ramadan_Starts"] = 1

    # Mothers day (1 = the day itself, 2 = the day before)
    day_list_1 = ["d_100", "d_471", "d_835", "d_1199", "d_1563", "d_1927"]
    day_list_2 = ["d_99", "d_470", "d_834", "d_1198", "d_1562", "d_1926"]
    calendar_df["Mothers_day"] = 0
    for day in day_list_1:
        calendar_df.loc[calendar_df["d"] == day, "Mothers_day"] = 1
    for day in day_list_2:
        calendar_df.loc[calendar_df["d"] == day, "Mothers_day"] = 2

    # OrthodoxEaster (1 = the day itself, 2 = the day before)
    day_list_1 = ["d_86", "d_443", "d_828", "d_1178", "d_1535", "d_1920"]
    day_list_2 = ["d_85", "d_442", "d_827", "d_1177", "d_1534", "d_1919"]
    calendar_df["OrthodoxEaster"] = 0
    for day in day_list_1:
        calendar_df.loc[calendar_df["d"] == day, "OrthodoxEaster"] = 1
    for day in day_list_2:
        calendar_df.loc[calendar_df["d"] == day, "OrthodoxEaster"] = 2

    # Easter (1 = the day itself, 2 = the day before)
    day_list_1 = ["d_86", "d_436", "d_793", "d_1178", "d_1528", "d_1885"]
    day_list_2 = ["d_85", "d_435", "d_792", "d_1177", "d_1527", "d_1884"]
    calendar_df["Easter"] = 0
    for day in day_list_1:
        calendar_df.loc[calendar_df["d"] == day, "Easter"] = 1
    for day in day_list_2:
        calendar_df.loc[calendar_df["d"] == day, "Easter"] = 2

    # IndependenceDay: a 10-day window around July 4th, encoded 1..10
    calendar_df["IndependenceDay"] = 0
    for offset in range(10):
        for base in [151, 517, 882, 1247, 1612]:
            calendar_df.loc[calendar_df["d"] == f"d_{base + offset}",
                            "IndependenceDay"] = offset + 1

    ###### price feature
    ##############################
    # derive ids from string structure: "CA_1"[:2] -> "CA",
    # "HOBBIES_1_001"[:-4] -> "HOBBIES_1", "HOBBIES_1"[:-2] -> "HOBBIES"
    sell_price_df["state_id"] = sell_price_df["store_id"].apply(
        lambda x: x[:2])
    sell_price_df["dept_id"] = sell_price_df["item_id"].apply(lambda x: x[:-4])
    sell_price_df["cat_id"] = sell_price_df["dept_id"].apply(lambda x: x[:-2])

    sell_price_df["price_unique_item_state"] = sell_price_df.groupby(
        ['state_id', 'item_id'])['sell_price'].transform('nunique')
    sell_price_df["price_unique_item_store"] = sell_price_df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform('nunique')

    calendar_prices = calendar_df[['wm_yr_wk', 'month', 'year']]
    calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
    sell_price_df = sell_price_df.merge(
        calendar_prices[['wm_yr_wk', 'month', 'year']],
        on=['wm_yr_wk'],
        how='left')

    sell_price_df['price_momentum_m_item_state'] = sell_price_df[
        'sell_price'] / sell_price_df.groupby(
            ['state_id', 'item_id', 'month'])['sell_price'].transform('mean')
    sell_price_df['price_momentum_y_item_state'] = sell_price_df[
        'sell_price'] / sell_price_df.groupby(
            ['state_id', 'item_id', 'year'])['sell_price'].transform('mean')
    sell_price_df['price_momentum_m_item_store'] = sell_price_df[
        'sell_price'] / sell_price_df.groupby(
            ['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    sell_price_df['price_momentum_y_item_store'] = sell_price_df[
        'sell_price'] / sell_price_df.groupby(
            ['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # sell_start: running count of weekly price records per item-store pair
    sell_price_df["sell_start"] = sell_price_df.groupby(
        ["store_id", "item_id"]).cumcount() + 1
    sell_price_df["sell_start_log"] = np.log(sell_price_df["sell_start"])

    sell_price_df = sell_price_df.drop(
        ["month", "year", "state_id", "dept_id", "cat_id"], axis=1)

    ####### merge
    melt_sales = melt_sales.merge(calendar_df, on="d", how="left")
    melt_sales = pd.merge(melt_sales,
                          sell_price_df,
                          on=["store_id", "item_id", "wm_yr_wk"],
                          how='left')

    ####### save by column: every feature becomes one row-aligned Series pickle
    melt_sales = reduce_mem_usage(melt_sales)
    for col in melt_sales.columns:
        melt_sales[col].to_pickle(jsn['FEATURE_BASIC_DIR'] + f"{col}.pkl")

    del melt_sales, sell_price_df, calendar_df, train_val_df
    gc.collect()
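
`pd.melt` is the backbone of the whole feature store: it turns the wide sales table (one `d_*` column per day) into one row per series-day, which is why every feature can later be saved and reloaded as a single row-aligned pickle. A toy version of the reshape:

wide = pd.DataFrame({"id": ["A", "B"], "d_1": [3, 5], "d_2": [4, 6]})
long = pd.melt(wide, id_vars=["id"], var_name="d", value_name="sales")
print(long)
#   id    d  sales
# 0  A  d_1      3
# 1  B  d_1      5
# 2  A  d_2      4
# 3  B  d_2      6
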
Example #7
def total_weight():
    ### make WEIGHT_SCALED_42840, WEIGHT_FORMAT, WEIGHT_SCALED_30490

    train_val_df = pd.read_csv(jsn["TRAIN_DATA_PATH"])
    sell_price_df = pd.read_csv(jsn["SELL_PRICES_PATH"])
    calendar_df = pd.read_csv(jsn["CALENDAR_PATH"])

    # add empty columns for the 28 evaluation days (d_1942 .. d_1969)
    for i in range(1942, 1970):
        train_val_df[f"d_{i}"] = np.nan

    melt_sales = pd.melt(
        train_val_df,
        id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        var_name="day",
        value_name="sales")
    calendar_df['date'] = pd.to_datetime(calendar_df['date'],
                                         format='%Y-%m-%d')
    melt_sales = melt_sales.merge(calendar_df, left_on="day", right_on="d")
    melt_sales = pd.merge(melt_sales,
                          sell_price_df,
                          on=["store_id", "item_id", "wm_yr_wk"],
                          how='left')

    melt_sales = melt_sales[["id", "date", "sales", "sell_price"]]

    melt_sales = reduce_mem_usage(melt_sales)

    ############## Create weight mat

    NUM_ITEMS = train_val_df.shape[0]

    product = train_val_df[[
        'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'
    ]].drop_duplicates()

    # Creating weight mats: one indicator block per aggregation level 1-12
    weight_mat = np.c_[
        np.ones([NUM_ITEMS, 1]).astype(np.int8),  # level 1: total
        pd.get_dummies(product.state_id.astype(str), drop_first=False
                       ).astype('int8').values,  # level 2: state
        pd.get_dummies(product.store_id.astype(str), drop_first=False
                       ).astype('int8').values,  # level 3: store
        pd.get_dummies(product.cat_id.astype(str), drop_first=False
                       ).astype('int8').values,  # level 4: category
        pd.get_dummies(product.dept_id.astype(str), drop_first=False
                       ).astype('int8').values,  # level 5: department
        pd.get_dummies(product.state_id.astype(str) +
                       product.cat_id.astype(str),
                       drop_first=False).astype('int8').values,  # level 6
        pd.get_dummies(product.state_id.astype(str) +
                       product.dept_id.astype(str),
                       drop_first=False).astype('int8').values,  # level 7
        pd.get_dummies(product.store_id.astype(str) +
                       product.cat_id.astype(str),
                       drop_first=False).astype('int8').values,  # level 8
        pd.get_dummies(product.store_id.astype(str) +
                       product.dept_id.astype(str),
                       drop_first=False).astype('int8').values,  # level 9
        pd.get_dummies(product.item_id.astype(str), drop_first=False
                       ).astype('int8').values,  # level 10: item
        pd.get_dummies(product.state_id.astype(str) +
                       product.item_id.astype(str),
                       drop_first=False).astype('int8').values,  # level 11
        np.identity(NUM_ITEMS).astype(np.int8)  # level 12: individual series
    ].T

    np.save(jsn["WEIGHTS_DIR"] + "weight_mat_total.npy", weight_mat)
    weight_mat_csr = csr_matrix(weight_mat)

    ############## Loss function weights are calculated and stored.
    weight1, weight2 = weight_calc(melt_sales, product, weight_mat_csr)

    ############## SAVE WEIGHT

    np.save(jsn["WEIGHTS_DIR"] + "weight1_total.npy", weight1)
    np.save(jsn["WEIGHTS_DIR"] + "weight2_total.npy", weight2)

    # rebuild the aggregation structure as a labelled DataFrame: one column
    # per aggregated series (42,840 in total), one row per base series
    dummy_frames = [
        pd.DataFrame(np.ones([30490, 1]).astype(np.int8),
                     index=product.index,
                     columns=["total"])
    ]
    for keys in [["state_id"], ["store_id"], ["cat_id"], ["dept_id"],
                 ["state_id", "cat_id"], ["state_id", "dept_id"],
                 ["store_id", "cat_id"], ["store_id", "dept_id"],
                 ["item_id"], ["state_id", "item_id"]]:
        level = product[keys[0]].astype(str)
        for key in keys[1:]:
            level = level + product[key].astype(str)
        dummy_frames.append(
            pd.get_dummies(level, drop_first=False).astype('int8'))
    dummy_frames.append(
        pd.DataFrame(np.identity(30490).astype(np.int8),
                     index=product.index,
                     columns=product["id"]))
    df = pd.concat(dummy_frames, axis=1)

    df.index = product.id
    df = df.T

    df.to_pickle(jsn["WEIGHTS_DIR"] + "weight_size_change_format.pkl")

    weight_scaled = weight2**2 / weight1

    # normalize by the level-1 (total) entry, then project the 42840
    # aggregated weights down onto the 30490 base series
    WEIGHT_SCALED_42840 = weight_scaled / weight_scaled[0]
    WEIGHT_SCALED_30490 = np.dot(WEIGHT_SCALED_42840, df.values)

    np.save(jsn["WEIGHTS_DIR"] + "WEIGHT_SCALED_42840.npy",
            WEIGHT_SCALED_42840)
    np.save(jsn["WEIGHTS_DIR"] + "WEIGHT_SCALED_30490.npy",
            WEIGHT_SCALED_30490)
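
`weight_calc` is defined elsewhere in the repo; the call sites show it takes the melted sales, the product table, and the sparse aggregation matrix (plus an optional category label in the per-store variant below) and returns two arrays over the 42,840 aggregated series. A plausible sketch following the standard M5 WRMSSE definition, where `weight1` is the per-series scale (mean squared day-over-day difference) and `weight2` the dollar sales over the last 28 training days; this is an assumption about the helper, not the author's code, and it skips the official metric's leading-zero handling:

def weight_calc(melt_sales, product, weight_mat_csr):
    # hypothetical reimplementation; the repo's version may differ
    pivot = melt_sales.pivot(index="id", columns="date",
                             values="sales").loc[product["id"]]
    sales = np.nan_to_num(pivot.values)[:, :1941]   # training days only
    agg = weight_mat_csr.dot(sales)                 # (42840, 1941)
    # scale: mean squared day-over-day difference per aggregated series
    weight1 = np.mean(np.diff(agg, axis=1) ** 2, axis=1)
    # dollar sales over the last 28 training days per aggregated series
    price = melt_sales.pivot(index="id", columns="date",
                             values="sell_price").loc[product["id"]]
    dollars = np.nan_to_num((pivot * price).values)[:, 1913:1941]
    weight2 = weight_mat_csr.dot(dollars).sum(axis=1)
    return weight1, weight2
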
Example #8
def weight_by_store():
    ################################# Make WEIGHT
    #################################################################################

    ############## LOAD ORIGIN DATA

    train_val_df_origin = pd.read_csv(jsn["TRAIN_DATA_PATH"])
    sell_price_df = pd.read_csv(jsn["SELL_PRICES_PATH"])
    calendar_df = pd.read_csv(jsn["CALENDAR_PATH"])

    #############  Make Weights
    # add empty columns for the 28 evaluation days (d_1942 .. d_1969)
    for i in range(1942, 1970):
        train_val_df_origin[f"d_{i}"] = np.nan

    for category in CATEGORY_ID:

        # CATEGORY_ID and category_name are module-level globals not shown
        # in this snippet; the function name suggests category_name = "store"
        # and CATEGORY_ID = the list of store ids
        train_val_df = train_val_df_origin[
            train_val_df_origin[f"{category_name}_id"] == category]

        melt_sales = pd.melt(train_val_df,
                             id_vars=[
                                 'id', 'item_id', 'dept_id', 'cat_id',
                                 'store_id', 'state_id'
                             ],
                             var_name="day",
                             value_name="sales")
        calendar_df['date'] = pd.to_datetime(calendar_df['date'],
                                             format='%Y-%m-%d')
        melt_sales = melt_sales.merge(calendar_df, left_on="day", right_on="d")
        melt_sales = pd.merge(melt_sales,
                              sell_price_df,
                              on=["store_id", "item_id", "wm_yr_wk"],
                              how='left')

        melt_sales = melt_sales[["id", "date", "sales", "sell_price"]]

        melt_sales = reduce_mem_usage(melt_sales)

        ############## Create weight mat

        NUM_ITEMS = train_val_df.shape[0]

        product = train_val_df[[
            'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'
        ]].drop_duplicates()

        # Weight_MAT: one indicator block per aggregation level 1-12
        weight_mat = np.c_[
            np.ones([NUM_ITEMS, 1]).astype(np.int8),  # level 1: total
            pd.get_dummies(product.state_id.astype(str), drop_first=False
                           ).astype('int8').values,  # level 2: state
            pd.get_dummies(product.store_id.astype(str), drop_first=False
                           ).astype('int8').values,  # level 3: store
            pd.get_dummies(product.cat_id.astype(str), drop_first=False
                           ).astype('int8').values,  # level 4: category
            pd.get_dummies(product.dept_id.astype(str), drop_first=False
                           ).astype('int8').values,  # level 5: department
            pd.get_dummies(product.state_id.astype(str) +
                           product.cat_id.astype(str),
                           drop_first=False).astype('int8').values,  # level 6
            pd.get_dummies(product.state_id.astype(str) +
                           product.dept_id.astype(str),
                           drop_first=False).astype('int8').values,  # level 7
            pd.get_dummies(product.store_id.astype(str) +
                           product.cat_id.astype(str),
                           drop_first=False).astype('int8').values,  # level 8
            pd.get_dummies(product.store_id.astype(str) +
                           product.dept_id.astype(str),
                           drop_first=False).astype('int8').values,  # level 9
            pd.get_dummies(product.item_id.astype(str), drop_first=False
                           ).astype('int8').values,  # level 10: item
            pd.get_dummies(product.state_id.astype(str) +
                           product.item_id.astype(str),
                           drop_first=False).astype('int8').values,  # level 11
            np.identity(NUM_ITEMS).astype(np.int8)  # level 12: individual series
        ].T

        np.save(jsn["WEIGHTS_DIR"] + f"weight_mat_{category}.npy", weight_mat)
        weight_mat_csr = csr_matrix(weight_mat)

        ############## Loss function weights are calculated and stored.
        weight1, weight2 = weight_calc(melt_sales, product, weight_mat_csr,
                                       category)

        ############## SAVE WEIGHT

        np.save(jsn["WEIGHTS_DIR"] + f"weight1_{category}.npy", weight1)
        np.save(jsn["WEIGHTS_DIR"] + f"weight2_{category}.npy", weight2)