Example #1
    def recreate_past_forecasts(self, table_all_features, list_dwps, horizon=10, model_config=None):

        if model_config is None:
            print("model not specified - using default model")
            model_config = deepcopy(self.default_model_config)
            print(model_config)

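        # Horizon = number of whole months between date_when_predicting and
        # date_to_predict (both stored as YYYYMM integers).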
        table_all_features['horizon'] = \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.year -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.year) * 12 + \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.month -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.month)
        features_int = ["date_when_predicting", "label", "date_to_predict", "sku", "target", "country", "brand",
                        "tier", "stage", "horizon"]
        features = [x for x in table_all_features.keys() if x not in features_int]

        resfinal = pd.DataFrame()
        feature_importance_df_final = pd.DataFrame()
        for datwep in list_dwps:
            print("date when predicting: " + str(datwep))
            res = pd.DataFrame()
            feature_importance_df = pd.DataFrame()
            for h in range(1, horizon + 1):
                print("training model at horizon: " + str(h))
                subdata = table_all_features[(table_all_features.horizon == h) & (~table_all_features.target.isnull())
                                             & (table_all_features.date_to_predict <= datwep)]
                x_train = subdata[features].values
                y_train = subdata.target

                data_test = table_all_features[(table_all_features.date_when_predicting == datwep) &
                                               (table_all_features.horizon == h)].copy()

                x_test = data_test[features].values
                model = MLDCModel(
                    model_name=model_config.model_name,
                    model_params=model_config.model_params
                )

                model.fit(x_train, y_train)
                preds = model.predict(x_test)
                preds = preds.clip(min=0)
                data_test['horizon'] = h
                data_test['prediction'] = preds
                res = pd.concat([res, data_test[["label", "date_to_predict", "sku", "prediction", "horizon"]]])

                feature_importance = dict(
                    zip(features, zip(model.feature_importances_, [h] * len(model.feature_importances_))))
                feature_importance = pd.DataFrame(feature_importance, index=['importance', 'horizon']).T
                feature_importance_df = pd.concat([feature_importance_df, feature_importance.reset_index()],
                                                  ignore_index=True)
            feature_importance_df['date_when_predict'] = datwep
            self.feature_importance = feature_importance_df
            self.feature_importance.to_csv(str(datwep) + '_feature_importance.csv')
            res = self.correct_fc(res, month_to_correct=[7, 'CNY', 11], thrsh=0.05)

            resfinal = pd.concat([resfinal, res])
            feature_importance_df_final = pd.concat([feature_importance_df_final,feature_importance_df])
            feature_importance_df_final.to_csv('./data/feature_importance_all_df.csv')

        return resfinal
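A minimal invocation sketch for the method above; the forecaster instance, feature table, and YYYYMM date list are hypothetical placeholders, not names from the original codebase:

# Hypothetical usage sketch - forecaster, table_all_features and list_dwps are
# placeholders; recreate_past_forecasts and default_model_config are assumed to
# exist on the class as shown above.
list_dwps = [201907, 201908, 201909]             # dates when predicting (YYYYMM)
forecasts = forecaster.recreate_past_forecasts(
    table_all_features,                          # feature table built upstream
    list_dwps,
    horizon=10,                                  # one model per horizon 1..10
)                                                # model_config=None -> default model
print(forecasts.groupby("horizon").prediction.mean())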
Example #2
def Generate_Forecast_IL(config, dwp_test, table_all_features, df):
    resfinal = pd.DataFrame()
    feature_importance_df_final = pd.DataFrame()

    # Filter features to train the model
    features = [
        x for x in table_all_features.keys() if x not in config["features_int"]
    ]
    features_cat = [
        c for c in table_all_features.keys()
        if ('spre' in c) or ('upre' in c) or ('mainstream' in c)
    ]

    for datwep in dwp_test:
        print(datwep)
        res = pd.DataFrame()
        feature_importance_df = pd.DataFrame()
        for h in range(1, config["horizon"] + 1):
            print("training model at horizon: " + str(h))

            subdata = table_all_features[
                (table_all_features.horizon == h)
                & (~table_all_features.target.isnull())
                & (table_all_features.date_to_predict <= datwep)]
            if not config["FirstRun"]:
                feature_importance_df_sets = Load_50_feature(config)
                features = list(feature_importance_df_sets[str(h)]) +\
                           features_cat +\
                           config["features_cat_col"] +\
                           config["features_cat_fsct_col"]

            x_train = subdata[features].values
            y_train = subdata.target

            data_test = table_all_features[
                (table_all_features.date_when_predicting == datwep)
                & (table_all_features.horizon == h)].copy()

            x_test = data_test[features].values

            # An alternative blending setup (XGBRegressor, GradientBoostingRegressor
            # and AdaBoostRegressor predictions combined by a grid-searched
            # KNeighborsRegressor, plus country_brand dummies) was also prototyped
            # here but is disabled; only the random forest is active.
            model = MLDCModel(
                model_name=config["model_config_RandomForestRegressor"].model_name,
                model_params=config["model_config_RandomForestRegressor"].model_params)
            model.fit(x_train, y_train)
            print(model)
            preds = model.predict(x_test)

            preds = preds.clip(min=0)
            data_test['horizon'] = h
            data_test['prediction'] = preds
            res = pd.concat([
                res, data_test[[
                    "label", "date_to_predict", "country", "brand",
                    "country_brand", "prediction", "horizon"
                ]]
            ])
            feature_importance = dict(
                zip(
                    features,
                    zip(model.feature_importances_,
                        [h] * len(model.feature_importances_))))
            feature_importance = pd.DataFrame(feature_importance,
                                              index=['importance',
                                                     'horizon']).T
            feature_importance_df = pd.concat(
                [feature_importance_df, feature_importance.reset_index()],
                ignore_index=True)
        feature_importance_df['date_when_predict'] = datwep
        feature_importance_df.to_csv(os.path.join(config["temp_folder_path"],
                                                  str(datwep) + '_feature_importance_RF.csv'))

        res_, res__ = correct_fc_il(df,
                                    res,
                                    month_to_correct=[6, 'CNY', 11],
                                    thrsh=0.05)
        resfinal = pd.concat([resfinal, res_])
        resfinal = pd.concat([resfinal, res__])
        feature_importance_df_final = pd.concat([feature_importance_df_final, feature_importance_df])
    return resfinal, feature_importance_df_final
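Generate_Forecast_IL above (and model_single_run in a later example) reads a number of keys from config; a hedged sketch of the shape it appears to assume, with placeholder values:

# Illustrative config sketch - key names come from the code in these examples,
# all values are placeholders.
config = {
    "horizon": 10,                      # max forecast horizon in months
    "FirstRun": True,                   # False -> reload top features via Load_50_feature
    "features_int": ["date_when_predicting", "label", "date_to_predict",
                     "target", "country", "brand", "horizon"],
    "features_cat_col": [],             # extra categorical feature columns
    "features_cat_fsct_col": [],
    "feature_test": [],                 # feature subset used by model_single_run
    "model_config_RandomForestRegressor": None,  # a ModelConfig instance in the real pipeline
    "model_config_XGBRegressor": None,           # likewise
    "project_folder_path": ".",
    "temp_folder_path": "./tmp",
}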
Example #3
    def forecast_since_date_at_horizon(self, date_start, horizon):
        """ Function that performs a full forecast since a date of sales
        :param date_start: last date of available sales in the data that need to be used by the model when forecasting
        :param horizon: horizon in the future for which we want a forecast
        :return: a dataframe containing the forecast
        """

        filter_date = min(date_start, self.max_date_available)
        dwps = create_list_period(201801, filter_date, False)
        dwp, dtp = get_all_combination_date(dwps, horizon)

        print("creating the main table")
        table_all_features = self.Create_feature(dwp, dtp)
        
        table_all_features['horizon'] = \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.year -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.year) * 12 + \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.month -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.month)

        # Calculate rolling features
        df_table_feature_rolling, roll_features_sel = self.get_rolling()
        # Merge main features with rolling features
        table_all_features = self.Merge_table_with_rolling(table_all_features,
                                                           df_table_feature_rolling,
                                                           roll_features_sel) 
        # Choose useful features
        features = [x for x in table_all_features.keys()
                    if (x not in self.config["features_int"]) and (x in self.config["feature_sub_brand"])]
        res = pd.DataFrame()
        feature_importance_df = pd.DataFrame()

        for h in range(1, horizon + 1):
            print("training model at horizon: " + str(h))
            subdata = table_all_features[(table_all_features.horizon == h) & (~table_all_features.target.isnull())
                                         & (table_all_features.date_to_predict <= filter_date)]
        
            x_train = subdata[features].values
            y_train = subdata.target

            data_test = table_all_features[(table_all_features.date_when_predicting == filter_date) &
                                                   (table_all_features.horizon == h)].copy()

            x_test = data_test[features].values
            
            model = MLDCModel(
                model_name = self.config["model_config_XGBRegressor_sub_brand"].model_name,
                model_params = self.config["model_config_XGBRegressor_sub_brand"].model_params)
            
            model.fit(x_train, y_train)
            preds = model.predict(x_test)
            preds = preds.clip(min=0)
            data_test['horizon'] = h
            data_test['prediction'] = preds 
            res = pd.concat([res, data_test[["label","date_to_predict", "country","brand",
                                             "country_brand","prediction", "horizon"]]]) 
            # Create feature importance
            feature_importance = dict(zip(features, zip(model.feature_importances_, [h] * len(model.feature_importances_))))
            feature_importance = pd.DataFrame(feature_importance, index=['importance', 'horizon']).T
            feature_importance_df = pd.concat([feature_importance_df, feature_importance.reset_index()], ignore_index=True)

        self.feature_importance = feature_importance_df
        feature_importance_df['date_when_predict'] = date_start
        feature_importance_df.to_csv(os.path.join(
            DIR_TEM, 'feature_importance_df_sets_sub_brand_' + str(date_start) + '.csv'))
        
        # Applying post-processing
        resfinal = self.correct_fc_il(res, month_to_correct=['CNY', 11], thrsh=0.05)
        resfinal["date_when_predicting"] = (
            pd.to_datetime(resfinal["date_to_predict"].astype(int).astype(str), format="%Y%m")
            - resfinal['horizon'].apply(pd.offsets.MonthBegin)
        ).apply(lambda x: x.strftime("%Y%m")).astype(int)
        
        # Output results
        res.to_csv(os.path.join(
            DIR_TEM, 'IL_sub_Brand_Forecast_result' + str(date_start) + '.csv'), index=False)
#         res.to_pickle(os.path.join(DIR_TEST_DATA, 'test_apply_forecast_correction_il.pkl'))
        return resfinal.reset_index(drop=True)
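The date_when_predicting back-computation above subtracts horizon month-begin offsets from date_to_predict; a small standalone check of that arithmetic:

# Standalone check of the month arithmetic used above.
import pandas as pd

dtp, h = 202005, 3
dwp = (pd.to_datetime(str(dtp), format="%Y%m") - pd.offsets.MonthBegin(h)).strftime("%Y%m")
print(dwp)  # '202002' - a May 2020 target predicted three months earlier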
Example #4
    def forecast_since_date_at_horizon(self, date_start, horizon, model_config=None):
        """ Function that performs a full forecast since a date of sales

        :param date_start: last date of available sales in the data that need to be used by the model when forecasting
        :param horizon: horizon in the future for which we want a forecast
        :param model_config: instance of ModelConfig that contains information about the model that will be used
        :return: a dataframe containing the forecast
        """

        if model_config is None:
            print("model not specified - using default model")
            model_config = deepcopy(self.default_model_config)
            print(model_config)

        max_date_available = self.all_sales.calendar_yearmonth.max()
        filter_date = min(date_start, max_date_available)
        dwps = create_list_period(201701, filter_date, False)
        dwp, dtp = get_all_combination_date(dwps, horizon)

        print("creating the main table")
        table_all_features = self.create_all_features(dwp, dtp)
        table_all_features['horizon'] = \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.year -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.year) * 12 + \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.month -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.month)
        features_int = ["date_when_predicting", "label", "date_to_predict", "sku", "target", "country", "brand",
                        "tier", "stage", "horizon"]
        features = [x for x in table_all_features.keys() if x not in features_int]
        res = pd.DataFrame()
        feature_importance_df = pd.DataFrame()

        for h in range(1, horizon + 1):
            print("training model at horizon: " + str(h))
            subdata = table_all_features[(table_all_features.horizon == h) & (~table_all_features.target.isnull())]
            x_train = subdata[features].values
            y_train = subdata.target
            data_test = table_all_features[(table_all_features.date_when_predicting == filter_date) &
                                           (table_all_features.horizon == h)].copy()

            x_test = data_test[features].values
            model = MLDCModel(
                model_name=model_config.model_name,
                model_params=model_config.model_params
            )

            model.fit(x_train, y_train)
            preds = model.predict(x_test)
            preds = preds.clip(min=0)

            data_test['prediction'] = preds
            data_test['horizon'] = h
            res = pd.concat([res, data_test[["label", "date_to_predict", "sku", "horizon", "prediction"]]])

            # Creating feature importance
            feature_importance = dict(zip(features, zip(model.feature_importances_, [h] * len(model.feature_importances_))))
            feature_importance = pd.DataFrame(feature_importance, index=['importance', 'horizon']).T
            feature_importance_df = pd.concat([feature_importance_df, feature_importance.reset_index()], ignore_index=True)

        self.feature_importance = feature_importance_df
        self.feature_importance.to_csv(str(date_start) + '_feature_importance.csv')
        res.to_pickle(os.path.join(DIR_TEST_DATA, 'test_apply_forecast_correction_dc.pkl'))
        # Applying post-processing
        res = self.correct_fc(res, month_to_correct=[7, 'CNY', 11])

        return res
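A minimal call sketch for this method; dc_forecaster is a hypothetical instance of the owning class:

# Hypothetical call sketch - dc_forecaster stands in for an instance of the
# class that owns forecast_since_date_at_horizon.
res = dc_forecaster.forecast_since_date_at_horizon(
    date_start=202001,   # last YYYYMM of sales the model may use
    horizon=10,          # forecast 1..10 months ahead
)                        # model_config=None falls back to default_model_config
print(res.head())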
Example #5
def model_single_run(config, dwp_test, table_all_features, df):

    resfinal = pd.DataFrame()
    feature_importance_df_final = pd.DataFrame()

    # Filter features to train the model
    features = [
        x for x in table_all_features.keys()
        if (x not in config["features_int"]) and (x in config["feature_test"])
    ]

    for datwep in dwp_test:
        print(datwep)

        res = pd.DataFrame()
        feature_importance_df = pd.DataFrame()
        for h in range(1, config["horizon"] + 1):
            print("training model at horizon: " + str(h))

            subdata = table_all_features[
                (table_all_features.horizon == h)
                & (~table_all_features.target.isnull())
                & (table_all_features.date_to_predict <= datwep)]
            if not config["FirstRun"]:
                feature_importance_df_sets = Load_50_feature(config)
                features = list(feature_importance_df_sets[str(h)])

            x_train = subdata[features].values
            y_train = subdata.target
            print(x_train.shape)

            data_test = table_all_features[
                (table_all_features.date_when_predicting == datwep)
                & (table_all_features.horizon == h)].reset_index().copy()

            x_test = data_test[features].values
            y_test = data_test.target

            # A 5-fold grid search over max_depth, learning_rate and
            # n_estimators (first with lgb.LGBMRegressor, later with the
            # MLDCModel itself) was prototyped here but is disabled; the
            # configured XGBRegressor is used directly.
            model = MLDCModel(
                model_name=config["model_config_XGBRegressor"].model_name,
                model_params=config["model_config_XGBRegressor"].model_params)
            model.fit(x_train, y_train)
            train_preds = model.predict(x_train)
            print("======================================================")
            print("Train RMSE : ",
                  np.sqrt(mean_squared_error(y_train, train_preds)))

            preds = model.predict(x_test)

            # Root mean squared error (RMSE) on the held-out month; 201912 is
            # presumably the last month with known actuals in this dataset.
            if data_test['date_to_predict'].iloc[0] <= 201912:
                print("======================================================")
                print("Test RMSE : ",
                      np.sqrt(mean_squared_error(y_test, preds)))
            preds = preds.clip(min=0)
            data_test['horizon'] = h
            data_test['prediction'] = preds
            res = pd.concat([
                res, data_test[[
                    "label", "date_to_predict", "country", 'brand', "tier",
                    "country_brand", "sub_brand", "prediction", "horizon"
                ]]
            ])
            feature_importance = dict(
                zip(
                    features,
                    zip(model.feature_importances_,
                        [h] * len(model.feature_importances_))))
            feature_importance = pd.DataFrame(feature_importance,
                                              index=['importance',
                                                     'horizon']).T
            feature_importance_df = pd.concat(
                [feature_importance_df, feature_importance.reset_index()],
                ignore_index=True)
        feature_importance_df['date_when_predict'] = datwep
        feature_importance_df.to_csv(os.path.join(config["project_folder_path"],
                                                  config["temp_folder_path"],
                                                  str(datwep) + '_feature_importance_with_Q2.csv'))
        res_ = correct_fc_il(df,
                             res,
                             month_to_correct=[6, 'CNY', 11],
                             thrsh=0.05)
        resfinal = pd.concat([resfinal, res_])
        feature_importance_df_final = pd.concat([feature_importance_df_final, feature_importance_df])

    return resfinal, feature_importance_df_final
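model_single_run (like Generate_Forecast_IL) assumes Load_50_feature returns a table keyed by horizon strings whose values are selected feature names; a hypothetical sketch consistent with the access pattern list(feature_importance_df_sets[str(h)]):

# Hypothetical sketch of Load_50_feature - the real implementation and file
# location are not shown in these examples.
import pandas as pd

def Load_50_feature(config):
    # One column per horizon ("1", "2", ...); each column lists the feature
    # names selected for that horizon by an earlier run.
    path = config["temp_folder_path"] + "/feature_importance_df_sets.csv"  # assumed path
    return pd.read_csv(path)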
Example #6
    def recreate_past_forecasts(self, table_all_features, list_dwps, horizon=10, model_config=None):

        if model_config is None:
            print("model not specified - using default model")
            model_config = deepcopy(self.default_model_config)

        table_all_features['horizon'] = \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.year -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.year) * 12 + \
            (pd.to_datetime(table_all_features.date_to_predict, format='%Y%m').dt.month -
             pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m').dt.month)
        features_int = ["date_when_predicting", "label", "date_to_predict", "sku_wo_pkg", "target", "country", "brand",
                        "tier", "stage", "horizon", "sku_w_pkg"]
        features = [x for x in table_all_features.keys() if x not in features_int]

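        # Keep only rows that already carry a forecast_eln value (non-null).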
        table_all_features = table_all_features[~table_all_features.forecast_eln.isnull()]

        # 3. Prediction at sku granularity di and ieb
        # print("Training SKU model for IEB and DI")
        resfinal = pd.DataFrame()
        for datwep in list_dwps:
            print(f".. date when predicting {datwep}")
            res = pd.DataFrame()
            for h in range(1, horizon + 1):
                print(f".... horizon {h}")
                subdata = table_all_features[(table_all_features.horizon == h) & (~table_all_features.target.isnull())
                                             & (table_all_features.date_to_predict <= datwep)]
                if subdata.shape[0] < 1:
                    continue
                x_train = subdata[features].values
                y_train = subdata.target

                data_test_di = table_all_features[(table_all_features.date_when_predicting == datwep) &
                                                   (table_all_features.horizon == h) &
                                                   (table_all_features.label == 'di')]

                x_test_di = data_test_di[features].values

                if not self.debug:

                    # Disabled variants here instantiated xgb.XGBRegressor or
                    # ExtraTreesRegressor with per-horizon params and
                    # zero-filled NaNs in x_train / x_test_di before fitting.
                    model = MLDCModel(
                        model_name=model_config.model_name,
                        model_params=model_config.model_params
                    )
                    model.fit(x_train, y_train)
                    preds_di = model.predict(x_test_di)

                else:
                    # dummy prediction
                    preds_di = np.ones(len(x_test_di)) * 3000

                preds_di = preds_di.clip(0)

                data_test_di = data_test_di.assign(
                    horizon=h,
                    prediction=preds_di
                )
                data_test = data_test_di
                # data_test['prediction'] = data_test['forecast']
                res = pd.concat([res, data_test[["label", "date_to_predict", "sku_wo_pkg", "sku_w_pkg", "prediction", "date_when_predicting", "horizon"]]])

            # rescale il di eib
            #res = self.correct_fc(res, month_to_correct=['CNY', 11], thrsh=0.05)

            resfinal = pd.concat([resfinal, res])

        return resfinal