def recreate_past_forecasts(self, table_all_features, list_dwps, horizon=10, model_config=None):
    """Re-train models and re-create forecasts for a list of past "date when predicting" periods.

    For every period in ``list_dwps`` and every horizon step 1..``horizon``, a fresh
    model is trained on all rows whose prediction date falls on or before that period
    (no look-ahead), then used to score the rows dated at that period.

    :param table_all_features: feature table with ``date_when_predicting`` and
        ``date_to_predict`` columns (YYYYMM ints) plus a ``target`` column
    :param list_dwps: iterable of YYYYMM periods to recreate forecasts for
    :param horizon: number of monthly horizon steps to train/predict (default 10)
    :param model_config: model configuration (``model_name`` / ``model_params``);
        falls back to ``self.default_model_config`` when None
    :return: dataframe of predictions for all requested periods
    """
    if model_config is None:
        print("model not specified - using default model")
        model_config = deepcopy(self.default_model_config)
    print(model_config)
    # Horizon = number of calendar months between prediction date and "as of" date.
    # Convert each date column once instead of four times.
    dtp_dates = pd.to_datetime(table_all_features.date_to_predict, format='%Y%m')
    dwp_dates = pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m')
    table_all_features['horizon'] = \
        (dtp_dates.dt.year - dwp_dates.dt.year) * 12 + \
        (dtp_dates.dt.month - dwp_dates.dt.month)
    # Identifier / label columns that must never be fed to the model.
    features_int = ["date_when_predicting", "label", "date_to_predict", "sku", "target",
                    "country", "brand", "tier", "stage", "horizon"]
    features = [x for x in table_all_features.keys() if x not in features_int]
    resfinal = pd.DataFrame()
    feature_importance_df_final = pd.DataFrame()
    for datwep in list_dwps:
        print("date when predicting: " + str(datwep))
        res = pd.DataFrame()
        feature_importance_df = pd.DataFrame()
        for h in range(1, horizon + 1):
            print("training model at horizon: " + str(h))
            # Train only on rows at this horizon with a known target, dated <= datwep.
            subdata = table_all_features[(table_all_features.horizon == h)
                                         & (~table_all_features.target.isnull())
                                         & (table_all_features.date_to_predict <= datwep)]
            x_train = subdata[features].values
            y_train = subdata.target
            data_test = table_all_features[(table_all_features.date_when_predicting == datwep)
                                           & (table_all_features.horizon == h)].copy()
            x_test = data_test[features].values
            model = MLDCModel(
                model_name=model_config.model_name,
                model_params=model_config.model_params
            )
            model.fit(x_train, y_train)
            preds = model.predict(x_test)
            # Negative volumes are meaningless; clamp predictions at zero.
            preds = preds.clip(min=0)
            data_test['horizon'] = h
            data_test['prediction'] = preds
            res = pd.concat([res, data_test[["label", "date_to_predict", "sku",
                                             "prediction", "horizon"]]])
            feature_importance = dict(
                zip(features, zip(model.feature_importances_,
                                  [h] * len(model.feature_importances_))))
            feature_importance = pd.DataFrame(feature_importance,
                                              index=['importance', 'horizon']).T
            # BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed in
            # pandas 2.0 -- use pd.concat instead.
            feature_importance_df = pd.concat(
                [feature_importance_df, feature_importance.reset_index()],
                ignore_index=True)
        # NOTE(review): the column name keeps the historical typo 'date_when_preidct';
        # downstream consumers of these CSVs may depend on it, so it is left unchanged.
        feature_importance_df['date_when_preidct'] = datwep
        self.feature_importance = feature_importance_df
        self.feature_importance.to_csv(str(datwep) + '_feature_importance.csv')
        res = self.correct_fc(res, month_to_correct=[7, 'CNY', 11], thrsh=0.05)
        resfinal = pd.concat([resfinal, res])
        feature_importance_df_final = pd.concat([feature_importance_df_final,
                                                 feature_importance_df])
    feature_importance_df_final.to_csv('./data/feature_importance_all_df.csv')
    return resfinal
def Generate_Forecast_IL(config, dwp_test, table_all_features, df):
    """Train one RandomForest per horizon for each test period and forecast IL volumes.

    :param config: dict with keys ``features_int``, ``horizon``, ``FirstRun``,
        ``features_cat_col``, ``features_cat_fsct_col``, ``temp_folder_path`` and
        ``model_config_RandomForestRegressor``
    :param dwp_test: iterable of YYYYMM "date when predicting" periods
    :param table_all_features: feature table that already contains ``horizon``,
        ``target``, ``date_to_predict`` and ``date_when_predicting`` columns
    :param df: sales dataframe forwarded to ``correct_fc_il`` for post-correction
    :return: tuple ``(resfinal, feature_importance_df)`` -- corrected forecasts and
        the feature importances of the last processed period
    """
    resfinal = pd.DataFrame()
    # Pre-initialize so the return statement is safe even when dwp_test is empty
    # (the original raised NameError in that case).
    feature_importance_df = pd.DataFrame()
    # Filter features to train the model
    features = [
        x for x in table_all_features.keys() if x not in config["features_int"]
    ]
    # Category flag columns that are always kept when restricting to the
    # persisted top-50 feature subset.
    features_cat = [
        c for c in table_all_features.keys()
        if ('spre' in c) | ('upre' in c) | ('mainstream' in c)
    ]
    for datwep in dwp_test:
        print(datwep)
        res = pd.DataFrame()
        feature_importance_df = pd.DataFrame()
        for h in range(1, config["horizon"] + 1):
            print("training model at horizon: " + str(h))
            # Train on known targets at this horizon without look-ahead.
            subdata = table_all_features[
                (table_all_features.horizon == h)
                & (~table_all_features.target.isnull())
                & (table_all_features.date_to_predict <= datwep)]
            if not config["FirstRun"]:
                # After the first run, restrict to the stored top-50 features for
                # this horizon plus the always-kept categorical columns.
                feature_importance_df_sets = Load_50_feature(config)
                features = list(feature_importance_df_sets[str(h)]) +\
                    features_cat +\
                    config["features_cat_col"] +\
                    config["features_cat_fsct_col"]
            x_train = subdata[features].values
            y_train = subdata.target
            data_test = table_all_features[
                (table_all_features.date_when_predicting == datwep)
                & (table_all_features.horizon == h)].copy()
            x_test = data_test[features].values
            model = MLDCModel(
                model_name=config["model_config_RandomForestRegressor"].model_name,
                model_params=config["model_config_RandomForestRegressor"].model_params)
            model.fit(x_train, y_train)
            print(model)
            preds = model.predict(x_test)
            # Negative volumes are meaningless; clamp at zero.
            preds = preds.clip(min=0)
            data_test['horizon'] = h
            data_test['prediction'] = preds
            res = pd.concat([
                res,
                data_test[[
                    "label", "date_to_predict", "country", "brand",
                    "country_brand", "prediction", "horizon"
                ]]
            ])
            feature_importance = dict(
                zip(features,
                    zip(model.feature_importances_,
                        [h] * len(model.feature_importances_))))
            feature_importance = pd.DataFrame(feature_importance,
                                              index=['importance', 'horizon']).T
            # BUGFIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
            feature_importance_df = pd.concat(
                [feature_importance_df, feature_importance.reset_index()],
                ignore_index=True)
        # NOTE(review): keeps the historical typo 'date_when_preidct' because
        # downstream readers of the CSV may depend on it.
        feature_importance_df['date_when_preidct'] = datwep
        feature_importance_df.to_csv(config["temp_folder_path"] +
                                     '/' + str(datwep) +
                                     '_feature_importance_RF.csv')
        res_, res__ = correct_fc_il(df, res,
                                    month_to_correct=[6, 'CNY', 11],
                                    thrsh=0.05)
        resfinal = pd.concat([resfinal, res_])
        resfinal = pd.concat([resfinal, res__])
    return resfinal, feature_importance_df
def forecast_since_date_at_horizon(self, date_start, horizon):
    """
    Function that performs a full forecast since a date of sales
    :param date_start: last date of available sales in the data that need to be used
        by the model when forecasting
    :param horizon: horizon in the future for which we want a forecast
    :return: a dataframe containing the forecast
    """
    # Never forecast from a period later than the data actually covers.
    filter_date = min(date_start, self.max_date_available)
    dwps = create_list_period(201801, filter_date, False)
    dwp, dtp = get_all_combination_date(dwps, horizon)
    print("creating the main table")
    table_all_features = self.Create_feature(dwp, dtp)
    # Horizon = number of calendar months between prediction date and "as of" date.
    # Convert each date column once instead of four times.
    dtp_dates = pd.to_datetime(table_all_features.date_to_predict, format='%Y%m')
    dwp_dates = pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m')
    table_all_features['horizon'] = \
        (dtp_dates.dt.year - dwp_dates.dt.year) * 12 + \
        (dtp_dates.dt.month - dwp_dates.dt.month)
    # Calculate rolling features
    df_table_feature_rolling, roll_features_sel = self.get_rolling()
    # Merge main features with rolling features
    table_all_features = self.Merge_table_with_rolling(table_all_features,
                                                      df_table_feature_rolling,
                                                      roll_features_sel)
    # Choose useful features: whitelisted sub-brand features, minus identifiers.
    features = [x for x in table_all_features.keys()
                if (x not in self.config["features_int"])
                & (x in self.config["feature_sub_brand"])]
    res = pd.DataFrame()
    feature_importance_df = pd.DataFrame()
    for h in range(1, horizon + 1):
        print("training model at horizon: " + str(h))
        # Train on rows at this horizon with a known target, without look-ahead.
        subdata = table_all_features[(table_all_features.horizon == h)
                                     & (~table_all_features.target.isnull())
                                     & (table_all_features.date_to_predict <= filter_date)]
        x_train = subdata[features].values
        y_train = subdata.target
        data_test = table_all_features[(table_all_features.date_when_predicting == filter_date)
                                       & (table_all_features.horizon == h)].copy()
        x_test = data_test[features].values
        model = MLDCModel(
            model_name=self.config["model_config_XGBRegressor_sub_brand"].model_name,
            model_params=self.config["model_config_XGBRegressor_sub_brand"].model_params)
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        # Negative volumes are meaningless; clamp at zero.
        preds = preds.clip(min=0)
        data_test['horizon'] = h
        data_test['prediction'] = preds
        res = pd.concat([res, data_test[["label", "date_to_predict",
                                         "country", "brand",
                                         "country_brand", "prediction",
                                         "horizon"]]])
        # Create feature importance
        feature_importance = dict(zip(features,
                                      zip(model.feature_importances_,
                                          [h] * len(model.feature_importances_))))
        feature_importance = pd.DataFrame(feature_importance,
                                          index=['importance', 'horizon']).T
        # BUGFIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
        feature_importance_df = pd.concat(
            [feature_importance_df, feature_importance.reset_index()],
            ignore_index=True)
    self.feature_importance = feature_importance_df
    # NOTE(review): keeps the historical typo 'date_when_preidct' for compatibility
    # with downstream readers of this CSV.
    feature_importance_df['date_when_preidct'] = date_start
    feature_importance_df.to_csv(os.path.join(
        DIR_TEM,
        'feature_importance_df_sets_sub_brand_' + str(date_start) + '.csv'))
    # Applying post-processing
    resfinal = self.correct_fc_il(res, month_to_correct=['CNY', 11], thrsh=0.05)
    # Recover "date when predicting" by stepping back `horizon` month-begins
    # from the prediction date.
    resfinal["date_when_predicting"] = (
        pd.to_datetime(resfinal["date_to_predict"].astype(int).astype(str), format="%Y%m")
        - resfinal['horizon'].apply(pd.offsets.MonthBegin)
    ).apply(lambda x: x.strftime("%Y%m")).astype(int)
    # Output resutls
    res.to_csv((os.path.join(
        DIR_TEM,
        'IL_sub_Brand_Forecst_result' + str(date_start) + '.csv')), index=False)
    return resfinal.reset_index(drop=True)
def forecast_since_date_at_horizon(self, date_start, horizon, model_config=None):
    """
    Function that performs a full forecast since a date of sales
    :param date_start: last date of available sales in the data that need to be used
        by the model when forecasting
    :param horizon: horizon in the future for which we want a forecast
    :param model_config: instance of ModelConfig that contains information about the
        model that will be used; falls back to ``self.default_model_config`` when None
    :return: a dataframe containing the forecast
    """
    if model_config is None:
        print("model not specified - using default model")
        model_config = deepcopy(self.default_model_config)
    print(model_config)
    # Never forecast from a period later than the sales data actually covers.
    max_date_available = self.all_sales.calendar_yearmonth.max()
    filter_date = min(date_start, max_date_available)
    dwps = create_list_period(201701, filter_date, False)
    dwp, dtp = get_all_combination_date(dwps, horizon)
    print("creating the main table")
    table_all_features = self.create_all_features(dwp, dtp)
    # Horizon = number of calendar months between prediction date and "as of" date.
    # Convert each date column once instead of four times.
    dtp_dates = pd.to_datetime(table_all_features.date_to_predict, format='%Y%m')
    dwp_dates = pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m')
    table_all_features['horizon'] = \
        (dtp_dates.dt.year - dwp_dates.dt.year) * 12 + \
        (dtp_dates.dt.month - dwp_dates.dt.month)
    # Identifier / label columns that must never be fed to the model.
    features_int = ["date_when_predicting", "label", "date_to_predict", "sku", "target",
                    "country", "brand", "tier", "stage", "horizon"]
    features = [x for x in table_all_features.keys() if x not in features_int]
    res = pd.DataFrame()
    feature_importance_df = pd.DataFrame()
    for h in range(1, horizon + 1):
        print("training model at horizon: " + str(h))
        # Train on all rows at this horizon that have a known target.
        subdata = table_all_features[(table_all_features.horizon == h)
                                     & (~table_all_features.target.isnull())]
        x_train = subdata[features].values
        y_train = subdata.target
        data_test = table_all_features[(table_all_features.date_when_predicting == filter_date)
                                       & (table_all_features.horizon == h)].copy()
        x_test = data_test[features].values
        model = MLDCModel(
            model_name=model_config.model_name,
            model_params=model_config.model_params
        )
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        # Negative volumes are meaningless; clamp at zero.
        preds = preds.clip(min=0)
        data_test['prediction'] = preds
        data_test['horizon'] = h
        res = pd.concat([res, data_test[["label", "date_to_predict", "sku",
                                         "horizon", "prediction"]]])
        # Creating feature importance
        feature_importance = dict(zip(features,
                                      zip(model.feature_importances_,
                                          [h] * len(model.feature_importances_))))
        feature_importance = pd.DataFrame(feature_importance,
                                          index=['importance', 'horizon']).T
        # BUGFIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
        feature_importance_df = pd.concat(
            [feature_importance_df, feature_importance.reset_index()],
            ignore_index=True)
    self.feature_importance = feature_importance_df
    self.feature_importance.to_csv(str(date_start) + '_feature_importance.csv')
    # Applying postprocessing
    res.to_pickle(os.path.join(DIR_TEST_DATA, 'test_apply_forecast_correction_dc.pkl'))
    res = self.correct_fc(res, month_to_correct=[7, 'CNY', 11])
    return res
def model_single_run(config, dwp_test, table_all_features, df):
    """Single-model (XGBoost) run over the test periods, with RMSE diagnostics.

    :param config: dict with keys ``features_int``, ``feature_test``, ``horizon``,
        ``FirstRun``, ``model_config_XGBRegressor``, ``project_folder_path`` and
        ``temp_folder_path``
    :param dwp_test: iterable of YYYYMM "date when predicting" periods
    :param table_all_features: feature table that already contains ``horizon``,
        ``target``, ``date_to_predict`` and ``date_when_predicting`` columns
    :param df: sales dataframe forwarded to ``correct_fc_il`` for post-correction
    :return: tuple ``(resfinal, feature_importance_df)`` -- corrected forecasts and
        the feature importances of the last processed period
    """
    resfinal = pd.DataFrame()
    # Pre-initialize so the return statement is safe even when dwp_test is empty.
    feature_importance_df = pd.DataFrame()
    # Filter features to train the model
    features = [
        x for x in table_all_features.keys()
        if (x not in config["features_int"]) & (x in config["feature_test"])
    ]
    for datwep in dwp_test:
        print(datwep)
        res = pd.DataFrame()
        feature_importance_df = pd.DataFrame()
        for h in range(1, config["horizon"] + 1):
            print("training model at horizon: " + str(h))
            # Train on known targets at this horizon without look-ahead.
            subdata = table_all_features[
                (table_all_features.horizon == h)
                & (~table_all_features.target.isnull())
                & (table_all_features.date_to_predict <= datwep)]
            if not config["FirstRun"]:
                # After the first run, restrict to the stored top-50 features
                # for this horizon.
                feature_importance_df_sets = Load_50_feature(config)
                features = list(feature_importance_df_sets[str(h)])
            x_train = subdata[features].values
            y_train = subdata.target
            print(x_train.shape)
            # reset_index so positional label 0 below addresses the first test row.
            data_test = table_all_features[
                (table_all_features.date_when_predicting == datwep)
                & (table_all_features.horizon == h)].reset_index().copy()
            x_test = data_test[features].values
            y_test = data_test.target
            model = MLDCModel(
                model_name=config["model_config_XGBRegressor"].model_name,
                model_params=config["model_config_XGBRegressor"].model_params)
            model.fit(x_train, y_train)
            train_preds = model.predict(x_train)
            print("======================================================")
            print("Train RMSE : ",
                  np.sqrt(mean_squared_error(y_train, train_preds)))
            preds = model.predict(x_test)
            # Root-mean-squared error (RMSE) on the test slice; only reported for
            # periods where actuals are available (up to 201912).
            if data_test['date_to_predict'][0] <= 201912:
                print("======================================================")
                print("Test RMSE : ", np.sqrt(mean_squared_error(y_test, preds)))
            # Negative volumes are meaningless; clamp at zero.
            preds = preds.clip(min=0)
            data_test['horizon'] = h
            data_test['prediction'] = preds
            res = pd.concat([
                res,
                data_test[[
                    "label", "date_to_predict", "country", 'brand', "tier",
                    "country_brand", "sub_brand", "prediction", "horizon"
                ]]
            ])
            feature_importance = dict(
                zip(features,
                    zip(model.feature_importances_,
                        [h] * len(model.feature_importances_))))
            feature_importance = pd.DataFrame(feature_importance,
                                              index=['importance', 'horizon']).T
            # BUGFIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
            feature_importance_df = pd.concat(
                [feature_importance_df, feature_importance.reset_index()],
                ignore_index=True)
        # NOTE(review): keeps the historical typo 'date_when_preidct' for
        # compatibility (the original also set this column twice; once suffices).
        feature_importance_df['date_when_preidct'] = datwep
        feature_importance_df.to_csv(config["project_folder_path"] + '/' +
                                     config["temp_folder_path"] + '/' +
                                     str(datwep) + '_feature_importance_with_Q2.csv')
        # BUGFIX: the original passed `df_`, which is not defined anywhere in this
        # function (NameError at runtime); the intended argument is the `df`
        # parameter, matching how Generate_Forecast_IL calls correct_fc_il.
        # NOTE(review): elsewhere in this file correct_fc_il returns two frames;
        # here it is unpacked as one -- confirm which signature is current.
        res_ = correct_fc_il(df, res, month_to_correct=[6, 'CNY', 11], thrsh=0.05)
        resfinal = pd.concat([resfinal, res_])
    return resfinal, feature_importance_df
def recreate_past_forecasts(self, table_all_features, list_dwps, horizon=10, model_config=None):
    """Rebuild historical 'di' forecasts at SKU granularity for past periods.

    For every "date when predicting" in ``list_dwps`` and each horizon step, a
    model is fitted on all rows with a known target dated no later than that
    period, then used to score the 'di'-labelled rows of the period itself.
    In debug mode a constant dummy prediction is emitted instead of training.

    :param table_all_features: feature table with YYYYMM date columns, a
        ``target`` column and a ``forecast_eln`` column
    :param list_dwps: past periods (YYYYMM) to recreate forecasts for
    :param horizon: number of monthly horizon steps (default 10)
    :param model_config: model name/params container; only read when
        ``self.debug`` is falsy
    :return: dataframe with one prediction row per label/date/sku/horizon
    """
    # Horizon = number of calendar months between target date and "as of" date.
    target_dates = pd.to_datetime(table_all_features.date_to_predict, format='%Y%m')
    asof_dates = pd.to_datetime(table_all_features.date_when_predicting, format='%Y%m')
    table_all_features['horizon'] = (
        (target_dates.dt.year - asof_dates.dt.year) * 12
        + (target_dates.dt.month - asof_dates.dt.month)
    )
    # Identifier / label columns excluded from the model inputs.
    non_feature_cols = ["date_when_predicting", "label", "date_to_predict",
                        "sku_wo_pkg", "target", "country", "brand", "tier",
                        "stage", "horizon", "sku_w_pkg"]
    features = [col for col in table_all_features.keys() if col not in non_feature_cols]
    # Keep only rows that carry an ELN forecast.
    table_all_features = table_all_features[~table_all_features.forecast_eln.isnull()]
    resfinal = pd.DataFrame()
    for datwep in list_dwps:
        print(f".. date when predicting {datwep}")
        res = pd.DataFrame()
        for h in range(1, horizon + 1):
            print(f".... horizon {h}")
            # Training slice: this horizon, known target, no look-ahead past datwep.
            train_rows = table_all_features[
                (table_all_features.horizon == h)
                & (~table_all_features.target.isnull())
                & (table_all_features.date_to_predict <= datwep)]
            if train_rows.shape[0] < 1:
                # Nothing to learn from at this horizon/period combination.
                continue
            x_train = train_rows[features].values
            y_train = train_rows.target
            # Rows to score: 'di'-labelled rows of the current period/horizon.
            data_test_di = table_all_features[
                (table_all_features.date_when_predicting == datwep)
                & (table_all_features.horizon == h)
                & (table_all_features.label == 'di')]
            x_test_di = data_test_di[features].values
            if self.debug:
                # Dummy constant prediction keeps the pipeline testable offline.
                preds_di = np.ones(len(x_test_di)) * 3000
            else:
                model = MLDCModel(
                    model_name=model_config.model_name,
                    model_params=model_config.model_params
                )
                model.fit(x_train, y_train)
                preds_di = model.predict(x_test_di)
            # Negative volumes are meaningless; clamp at zero.
            preds_di = preds_di.clip(0)
            data_test = data_test_di.assign(horizon=h, prediction=preds_di)
            res = pd.concat([res, data_test[["label", "date_to_predict",
                                             "sku_wo_pkg", "sku_w_pkg",
                                             "prediction", "date_when_predicting",
                                             "horizon"]]])
        resfinal = pd.concat([resfinal, res])
    return resfinal