class Forecasting:
    """Forecast and perform model selection based on historical forecast

    Init Parameters
    ----------
    platform : {'local', 'gcp'}
        platform to store input/output
    logtag : str
        logging tag
    tz : str (e.g. Asia/Bangkok)
        timezone for logging
    cloud_auth : str
        authentication file path
    """

    def __init__(self, platform, logtag, tz, cloud_auth=None):
        self.fp = FilePath(platform, cloud_auth)
        self.lg = Logging(platform, "forecast", logtag, cloud_auth)
        self.lg.logtxt("[START FORECASTING]")
        self.tz = tz

    @staticmethod
    def _parse_date(text):
        """Parse a 'YYYY-MM-DD' string into a ``datetime.date``."""
        return datetime.datetime.strptime(text, '%Y-%m-%d').date()

    @staticmethod
    def _concat_frames(frames):
        """Stack data frames row-wise and renumber the index from 0.

        Replaces the ``DataFrame.append`` accumulation pattern, which no
        longer exists in pandas 2.x. Empty frames are skipped when at least
        one frame has rows; an empty input yields an empty ``DataFrame``.
        """
        frames = list(frames)
        if not frames:
            return pd.DataFrame()
        filled = [f for f in frames if not f.empty]
        # when every frame is empty, concatenate them anyway so that any
        # column labels they carry are preserved
        return pd.concat(filled or frames, ignore_index=True)

    @staticmethod
    def _run_items(func, arg_rows, cpu_count):
        """Call ``func(*row)`` for every row, serially or in a process pool.

        Returns the list of results in the order of ``arg_rows``. The pool is
        always closed and joined, even when a worker raises.
        """
        if cpu_count == 1:
            return [func(*row) for row in arg_rows]
        pool = multiprocessing.Pool(processes=cpu_count)
        try:
            return pool.starmap(func, arg_rows)
        finally:
            pool.close()
            pool.join()

    def loaddata(self, act_path, fcstlog_path, ext_path=None, extlag_path=None):
        """Load data for forecasting process

        Stores the loaded frames on the instance as ``df_act``,
        ``df_fcstlog``, ``ext`` and ``ext_lag`` (the last two are ``None``
        when ``ext_path`` is not given).

        Parameters
        ----------
        act_path : str
            historical data path
        fcstlog_path : str
            forecast log path
        ext_path : str
            external features path
        extlag_path : str
            external lag path
        """
        # Source column names. They currently equal the internal names, so
        # the renames below are no-ops kept as a single place to remap.
        col_id, col_ds, col_y = 'id', 'ds', 'y'
        col_mth, col_model, col_fcst = 'mth', 'model', 'forecast'

        # load actual and forecast data
        # NOTE: `date_parser` is deprecated since pandas 2.0 in favour of
        # `date_format`; it is kept so that parsing behaviour is unchanged.
        df_act = pd.read_csv(self.fp.loadfile(act_path),
                             parse_dates=['ds'],
                             date_parser=self._parse_date)
        df_fcstlog = pd.read_csv(self.fp.loadfile(fcstlog_path),
                                 parse_dates=['ds', 'dsr'],
                                 date_parser=self._parse_date)
        self.df_act = df_act.rename(columns={
            col_id: 'id',
            col_ds: 'ds',
            col_y: 'y',
        })
        self.df_fcstlog = df_fcstlog.rename(columns={
            col_id: 'id',
            col_ds: 'ds',
            col_mth: 'mth',
            col_model: 'model',
            col_fcst: 'forecast',
        })

        # load external features
        if ext_path is None:
            self.ext = None
            self.ext_lag = None
            self.lg.logtxt("load data: {} | {}".format(act_path, fcstlog_path))
            return

        col_yid, col_extid, col_lag = 'y_id', 'ext_id', 'lag'
        ext = pd.read_csv(self.fp.loadfile(ext_path),
                          parse_dates=['ds'],
                          date_parser=self._parse_date)
        # no date columns are parsed in the lag file, so no parser is passed
        ext_lag = pd.read_csv(self.fp.loadfile(extlag_path))
        self.ext = ext.rename(columns={
            col_id: 'id',
            col_ds: 'ds',
            col_y: 'y',
        })[['id', 'ds', 'y']]
        self.ext_lag = ext_lag.rename(columns={
            col_yid: 'y_id',
            col_extid: 'ext_id',
            col_lag: 'lag',
        })[['y_id', 'ext_id', 'lag']]
        self.lg.logtxt("load data: {} | {} | {} | {}".format(
            act_path, fcstlog_path, ext_path, extlag_path))

    def forecast_byitem(self, x, act_st, fcst_st, fcst_pr, model_list, pr_st,
                        batch_no):
        """Forecast data by item for parallel computing

        Runs every model in ``model_list`` for item ``x``. A model that
        raises is logged as an error and skipped; the others still run.

        Parameters
        ----------
        x : item id, matched against the 'id' column of the actual data
        act_st : actual start date, passed to TimeSeriesForecasting
        fcst_st : forecast date, passed to TimeSeriesForecasting and written
            to the 'dsr' column
        fcst_pr : int
            number of periods to forecast
        model_list : list
            model names to run
        pr_st : int
            number given to the first forecast period
        batch_no : int
            batch number, used only in error messages

        Returns
        -------
        pandas.DataFrame
            columns id, ds, dsr, period, model, forecast, time; an empty
            frame without columns when no model succeeded
        """
        df = self.df_act[self.df_act['id'] == x].copy()
        if self.ext is not None:
            ext = self.ext[['id', 'ds', 'y']].copy()
            ext_lag = self.ext_lag[self.ext_lag['y_id'] == x].rename(
                columns={'ext_id': 'id'})[['id', 'lag']].copy()
        else:
            ext = None
            ext_lag = None
        model = TimeSeriesForecasting(df=df,
                                      act_st=act_st,
                                      fcst_st=fcst_st,
                                      fcst_pr=fcst_pr,
                                      ext=ext,
                                      ext_lag=ext_lag)
        results = []
        for m in model_list:
            try:
                st_time = datetime.datetime.now()
                r = model.forecast(m)
                r = r.rename(columns={'y': 'forecast'})
                # elapsed seconds of this single model run
                r['time'] = (datetime.datetime.now() - st_time).total_seconds()
                r['id'] = x
                r['dsr'] = fcst_st
                r['model'] = m
                r['period'] = np.arange(pr_st, len(r) + pr_st)
                results.append(r[[
                    'id', 'ds', 'dsr', 'period', 'model', 'forecast', 'time'
                ]])
            except Exception as e:
                # best effort: one failing model must not stop the others
                error_item = "batch: {} | id: {} | model:{}".format(
                    batch_no, x, m)
                error_txt = "ERROR: {} ({})".format(str(e), error_item)
                self.lg.logtxt(error_txt, error=True)
        return self._concat_frames(results)

    def rank_model(self, fcst_model, act_st, fcst_st, test_type, test_st,
                   rank_by='mae', error_by='mape'):
        """Rank model based on historical forecast

        Parameters
        ----------
        fcst_model : dict('period', [list of models])
            models allowed for each period; log rows for other periods or
            models are ignored
        act_st : datetime
            actual start date
        fcst_st : datetime
            forecast date; actual data is filled up to the day before
        test_type : {'monthly', 'daily'}
            any value other than 'daily' aggregates actual data by month
        test_st : datetime
            only log rows with test_st <= dsr < fcst_st are used
        rank_by : {'mae', 'mape'}
            error column used for ranking
        error_by : {'mae', 'mape'}
            error column copied into 'error'

        Returns
        -------
        pandas.DataFrame
            one row per (id, period, model) with columns mae, mape, rank
            (dense, 1 = smallest ``rank_by``) and error
        """
        actual_frames = []
        last_act_day = fcst_st + datetime.timedelta(days=-1)
        for i in self.df_act['id'].unique():
            df_i = self.df_act[self.df_act['id'] == i].copy()
            df_i = TimeSeriesForecasting.filldaily(df_i, act_st, last_act_day)
            if test_type != 'daily':
                df_i = TimeSeriesForecasting.daytomth(df_i)
            df_i['id'] = i
            actual_frames.append(df_i[['id', 'ds', 'y']])
        df_act = self._concat_frames(actual_frames)

        in_window = ((self.df_fcstlog['dsr'] >= test_st) &
                     (self.df_fcstlog['dsr'] < fcst_st))
        df_rank = self.df_fcstlog[in_window].copy()

        # select only in config file
        df_rank['val'] = df_rank['period'].map(fcst_model)
        df_rank = df_rank[df_rank['val'].notnull()].copy()
        df_rank['val'] = [
            model in allowed
            for model, allowed in zip(df_rank['model'], df_rank['val'])
        ]
        df_rank = df_rank[df_rank['val'].astype(bool)].copy()

        # calculate error comparing with actual
        df_rank = pd.merge(df_rank,
                           df_act.rename(columns={'y': 'actual'}),
                           on=['id', 'ds'],
                           how='left')
        df_rank['mae'] = (df_rank['actual'] - df_rank['forecast']).abs()
        df_rank['mape'] = [
            mape(actual, forecast)
            for actual, forecast in zip(df_rank['actual'], df_rank['forecast'])
        ]
        # NOTE(review): a missing error (e.g. no matching actual) becomes 0
        # and so counts as a perfect forecast in the means below - confirm
        # this is intended.
        df_rank[['mae', 'mape']] = df_rank[['mae', 'mape']].fillna(0)

        # ranking error
        df_rank = df_rank.groupby(['id', 'period', 'model'],
                                  as_index=False).agg({
                                      'mae': 'mean',
                                      'mape': 'mean',
                                  })
        df_rank['rank'] = df_rank.groupby(['id', 'period'])[rank_by].rank(
            method='dense', ascending=True)
        df_rank['error'] = df_rank[error_by]
        return df_rank

    def ensemble_model(self, df_fcst, df_rank, top_model, method):
        """Combine the forecasts of the best ranked models

        Parameters
        ----------
        df_fcst : pandas.DataFrame
            forecasts with columns id, ds, dsr, period, model, forecast
        df_rank : pandas.DataFrame
            ranking with columns id, period, model, rank, error
        top_model : int
            keep models with rank <= top_model
        method : {'mean', 'median'}
            aggregation applied to 'forecast' and 'error'; any other value
            leaves the selected rows unaggregated

        Returns
        -------
        pandas.DataFrame
            sorted by id, dsr, ds with a fresh index
        """
        # combine forecast
        df_ens = pd.merge(df_fcst,
                          df_rank,
                          on=['id', 'period', 'model'],
                          how='left')
        df_ens = df_ens[df_ens['rank'] <= top_model].copy()
        if method in ('mean', 'median'):
            df_ens = df_ens.groupby(['id', 'ds', 'dsr', 'period'],
                                    as_index=False).agg({
                                        'forecast': method,
                                        'error': method,
                                    })
        return df_ens.sort_values(by=['id', 'dsr', 'ds'],
                                  ascending=True).reset_index(drop=True)

    def forecast(self, output_dir, act_st, fcst_st, fcst_model, test_type,
                 test_bck, top_model=3, ens_method='mean', chunk_sz=1, cpu=1):
        """Forecast and write result by batch

        Parameters
        ----------
        output_dir : str
            output directory
        act_st : datetime
            actual start date
        fcst_st : datetime
            forecast date
        fcst_model : dict('period', [list of models])
            forecast model options for each periods
        test_type : {'monthly', 'daily'}
            type of testing back error by month or day
        test_bck : int
            number of months to test back
        top_model : int
            number of best ranked models to ensemble
        ens_method : {'mean', 'median'}
            ensemble aggregation method
        chunk_sz : int
            number of item to validate for each chunk
        cpu : int
            number of running processors
        """
        # make output directory
        output_dir = "{}forecast_{}/".format(
            output_dir,
            datetime.datetime.now(timezone(
                self.tz)).strftime("%Y%m%d-%H%M%S"))
        self.output_dir = output_dir
        self.fp.mkdir(output_dir)
        self.lg.logtxt("create output directory: {}".format(output_dir))
        self.fp.writecsv(self.df_act, "{}input_actual.csv".format(output_dir))
        self.fp.writecsv(self.df_fcstlog,
                         "{}input_forecast.csv".format(output_dir))

        # write external features
        if self.ext is not None:
            self.fp.writecsv(self.ext,
                             "{}input_external.csv".format(output_dir))
            self.fp.writecsv(self.ext_lag,
                             "{}input_externallag.csv".format(output_dir))
            self.lg.logtxt(
                "write input file: {}input_actual.csv | {}input_forecast.csv | {}input_external.csv | {}input_externallag.csv"
                .format(output_dir, output_dir, output_dir, output_dir))
        else:
            self.lg.logtxt(
                "write input file: {}input_actual.csv | {}input_forecast.csv".
                format(output_dir, output_dir))
        self.runitem = {}

        # set parameter
        items = self.df_act['id'].unique()
        n_chunk = sum(1 for _ in chunker(items, chunk_sz))
        midnight = datetime.datetime.min.time()
        act_st = datetime.datetime.combine(act_st, midnight)
        fcst_st = datetime.datetime.combine(fcst_st, midnight)
        test_st = fcst_st + relativedelta(months=-test_bck)
        fcst_pr = len(fcst_model)
        pr_st = min(fcst_model)
        # unique models in first-seen order, so the run order is reproducible
        model_list = list(
            dict.fromkeys(m for models in fcst_model.values() for m in models))
        self.lg.logtxt(
            "total items: {} | chunk size: {} | total chunk: {}".format(
                len(items), chunk_sz, n_chunk))

        # rank the models
        df_rank = self.rank_model(fcst_model, act_st, fcst_st, test_type,
                                  test_st)

        # forecast
        cpu_count = 1 if cpu <= 1 else min(cpu, multiprocessing.cpu_count())
        self.lg.logtxt("run at {} processor(s)".format(cpu_count))
        # NOTE(review): with more than one process the bound method is sent
        # to the workers, so errors logged through self.lg there presumably
        # never reach this process's log - confirm.
        for i, c in enumerate(chunker(items, chunk_sz), 1):
            arg_rows = [[x, act_st, fcst_st, fcst_pr, model_list, pr_st, i]
                        for x in c]
            df_fcst = self._concat_frames(
                self._run_items(self.forecast_byitem, arg_rows, cpu_count))

            # ensemble forecast results
            df_ens = self.ensemble_model(df_fcst,
                                         df_rank,
                                         top_model,
                                         method=ens_method)

            # write forecast result
            fcst_path = "{}output_forecast_{:04d}-{:04d}.csv".format(
                output_dir, i, n_chunk)
            self.fp.writecsv(df_ens, fcst_path)

            # write forecast log result
            fcstlog_path = "{}output_forecastlog_{:04d}-{:04d}.csv".format(
                output_dir, i, n_chunk)
            self.fp.writecsv(df_fcst, fcstlog_path)
            self.lg.logtxt("write output file ({}/{}): {} | {}".format(
                i, n_chunk, fcst_path, fcstlog_path))
        self.lg.logtxt("[END FORECAST]")
        self.lg.writelog("{}logfile.log".format(output_dir))
class Validation:
    """Validate forecast model by rolling forecast

    Init Parameters
    ----------
    platform : {'local', 'gcp'}
        platform to store input/output
    tz : str (e.g. Asia/Bangkok)
        timezone for logging
    logtag : str
        logging tag
    cloud_auth : str
        authentication file path
    """

    def __init__(self, platform, logtag, tz, cloud_auth=None):
        self.fp = FilePath(platform, cloud_auth)
        self.lg = Logging(platform, "validate", logtag, cloud_auth)
        self.lg.logtxt("[START VALIDATION]")
        self.tz = tz

    @staticmethod
    def _parse_date(text):
        """Parse a 'YYYY-MM-DD' string into a ``datetime.date``."""
        return datetime.datetime.strptime(text, '%Y-%m-%d').date()

    @staticmethod
    def _concat_frames(frames):
        """Stack data frames row-wise and renumber the index from 0.

        Replaces the ``DataFrame.append`` accumulation pattern, which no
        longer exists in pandas 2.x. Empty frames are skipped when at least
        one frame has rows; an empty input yields an empty ``DataFrame``.
        """
        frames = list(frames)
        if not frames:
            return pd.DataFrame()
        filled = [f for f in frames if not f.empty]
        # when every frame is empty, concatenate them anyway so that any
        # column labels they carry are preserved
        return pd.concat(filled or frames, ignore_index=True)

    @staticmethod
    def _run_items(func, arg_rows, cpu_count):
        """Call ``func(*row)`` for every row, serially or in a process pool.

        Returns the list of results in the order of ``arg_rows``. The pool is
        always closed and joined, even when a worker raises.
        """
        if cpu_count == 1:
            return [func(*row) for row in arg_rows]
        pool = multiprocessing.Pool(processes=cpu_count)
        try:
            return pool.starmap(func, arg_rows)
        finally:
            pool.close()
            pool.join()

    def loaddata(self, act_path, ext_path=None, extlag_path=None):
        """Load data for validation process

        Stores the loaded frames on the instance as ``df``, ``ext`` and
        ``ext_lag`` (the last two are ``None`` when ``ext_path`` is not
        given).

        Parameters
        ----------
        act_path : str
            historical data path
        ext_path : str
            external features path
        extlag_path : str
            external lag path
        """
        # Source column names. They currently equal the internal names, so
        # the renames below are no-ops kept as a single place to remap.
        col_id, col_ds, col_y = 'id', 'ds', 'y'

        # load sales data
        # NOTE: `date_parser` is deprecated since pandas 2.0 in favour of
        # `date_format`; it is kept so that parsing behaviour is unchanged.
        df = pd.read_csv(self.fp.loadfile(act_path),
                         parse_dates=['ds'],
                         date_parser=self._parse_date)
        self.df = df.rename(columns={
            col_id: 'id',
            col_ds: 'ds',
            col_y: 'y',
        })[['id', 'ds', 'y']]

        # load external features
        if ext_path is None:
            self.ext = None
            self.ext_lag = None
            self.lg.logtxt("load data: {}".format(act_path))
            return

        col_yid, col_extid, col_lag = 'y_id', 'ext_id', 'lag'
        ext = pd.read_csv(self.fp.loadfile(ext_path),
                          parse_dates=['ds'],
                          date_parser=self._parse_date)
        # no date columns are parsed in the lag file, so no parser is passed
        ext_lag = pd.read_csv(self.fp.loadfile(extlag_path))
        self.ext = ext.rename(columns={
            col_id: 'id',
            col_ds: 'ds',
            col_y: 'y',
        })[['id', 'ds', 'y']]
        self.ext_lag = ext_lag.rename(columns={
            col_yid: 'y_id',
            col_extid: 'ext_id',
            col_lag: 'lag',
        })[['y_id', 'ext_id', 'lag']]
        self.lg.logtxt("load data: {} | {} | {}".format(
            act_path, ext_path, extlag_path))

    def validate_byitem(self, x, act_st, test_date, test_model, fcst_pr, pr_st,
                        batch_no):
        """Validate data by item for parallel computing

        Runs every model in ``test_model`` at every date in ``test_date``
        for item ``x``. A model run that raises is logged as an error and
        skipped; the remaining runs still execute.

        Parameters
        ----------
        x : item id, matched against the 'id' column of the loaded data
        act_st : actual start date, passed to TimeSeriesForecasting
        test_date : list of datetime
            forecast dates to roll over; each is written to 'dsr'
        test_model : list
            model names to run
        fcst_pr : int
            number of periods to forecast for each rolling
        pr_st : int
            number given to the first forecast period
        batch_no : int
            batch number, used only in error messages

        Returns
        -------
        pandas.DataFrame
            columns id, ds, dsr, period, model, forecast, time; an empty
            frame without columns when no run succeeded
        """
        df = self.df[self.df['id'] == x][['ds', 'y']].copy()
        if self.ext is not None:
            ext = self.ext[['id', 'ds', 'y']].copy()
            ext_lag = self.ext_lag[self.ext_lag['y_id'] == x].rename(
                columns={'ext_id': 'id'})[['id', 'lag']].copy()
        else:
            ext = None
            ext_lag = None

        results = []
        for d in test_date:
            model = TimeSeriesForecasting(df=df,
                                          act_st=act_st,
                                          fcst_st=d,
                                          fcst_pr=fcst_pr,
                                          ext=ext,
                                          ext_lag=ext_lag)
            for m in test_model:
                try:
                    st_time = datetime.datetime.now()
                    r = model.forecast(m)
                    r = r.rename(columns={'y': 'forecast'})
                    # elapsed seconds of this single model run
                    r['time'] = (datetime.datetime.now() -
                                 st_time).total_seconds()
                    r['id'] = x
                    r['dsr'] = d
                    r['period'] = np.arange(pr_st, len(r) + pr_st)
                    r['model'] = m
                    results.append(r[[
                        'id', 'ds', 'dsr', 'period', 'model', 'forecast',
                        'time'
                    ]])
                except Exception as e:
                    # best effort: one failing run must not stop the others
                    error_item = "batch: {} | id: {} | testdate: {} | model:{}".format(
                        batch_no, x, d.strftime("%Y-%m-%d"), m)
                    error_txt = "ERROR: {} ({})".format(str(e), error_item)
                    self.lg.logtxt(error_txt, error=True)
        return self._concat_frames(results)

    def validate(self, output_dir, act_st, test_st, test_pr, test_model,
                 fcst_pr, pr_st, chunk_sz, cpu):
        """Validate forecast model and write result by batch

        Parameters
        ----------
        output_dir : str
            output directory
        act_st : datetime
            actual start date
        test_st : datetime
            test start date
        test_pr : int
            number of rolling period to test (months)
        test_model : list
            list of model to test
        fcst_pr : int
            number of periods to forecast for each rolling
        pr_st : int
            starting period for each forecast (default 0/1)
        chunk_sz : int
            number of item to validate for each chunk
        cpu : int
            number of running processors
        """
        # make output directory
        output_dir = "{}validate_{}/".format(
            output_dir,
            datetime.datetime.now(timezone(
                self.tz)).strftime("%Y%m%d-%H%M%S"))
        self.output_dir = output_dir
        self.fp.mkdir(output_dir)
        self.lg.logtxt("create output directory: {}".format(output_dir))
        self.fp.writecsv(self.df, "{}input_actual.csv".format(output_dir))

        # write external features
        if self.ext is not None:
            self.fp.writecsv(self.ext,
                             "{}input_external.csv".format(output_dir))
            self.fp.writecsv(self.ext_lag,
                             "{}input_externallag.csv".format(output_dir))
            self.lg.logtxt(
                "write input file: {}input_actual.csv | {}input_external.csv | {}input_externallag.csv"
                .format(output_dir, output_dir, output_dir))
        else:
            self.lg.logtxt(
                "write input file: {}input_actual.csv".format(output_dir))

        # set parameter
        items = self.df['id'].unique()
        n_chunk = sum(1 for _ in chunker(items, chunk_sz))
        # One test date per month: each month start from the range, shifted
        # to the same day of month as test_st.
        # NOTE(review): with freq='MS' a test_st that is not the 1st makes
        # the range begin at the following month start, so the first test
        # date falls one month after test_st - confirm this is intended.
        day_shift = datetime.timedelta(days=test_st.day - 1)
        test_date = [
            month_start.to_pydatetime() + day_shift
            for month_start in pd.date_range(
                start=test_st, periods=test_pr, freq='MS')
        ]
        self.lg.logtxt(
            "total items: {} | chunk size: {} | total chunk: {}".format(
                len(items), chunk_sz, n_chunk))

        # loop by chunk
        cpu_count = 1 if cpu <= 1 else min(cpu, multiprocessing.cpu_count())
        self.lg.logtxt("run at {} processor(s)".format(cpu_count))
        # NOTE(review): with more than one process the bound method is sent
        # to the workers, so errors logged through self.lg there presumably
        # never reach this process's log - confirm.
        for i, c in enumerate(chunker(items, chunk_sz), 1):
            arg_rows = [[x, act_st, test_date, test_model, fcst_pr, pr_st, i]
                        for x in c]
            df_fcst = self._concat_frames(
                self._run_items(self.validate_byitem, arg_rows, cpu_count))

            # write csv file
            output_path = "{}output_validate_{:04d}-{:04d}.csv".format(
                output_dir, i, n_chunk)
            self.fp.writecsv(df_fcst, output_path)
            self.lg.logtxt("write output file ({}/{}): {}".format(
                i, n_chunk, output_path))
        self.lg.logtxt("[END VALIDATION]")
        self.lg.writelog("{}logfile.log".format(output_dir))