def main(cutoff_date):
    s_ut.my_print('loading to sup.dim_cx_ticket_forecast the forecast with cutoff date ' + str(cutoff_date))
    t_file = os.path.expanduser('~/Forecasts/par/table_output_' + str(cutoff_date) + '.par')
    s_ut.my_print('table file: ' + str(t_file))
    if os.path.isfile(t_file):
        df = p_ut.read_df(t_file)
        p_ut.set_week_start(df, tcol='ds')  # week_starting patch
        df_cols_ = df.columns
        if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
            df['ds_week_ending'] = pd.to_datetime(df['ds_week_ending'])
            df['ds_week_starting'] = df['ds_week_ending'] - pd.to_timedelta(6, unit='D')
        s_ut.my_print('data file: ' + str(t_file) + ' rows: ' + str(len(df)) + ' to table')
        partition = {'ds': str(cutoff_date)}
        table = 'sup.dim_cx_ticket_forecast'
        ap.hive.push(df, table=table, if_exists='replace', partition=partition,
                     table_props={'abb_retention_days': '-1', 'abb_retention_days_reason': 'fact table. No pii'})
        return 0
    else:
        s_ut.my_print('ERROR: failed to load: file ' + t_file + ' is missing')
        return -1
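# Hypothetical usage sketch for main() above -- not part of the original
# module. It assumes the loader is driven by a cutoff date string matching the
# '~/Forecasts/par/table_output_<cutoff_date>.par' naming convention and that
# pd and s_ut are imported at module level, as elsewhere in this file.
def _demo_load(cutoff=None):
    cutoff = str(pd.to_datetime('today').date()) if cutoff is None else cutoff
    ret = main(cutoff)  # 0 on success, -1 if the parquet file is missing
    if ret != 0:
        s_ut.my_print('demo: load failed for cutoff date ' + str(cutoff))
    return ret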
def table_load(dr_, cutoff_date_, m_adj=1.0):
    # load the adjusted data to the table
    # read the table files '~/Forecasts/par/table_output_<date>.par'
    gcols = ['dim_business_unit', 'dim_language', 'dim_tier', 'dim_channel', 'time_interval']
    t_list, max_ds_ = list(), None
    for d_ in dr_:
        fname_ = os.path.expanduser('~/Forecasts/par/table_output_' + str(d_.date())) + '.par'
        s_ut.my_print('rolling date: ' + str(d_.date()) + ' fname: ' + str(fname_))
        if os.path.exists(fname_):
            fx = p_ut.read_df(fname_)
            p_ut.set_week_start(fx, tcol='fcst_date_inv_ending')  # week_starting patch
            df_cols__ = fx.columns
            if 'ds_week_ending' in df_cols__ and 'ds_week_starting' not in df_cols__:
                fx['ds_week_ending'] = pd.to_datetime(fx['ds_week_ending'])
                fx['ds_week_starting'] = fx['ds_week_ending'] - pd.to_timedelta(6, unit='D')
            fv = process_w_df(fx, cutoff_date_, 'fcst_date_inv_ending', gcols + ['run_date_inv_ending'])
            max_ds_ = fv['fcst_date_inv_ending'].max() if max_ds_ is None else min(max_ds_, fv['fcst_date_inv_ending'].max())
            t_list.append(fv)
    tdf = pd.concat(t_list, axis=0)
    t_fdf = tdf.groupby(gcols + ['fcst_date_inv_ending']).apply(lambda x: x['ticket_count'].mean()).reset_index()
    t_fdf.rename(columns={0: 'ticket_count'}, inplace=True)
    avg_tdf = t_fdf[t_fdf['fcst_date_inv_ending'] <= max_ds_].copy()
    avg_tdf['run_date_inv_ending'] = str(cutoff_date_.date())
    avg_tdf.reset_index(inplace=True)
    avg_tdf.rename(columns={'index': 'fcst_horizon'}, inplace=True)
    avg_tdf['fcst_date_inv_ending'] = avg_tdf['fcst_date_inv_ending'].dt.date.astype(str)
    avg_tdf['ticket_count'] *= m_adj
    print('******* saving data to load to sup.dim_cx_ticket_forecast >>>>>>>>>>>>>>')
    p_ut.save_df(avg_tdf, '~/my_tmp/tab_data_' + str(cutoff_date_.date()))
    print('---------------- SKIPPING TABLE ---------------------')
    ret = -1
    # ret = t2t.to_table(avg_tdf, str(cutoff_date_.date()), 'sup.dim_cx_ticket_forecast')  # or 'josep.dim_ticket_facst_test'
    if ret == -1:
        s_ut.my_print('ERROR: table push failed')
    return ret
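# A minimal standalone sketch (toy column names, made-up data) of the
# rolling-average step in table_load() above: forecasts issued on several run
# dates are averaged per group and date, then truncated at the shortest common
# horizon -- which is what the max_ds_ bookkeeping implements.
def _demo_rolling_avg():
    runs = [
        pd.DataFrame({'grp': 'a', 'fcst_date': pd.date_range('2020-01-04', periods=4, freq='W-SAT'),
                      'ticket_count': [10.0, 12.0, 14.0, 16.0]}),
        pd.DataFrame({'grp': 'a', 'fcst_date': pd.date_range('2020-01-04', periods=3, freq='W-SAT'),
                      'ticket_count': [11.0, 13.0, 15.0]}),
    ]
    max_ds = min(r['fcst_date'].max() for r in runs)  # shortest common horizon
    avg = pd.concat(runs, axis=0).groupby(['grp', 'fcst_date'])['ticket_count'].mean().reset_index()
    return avg[avg['fcst_date'] <= max_ds]  # drop dates not covered by every run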
def get_fcast_(ts_name, cutoff_date, e_date):
    # get the forecast
    froot = '~/my_tmp/fbp/'
    fname = froot + 'lang_fcast_' + ts_name + '_' + str(cutoff_date.date())
    fcast_df = p_ut.read_df(fname)
    if fcast_df is None:
        s_ut.my_print('ERROR: no forecasts for ' + str(ts_name) + ' and cutoff date ' + str(cutoff_date.date()))
        sys.exit()
    elif fcast_df['ds'].max() <= e_date:
        s_ut.my_print('ERROR: no forecasts for ' + str(ts_name) + ' and cutoff date ' + str(cutoff_date.date()) + ' and horizon ' + str(e_date.date()))
        sys.exit()
    else:
        p_ut.set_week_start(fcast_df, tcol='ds')  # week_starting patch
        return fcast_df
def get_cfg_data(ts_name, cfg_cols, p_col):
    # read all the cfgs and set the cfg_idx
    t_name = 'sup.fct_cx_forecast_config'
    qry = 'select * from ' + t_name + ';'
    q_file = '/tmp/read_cfg_' + ts_name + '.hql'
    with open(q_file, 'w') as f:
        f.write(qry)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' query file: ' + q_file)
    fout = None
    ret = hql.run_hql((q_file, q_file), fout)
    if ret == -1:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: query failed. No configs found')
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' fcast cfg file: ' + ret)
    cfg_df = p_ut.read_df(ret, sep='\t')
    if cfg_df is None or len(cfg_df) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no data for query: ' + str(qry))
        sys.exit()
    p_ut.set_week_start(cfg_df, tcol='ds')  # week_starting patch
    dcol = {x: x.replace(t_name.split('.')[-1] + '.', '') for x in cfg_df.columns}
    cfg_df.rename(columns=dcol, inplace=True)
    cfg_df['cutoff'] = pd.to_datetime(cfg_df['cutoff'])
    cfg_df = cfg_df[cfg_df['ts_name'] == ts_name]
    cfg_df = cfg_df[cfg_df[p_col] > 0.0].copy()
    cfg_df.fillna('None', inplace=True)
    cfg_df['cfg_str'] = cfg_df.apply(lambda x: json.dumps(x[cfg_cols].to_dict()), axis=1)
    z = cfg_df['cfg_str'].drop_duplicates()
    zf = pd.DataFrame(z)
    zf.reset_index(inplace=True, drop=True)
    zf.reset_index(inplace=True)
    zf.columns = ['cfg_idx', 'cfg_str']
    df = cfg_df.merge(zf, on=['cfg_str'], how='left')
    df['language'].replace(['Mandarin_Offshore', 'Mandarin_Onshore'], 'Mandarin', inplace=True)  # Mandarin needs to be fixed later
    df.drop_duplicates(inplace=True)
    p_ut.save_df(df, '~/my_tmp/rk_df_' + ts_name)
    # df = p_ut.read_df('~/my_tmp/rk_df_' + ts_name)
    return df
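# Toy sketch of the cfg_idx assignment in get_cfg_data() above: each distinct
# JSON-encoded config string gets a dense integer index, which is then merged
# back onto every row carrying that config. Data values are made up.
def _demo_cfg_idx():
    cfg = pd.DataFrame({'growth': ['linear', 'logistic', 'linear'], 'xform': ['log', None, 'log']})
    cfg['cfg_str'] = cfg.apply(lambda x: json.dumps(x.to_dict()), axis=1)
    zf = cfg['cfg_str'].drop_duplicates().reset_index(drop=True).reset_index()
    zf.columns = ['cfg_idx', 'cfg_str']
    return cfg.merge(zf, on=['cfg_str'], how='left')  # rows 0 and 2 share cfg_idx 0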
def get_actuals_(ts_name, ts_cfg, e_date):
    # get the actuals
    froot = ts_cfg['data_path'].split('/')
    a_dir = os.path.expanduser('/'.join(froot[:-1])) + '/'
    fname = froot[-1]
    actuals_df = None
    for f in os.listdir(a_dir):
        if fname in f:
            s_ut.my_print('actuals file: ' + a_dir + str(f.split('.')[0]))
            actuals_df = p_ut.read_df(a_dir + f.split('.')[0])
            if actuals_df is not None and actuals_df['ds'].max() >= e_date:
                break
    if actuals_df is None:
        s_ut.my_print('ERROR: no actuals for ' + ts_name + ' and horizon ' + str(e_date.date()))
        sys.exit()
    else:
        p_ut.set_week_start(actuals_df, tcol='ds')  # week_starting patch
        return actuals_df
def get_fcast_cfg(ts_name, cutoff_date):
    fdir = '~/my_tmp/cfg_sel/'
    fperf = os.path.expanduser(fdir + 'cfg_sel_' + ts_name + '_' + cutoff_date)
    fidx = os.path.expanduser(fdir + 'cfg_idx_' + ts_name + '_' + cutoff_date)
    df_cfg = p_ut.read_df(fidx)
    p_ut.set_week_start(df_cfg, tcol='ds')  # week_starting patch
    dfp = p_ut.read_df(fperf)
    p_ut.set_week_start(dfp, tcol='ds')  # week_starting patch
    f_list = list()
    for l, f in dfp.groupby('language'):
        tf = f.nsmallest(n=1, columns=['f_err'])
        cfg_list = list(tf.loc[tf.index[0], 'cfg_idx'][0])
        print(l)
        print(cfg_list)
        fi = df_cfg[(df_cfg['language'] == l) & (df_cfg['cfg_idx'].isin(cfg_list))]
        fi.drop('f_err', axis=1, inplace=True)
        fi.drop_duplicates(inplace=True)
        print(fi)
        f_list.append(fi)
    return pd.concat(f_list) if len(f_list) > 0 else None
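# Toy sketch of the per-language selection in get_fcast_cfg() above:
# nsmallest(1) keeps the row with the lowest f_err for each language, and the
# cfg_idx cell is assumed to hold a nested sequence of ensemble members, as
# the tf.loc[...][0] indexing in the original implies.
def _demo_best_cfg():
    dfp = pd.DataFrame({'language': ['en', 'en', 'fr'],
                        'f_err': [0.10, 0.07, 0.20],
                        'cfg_idx': [[(1, 2)], [(3, 4)], [(5,)]]})
    out = dict()
    for l, f in dfp.groupby('language'):
        tf = f.nsmallest(n=1, columns=['f_err'])
        out[l] = list(tf.loc[tf.index[0], 'cfg_idx'][0])
    return out  # {'en': [3, 4], 'fr': [5]}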
def get_fcast(cutoff_date_, froot, months=3):
    # get the fcast issued <months> ago
    f_month = 1 + (cutoff_date_.month - months) % 12  # fcast issue month
    yr = cutoff_date_.year if f_month < cutoff_date_.month else cutoff_date_.year - 1
    dm = pd.to_datetime(str(yr) + '-' + str(f_month) + '-01')  # 1st day of issue month
    wd = dm.weekday()
    # Saturday on or before the 1st of the issue month
    fcast_sat = dm - pd.to_timedelta(wd + 2, unit='D') if wd < 5 else dm - pd.to_timedelta(wd - 5, unit='D')
    fcast_f = froot + str(fcast_sat.date())
    try:
        fdf = p_ut.read_df(os.path.expanduser(fcast_f))
    except OSError:
        s_ut.my_print('file not found: ' + fcast_f)
        return None
    if fdf is None:
        return None
    else:
        p_ut.set_week_start(fdf, tcol='ds')  # week_starting patch
        fdf.rename(columns={'ticket_count': 'forecasted_count'}, inplace=True)
        s_ut.my_print('getting forecast from ' + fcast_f)
        return fdf
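# Worked example of the "Saturday on or before the 1st of the issue month"
# arithmetic in get_fcast() above. With Monday == 0 ... Sunday == 6, a weekday
# wd < 5 (Mon-Fri) maps to the Saturday wd + 2 days back, while Sat/Sun map
# wd - 5 days back (0 days for Saturday itself).
def _demo_fcast_sat():
    dm = pd.to_datetime('2020-01-01')  # a Wednesday, weekday() == 2
    wd = dm.weekday()
    fcast_sat = dm - pd.to_timedelta(wd + 2 if wd < 5 else wd - 5, unit='D')
    return fcast_sat.date()  # 2019-12-28, the Saturday before Jan 1, 2020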
def prepare_regs(r_name, rcfg, cutoff_date, fcast_days, init_date):
    s_ut.my_print('pid: ' + str(os.getpid()) + ' preparing regressor ' + str(r_name))
    in_file, r_col_dict, key_cols = rcfg['data_path'], rcfg['r_col'], rcfg.get('key_cols', None)

    # regressors: set deterministic indicators
    if r_name == 'peak':  # peak season indicator. No clean up, imputation or forecast
        r_col = list(r_col_dict.keys())[0]  # peaks
        df = pd.DataFrame({'ds': pd.date_range(start=pd.to_datetime(init_date),
                                               end=pd.to_datetime(cutoff_date) + pd.to_timedelta(fcast_days, unit='D'),
                                               freq='D')})
        df[r_col] = df['ds'].apply(lambda x: 1 if x.month_name() in ['July', 'August'] else 0)
        regressors.IndicatorRegressor('peak', 'peak', 'ds', init_date, cutoff_date, ['July', 'August'], fcast_days, dim_cols=None)
        return df

    # other regressors: clean up, imputation and forecast (later)
    r_file = d_proc.get_data_file(rcfg['data_path'], cutoff_date)
    if r_file is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: date ' + str(cutoff_date.date()) + ' has no data for regressor ' + r_name)
        return None
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' found data file for regressor ' + r_name + ' and date ' + str(cutoff_date.date()) + ': ' + r_file)
        rdf = p_ut.read_df(r_file)
        p_ut.set_week_start(rdf, tcol='ds')  # week_starting patch
        rdf = rdf[(rdf['ds'] >= pd.to_datetime(init_date)) & (rdf['ds'] <= pd.to_datetime(cutoff_date))].copy()
        if key_cols is not None:  # get only relevant data
            for c, v in key_cols.items():
                rdf = rdf[rdf[c] == v]
        rdf['ceiling'] = rcfg.get('ceiling', 1)
        rdf['floor'] = rcfg.get('floor', 0)
        if r_name == 'contact-rate':
            if len(rdf) > 0:
                dim_cols = 'language' if 'language' in rdf.columns else None
                regressors.Regressor('contact-rate', 'contact_rate', 'ds', rdf, rcfg, init_date, cutoff_date, fcast_days, dim_cols=dim_cols)
                if 'language' in rdf.columns:
                    return rdf[['ds', 'language', 'contact_rate', 'ceiling', 'floor']]
                else:
                    return rdf[['ds', 'contact_rate', 'ceiling', 'floor']]
        elif r_name == 'tenure':
            if len(rdf) > 0:
                regressors.Regressor('tenure', 'tenure_days', 'ds', rdf, rcfg, init_date, cutoff_date, fcast_days, dim_cols=['language'])
                return rdf[['ds', 'language', 'tenure_days']]
        elif r_name == 'bookings' or r_name == 'checkins':
            if len(rdf) > 0:
                regressors.Regressor(r_name, r_name[:-1] + '_count', 'ds', rdf, rcfg, init_date, cutoff_date, fcast_days, dim_cols=['language'])
                return rdf
        else:
            s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: unknown regressor: ' + str(r_name))
            return None
def main(ts_name_, cutoff_date_):
    cfg_cols = ['growth', 'y_mode', 'w_mode', 'r_mode', 'xform', 'h_mode', 'training', 'do_res', 'changepoint_range']
    upr_horizon, lwr_horizon = 112, 84
    lbl = ts_name_ + '_' + cutoff_date_
    if ts_name_ == 'phone-inbound-vol':
        fname = dtp.get_data_file('~/my_tmp/cleaned/phone-vol_cleaned_', cutoff_date_)
        interaction_type = 'inbound'
    else:
        fname = dtp.get_data_file('~/my_tmp/cleaned/phone-aht_cleaned_', cutoff_date_)
        interaction_type = 'inbound' if 'inbound' in ts_name_ else 'outbound'

    # actuals
    s_ut.my_print('pid: ' + str(os.getpid()) + ' actuals file: ' + str(fname))
    q_df = pd.read_parquet(fname)
    # week_starting patch
    df_cols_ = q_df.columns
    if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
        q_df['ds_week_ending'] = pd.to_datetime(q_df['ds_week_ending'])
        q_df['ds_week_starting'] = q_df['ds_week_ending'] - pd.to_timedelta(6, unit='D')
    q_df['ds'] = pd.to_datetime(q_df['ds'].values)
    w_col = 'y' if 'vol' in ts_name_ else 'calls'

    # daily actuals (language level)
    if ts_name_ == 'phone-inbound-vol':
        q_df = q_df.groupby(['ds', 'language']).agg({'offered': np.sum, 'accepted': np.sum, 'abandons': np.sum}).reset_index()
    else:
        q_df = q_df[q_df['interaction_type'] == interaction_type].copy()
        q_df = q_df.groupby(['ds', 'language']).agg({'calls': np.sum, 'agent_mins': np.sum}).reset_index()
    a_ddf, ctype = set_demand(q_df.copy(), 10, ts_name_)
    w_df = a_ddf[a_ddf['ds'] <= cutoff_date_].groupby('language').agg({w_col: np.sum}).reset_index()
    w_df.columns = ['language', 'weight']
    p_ut.save_df(a_ddf, '~/my_tmp/a_daily_df_' + lbl)

    # weekly level: use week starting
    if ts_name_ == 'phone-inbound-vol':
        m_df = q_df.groupby(pd.Grouper(key='ds', freq='W-SUN')).agg({'offered': np.sum, 'accepted': np.sum, 'abandons': np.sum}).reset_index()
    else:
        m_df = q_df.groupby(pd.Grouper(key='ds', freq='W-SUN')).agg({'calls': np.sum, 'agent_mins': np.sum}).reset_index()
    a_wdf_, ctype = set_demand(m_df, 10, ts_name_)
    a_wdf = a_wdf_.copy()
    horizon_date = min(pd.to_datetime(cutoff_date_) + pd.to_timedelta(upr_horizon, unit='D'), a_wdf['ds'].max())
    a_wdf['ds_week_ending'] = a_wdf['ds'] + pd.to_timedelta(6, unit='D')  # switch to week ending so that we do not have incomplete weeks at the end
    a_wdf = a_wdf[(a_wdf['ds_week_ending'] <= horizon_date) & (a_wdf['ds_week_ending'] > cutoff_date_)].copy()
    a_wdf.drop('ds', axis=1, inplace=True)
    a_wdf['ts_name'] = ts_name_
    p_ut.save_df(a_wdf, '~/my_tmp/a_weekly_df_' + lbl)

    # DS forecasts: select the top fcast cfgs for each language, score them based on past performance and forecast them
    sdir = '~/my_tmp/cfg_sel/'
    df_best = p_ut.read_df(sdir + 'cfg_best_' + ts_name_ + '_' + cutoff_date_)  # best ensembles by idx
    p_ut.set_week_start(df_best, tcol='ds')  # week_starting patch
    z = df_best[['language', 'cfg_idx']].copy()
    z.set_index('language', inplace=True)
    dx = z.to_dict()['cfg_idx']
    dx = {k: list(v) for k, v in dx.items()}
    df_idx = p_ut.read_df(sdir + 'cfg_idx_' + ts_name_ + '_' + cutoff_date_)  # map cfg_idx to fcast cfg
    p_ut.set_week_start(df_idx, tcol='ds')  # week_starting patch
    df_idx = df_idx[['language', 'cfg_idx'] + cfg_cols].copy()
    df_idx.drop_duplicates(inplace=True)

    # fix None for fcasts
    for c in cfg_cols:
        df_idx[c] = df_idx[c].apply(lambda x: None if x == 'None' else x)
    df_idx['h_mode'] = df_idx['h_mode'].apply(lambda x: x == 1)
    df_idx['do_res'] = df_idx['do_res'].apply(lambda x: x == 1)
    cfg_df = pd.concat([lf[lf['cfg_idx'].isin(dx[l])] for l, lf in df_idx.groupby('language')], axis=0)

    # run the fcasts for the selected cfg's
    file_out = lfc.main(ts_name_, cutoff_date_, cfg_cols, to_db=False, df_cfg=cfg_df.copy(), is_mp=True)  # , is_fcast=False)
    if file_out is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no fcast file returned')
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' +++++++++++++ completed forecasts:: file: ' + str(file_out) + ' +++++++++++++++ ')
    fdf = pd.read_parquet(file_out)
    # week_starting patch
    df_cols_ = fdf.columns
    if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
        fdf['ds_week_ending'] = pd.to_datetime(fdf['ds_week_ending'])
        fdf['ds_week_starting'] = fdf['ds_week_ending'] - pd.to_timedelta(6, unit='D')
    print(fdf.head())

    # make sure fdf and perf_df have the same cfg's
    # fdf_list = list(fdf['cfg_idx'].unique())
    # perf_df = perf_df[perf_df['cfg_idx'].isin(fdf_list)].copy()
    # cfg_df = cfg_df[cfg_df['cfg_idx'].isin(fdf_list)].copy()
    # arr = [int(x * num_cfg_) for x in [1.0, 0.75, 0.5, 0.25, 0.125]]
    # arg_list = [[fdf, a_ddf, w_df, cfg_df, ts_name_, cutoff_date_, horizon_date] for k in arr if k > 1]
    # m_list = s_ut.do_mp(get_models, arg_list, is_mp=True, cpus=len(arr), do_sigkill=True)  # list of dicts
    m_list = get_models(fdf, a_ddf, w_df, ts_name_, cutoff_date_, horizon_date)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ============== main: get_models complete. appending results ==========')
    if len(m_list) > 0:
        d_out = dict()
        for dv in m_list:
            for k, fname in dv.items():
                if k not in d_out.keys():
                    d_out[k] = list()
                fz = p_ut.read_df(fname)
                p_ut.set_week_start(fz, tcol='ds')  # week_starting patch
                d_out[k].append(fz)
        return {k: pd.concat(v, axis=0) for k, v in d_out.items()}
    else:
        return None
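# Small sketch (toy values) of the language -> cfg_idx-list mapping built in
# main() above: set_index + to_dict() yields {column: {index: value}}, hence
# the ['cfg_idx'] lookup before list-ifying each entry.
def _demo_best_map():
    df_best = pd.DataFrame({'language': ['en', 'fr'], 'cfg_idx': [(0, 3), (2,)]})
    dx = df_best.set_index('language').to_dict()['cfg_idx']
    return {k: list(v) for k, v in dx.items()}  # {'en': [0, 3], 'fr': [2]}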
def main(argv):
    print(argv)
    time_scale = 'W'  # reset for daily ticket data
    if len(sys.argv) == 2:
        run_date = sys.argv[1]  # at least 3 days after the last Saturday with actual data
        with_bu = True
        s_ut.my_print('WARNING: with_bu not set in command line. Assuming with_bu=True')
    elif len(sys.argv) == 3:
        _, run_date, bu = sys.argv  # at least 3 days after the last Saturday with actual data
        with_bu = bool(int(bu))
    elif len(sys.argv) == 1:
        with_bu = True
        run_date = str(pd.to_datetime('today').date())
    else:
        print('invalid args: ' + str(sys.argv))
        sys.exit()
    cutoff_date = tm_ut.get_last_sat(run_date)  # set to last Saturday
    if time_scale == 'W':
        upr_horizon, lwr_horizon = 75, None
        fcast_days = 7 * upr_horizon  # regardless of time_scale
        inc_start, inc_end = 4, 0
    else:
        upr_horizon, lwr_horizon = 75 * 7, None
        fcast_days = upr_horizon
        inc_start, inc_end = 28, 0
    fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    # get actuals
    act_df = p_ut.read_df('~/my_tmp/tix_act_df_' + str(cutoff_date.date()))
    if act_df is None:
        s_ut.my_print('ERROR: No actuals found')
        sys.exit()
    p_ut.set_week_start(act_df, tcol='ds')  # week_starting patch

    # get lang fcast
    froot = '~/my_tmp/fbp_tix_'
    fname = froot + 'lwbu_fcast_' if with_bu is True else froot + 'lnbu_fcast_'
    fcast_df = p_ut.read_df(fname + str(cutoff_date.date()))
    if fcast_df is None:
        s_ut.my_print('ERROR: No fcast found')
        sys.exit()
    p_ut.set_week_start(fcast_df, tcol='ds')  # week_starting patch

    b_cols = ['agent_sector', 'channel']
    g_cols = ['language']
    if with_bu is False:
        b_cols.append('business_unit')
        fcast_df.drop('business_unit', inplace=True, axis=1)  # all None
    else:
        g_cols.append('business_unit')
    s_ut.my_print('------------------------- start biz level forecast from cutoff date ' + str(cutoff_date.date()) +
                  ' to forecast date ' + str(fcast_date.date()) + ' with business columns: ' + str(b_cols) + ' ------------')
    b_fcast = biz_fcast(fcast_df, act_df, g_cols, b_cols, cutoff_date, time_scale, inc_start, inc_end)
    if b_fcast is not None:  # save all the fcasts for each fcast cfg
        froot = '~/my_tmp/fbp_tix_'
        fname = froot + 'wbu_b_fcast_' if with_bu is True else froot + 'nbu_b_fcast_'
        p_ut.save_df(b_fcast, fname + str(cutoff_date.date()))
    else:
        print('ERROR: no business fcast')
        sys.exit()

    # final fcast (ens_avg)
    ens_df = ens_fcast(b_fcast, act_df, cutoff_date, g_cols, b_cols)
    froot = '~/my_tmp/fbp_tix_'
    fname = froot + 'wbu_e_fcast_' if with_bu is True else froot + 'nbu_e_fcast_'
    p_ut.save_df(ens_df, fname + str(cutoff_date.date()))

    print('++++++++++++++ Error Summary ++++++++++++')
    # check for language error
    fdf = ens_df.groupby(['ds', 'language']).agg({'y_pred': np.sum}).reset_index()
    months = 3
    m_start = pd.to_datetime(str(cutoff_date.year) + '-' + str(cutoff_date.month) + '-01') + pd.DateOffset(months=months + 1)
    end_date = tm_ut.last_saturday_month(m_start)  # max date for err check
    collect_date = cutoff_date - pd.DateOffset(months=months)
    start_date = end_date - pd.to_timedelta(2, unit='W')  # start date for the error check
    a_df, _ = t_ut.get_actuals(end_date, collect_date)  # actuals from collect date to end_date
    fa = t_ut.set_act(a_df, ['language'])  # clean TS for each language
    fa = fa[(fa['ds'] > start_date) & (fa['ds'] <= end_date)].copy()
    z = fa.merge(fdf, on=['ds', 'language'], how='left')
    z = z[(z['y_pred'] > 0) & (z['ticket_count'] > 0)].copy()
    z_lang = z.groupby('language').agg({'ticket_count': np.sum, 'y_pred': np.sum}).reset_index()
    z_all = pd.DataFrame({'language': ['All'],
                          'ticket_count': [z_lang['ticket_count'].sum()],
                          'y_pred': [z_lang['y_pred'].sum()]})
    z_lang = pd.concat([z_all, z_lang], axis=0)
    z_lang['err'] = np.abs((z_lang['y_pred'] / z_lang['ticket_count']) - 1)
    print(z_lang)
    # t_ut.err_chk(ens_df, cutoff_date, [['language']], ycol='y_pred', months=3)
    print('DONE')
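# Sketch of the error summary at the end of main() above, with made-up counts:
# the 'All' row aggregates across languages and err is the absolute relative
# error |y_pred / actual - 1|.
def _demo_err_summary():
    z_lang = pd.DataFrame({'language': ['en', 'fr'], 'ticket_count': [100.0, 50.0], 'y_pred': [90.0, 60.0]})
    z_all = pd.DataFrame({'language': ['All'],
                          'ticket_count': [z_lang['ticket_count'].sum()],
                          'y_pred': [z_lang['y_pred'].sum()]})
    z_lang = pd.concat([z_all, z_lang], axis=0)
    z_lang['err'] = np.abs((z_lang['y_pred'] / z_lang['ticket_count']) - 1)
    return z_lang  # err: All 0.0, en 0.10, fr 0.20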
def prepare_regs(r_name, rcfg, cutoff_date, fcast_days, int_type, init_date):
    s_ut.my_print('pid: ' + str(os.getpid()) + ' preparing regressor ' + str(r_name))
    in_file, r_col_dict, key_cols = rcfg['data_path'], rcfg['r_col'], rcfg.get('key_cols', None)

    # regressors: set deterministic indicators
    if r_name == 'peak':  # peak season indicator. No clean up, imputation or forecast
        r_col = list(r_col_dict.keys())[0]  # peaks
        df = pd.DataFrame({'ds': pd.date_range(start=init_date,
                                               end=cutoff_date + pd.to_timedelta(fcast_days, unit='D'),
                                               freq='D')})
        df[r_col] = df['ds'].apply(lambda x: 1 if x.month_name() in ['July', 'August'] else 0)
        return regressors.IndicatorRegressor('peak', 'peak', 'ds', init_date, cutoff_date, ['July', 'August'], fcast_days)
    else:  # other regressors
        r_file = d_proc.get_data_file(rcfg['data_path'], cutoff_date)
        if r_file is None:
            s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: date ' + str(cutoff_date.date()) + ' has no data for regressor ' + r_name)
            return None
        else:
            s_ut.my_print('pid: ' + str(os.getpid()) + ' found data file for regressor ' + r_name + ' and date ' + str(cutoff_date.date()) + ': ' + r_file)
            rdf = p_ut.read_df(r_file)
            p_ut.set_week_start(rdf, tcol='ds')  # week_starting patch
            rdf = rdf[(rdf['ds'] >= pd.to_datetime(init_date)) & (rdf['ds'] <= cutoff_date)].copy()
            if rdf['ds'].max() < pd.to_datetime(cutoff_date):
                s_ut.my_print('WARNING: ' + r_name + ' max date (' + str(rdf['ds'].max().date()) +
                              ') is smaller than cutoff date (' + str(cutoff_date.date()) + ')')
            elif len(rdf) > 0:
                if 'interaction_type' in rdf.columns:
                    rdf = rdf[rdf['interaction_type'] == int_type].copy()
                if r_name == 'contact-rate':
                    if rdf['contact_rate'].min() == 0.0:  # would mean no inbound tickets
                        zmin = rdf[rdf['contact_rate'] > 0.0]['contact_rate'].min() / 10.0
                        rdf['contact_rate'].replace(0.0, zmin, inplace=True)
                    return regressors.Regressor('contact-rate', 'contact_rate', 'ds',
                                                rdf[['ds', 'language', 'contact_rate']], rcfg,
                                                init_date, cutoff_date, fcast_days)
                elif r_name == 'tenure':
                    rdf = rdf.groupby(['ds', 'language']).agg({'tenure_days': np.sum}).reset_index()
                    return regressors.Regressor('tenure', 'tenure_days', 'ds', rdf, rcfg, init_date, cutoff_date, fcast_days)
                elif r_name == 'bookings' or r_name == 'checkins':
                    return regressors.Regressor(r_name, r_name[:-1] + '_count', 'ds', rdf, rcfg, init_date, cutoff_date, fcast_days)
                else:
                    s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: unknown regressor: ' + str(r_name))
                    return None
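# Sketch of the zero-replacement guard in the contact-rate branch above (toy
# rates): zeros, which would mean no inbound tickets, are replaced by one
# tenth of the smallest positive rate so the series stays strictly positive.
def _demo_zero_rate_fix():
    rdf = pd.DataFrame({'contact_rate': [0.0, 0.04, 0.02]})
    if rdf['contact_rate'].min() == 0.0:
        zmin = rdf[rdf['contact_rate'] > 0.0]['contact_rate'].min() / 10.0
        rdf['contact_rate'] = rdf['contact_rate'].replace(0.0, zmin)
    return rdf['contact_rate'].tolist()  # [0.002, 0.04, 0.02]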