def fcast_idx(f_df): fg = pd.DataFrame(f_df['dim_cfg'].copy()) fg.drop_duplicates(inplace=True) fg.reset_index(drop=True, inplace=True) fg.reset_index(inplace=True) p_ut.save_df(fg, '~/my_tmp/f_idx') return fg
def perf_smry(perf_df, cutoff_date, time_scale, ts_name, upr, lwr): # print smry and save upr_horizon = cutoff_date + pd.to_timedelta(upr, unit=time_scale) lwr_horizon = cutoff_date + pd.to_timedelta(lwr, unit=time_scale) if perf_df is not None: perf_df.sort_values(by='language', inplace=True) perf_df.reset_index(inplace=True, drop=True) perf_df['ts_name'] = ts_name perf_df['cutoff'] = cutoff_date s_ut.my_print( '########################### cutoff: ' + str(cutoff_date.date()) + ' ts_name: ' + str(ts_name) + ' performance between ' + str(lwr_horizon.date()) + ' (included) and ' + str(upr_horizon.date()) + ' (included) ##########################################') perf_df.sort_values(by=['language', 'err'], inplace=True) print(perf_df.head(10)) p_ut.save_df( perf_df, '~/my_tmp/fbp/lang_perf_' + ts_name + '_' + str(cutoff_date.date())) else: s_ut.my_print( 'WARNING: no actuals to compute fcast errors for the between ' + str(lwr_horizon.date()) + ' (included) and ' + str(upr_horizon.date()) + ' (included) ' + ' for cutoff: ' + str(cutoff_date.date()) + ' and ts_name: ' + str(ts_name))
def back_fcast(self, f_df): # undo data transforms # >>>>>>>>>>> only yhat's will be correct in the natural scale for non-trivial transforms <<<<<<<<<<< if self.verbose: s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: only yhat (all levels) back-transforms will be correct') levels = ['', '_upper', '_lower'] for lvl in levels: # ###################### should only inverse-xform the terms that exist (eg invert multiplicative_terms only if something is multiplicative) ############## for c in ['yhat', 'trend', 'additive_terms', 'multiplicative_terms']: y_var = self.xform_obj.fcast_var(f_df[[c + '_upper', c + '_lower']].copy(), self.prophet_obj.interval_width) # use the same var for upper and lower v = f_df[c + lvl].copy() f_df[c + lvl] = self.xform_obj.inverse_transform(v, y_var, lbl=c + lvl) # only yhat's will be correct in the natural scale for non-trivial transforms if f_df['yhat'].isnull().sum() > 0.05 * len(f_df): # more than pct nulls s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: fcast failed: too many NaNs for cfg ' + str(self.f_cfg)) s_ut.my_print(str(f_df['yhat'].isnull().sum()) + ' nulls on ' + str(len(f_df)) + ' rows') print(f_df.head(10)) print(f_df.isnull().sum()) p_ut.save_df(f_df, '~/my_tmp/f_df') return None else: f_df['yhat'].interpolate(inplace=True, limit_direction='both') f_df['yhat_upper'].interpolate(inplace=True, limit_direction='both') f_df['yhat_lower'].interpolate(inplace=True, limit_direction='both') if self.verbose: s_ut.my_print('pid: ' + str(os.getpid()) + ' back_fcast: forecast OK with cfg ' + str(self.f_cfg)) return f_df.copy()
def interpolate_(self, y, yt, nan_pct=0.2): # y: inverse-transformed values (values in natural scale) # yt: pre-inverse transform (values in transformed scale) if y is None: return None else: yx = np.reshape(y, (1, -1))[0] if self.method is not None else y nulls = pd.Series(yx).isnull().sum() pct = 100.0 * np.round(nulls / len(yx), 2) if nulls > nan_pct * np.ceil(len(yx)): su.my_print('WARNING: Too many NaN to interpolate for label ' + str(self.lbl) + ': ' + str(nulls) + ' out of ' + str(len(yx)) + ' (' + str(pct) + '%) data points and lambda ' + str(self.lmbda)) f = pd.DataFrame({'yt': list(yt), 'yx': list(yx)}) f['lmbda'] = self.lmbda p_ut.save_df(f, '~/my_tmp/interpolDF') return None elif 0 < nulls <= nan_pct * np.ceil( len(yx)): # interpolate yhat if some NaNs su.my_print('WARNING: interpolating for label ' + str(self.lbl) + ': ' + str(nulls) + ' NaNs out of ' + str(len(yx)) + ' data points (' + str(pct) + '%)') st = pd.Series(yx) sint = st.interpolate(limit_direction='both') yhat = sint.values ys = np.reshape(yhat, (1, -1)) return ys[0] else: # all OK return y
def data_check(df, name): df.replace([np.inf, -np.inf], np.nan, inplace=True) u_vals = [c for c in df.columns if df[c].nunique() <= 1] if df.isnull().sum().sum() > 0 or len(u_vals) > 0: p_ut.save_df(df, '~/my_tmp/f_data') s_ut.my_print('ERROR: invalid data for ' + str(name)) sys.exit()
def get_shift(fl, cu_sun, upr, lwr): # extra shift to max correlation lg = fl.loc[fl.index[0], 'language'] fl.drop('language', inplace=True, axis=1) zn = fl.copy() zn.set_index('ds', inplace=True) zs = zn[['y']].shift(364, freq='D') # shift prior year data to cutoff date f = zn.merge(zs, left_index=True, right_index=True, how='left') f.rename(columns={'y_y': 'y_shifted_', 'y_x': 'y'}, inplace=True) ws_adj = 4 # max adj shift lx = [_corr(f.copy(), cu_sun, v, upr) for v in range(-ws_adj, ws_adj + 1)] lx.sort(key=lambda x: x[0]) rho_max, ws_opt, le = lx[-1] print('language: ' + lg + ' week shift: ' + str(ws_opt) + ' best corr: ' + str(rho_max) + ' len: ' + str(le)) f['y_shifted'] = f['y_shifted_'].shift(ws_opt) f.drop('y_shifted_', axis=1, inplace=True) f.dropna(inplace=True) fx = lm_mdl(f, cu_sun, upr, lwr) if fx is not None: fx['opt_shift'] = ws_opt fx['corr'] = rho_max p_ut.save_df(fx, '~/my_tmp/fshift_' + lg) return fx else: return None
def perf_err(self): if self.mdl is None: return { 'ens': self.name, 'err': np.nan, 'n_features': len(self.features_), 'df': self.df, 'alpha': self.alpha, 'l1_ratio': self.lbda } else: d_mdl = { 'ens': self.name, 'n_features': len(self.features_), 'df': self.df, 'alpha': self.alpha, 'l1_ratio': self.lbda } y_test = self.y_scaler.inverse_transform( self.y_test) # the quantity to be forecasted x_test = self.f_test[self.features_].copy() p_ut.save_df(x_test, '~/my_tmp/x_test') # actuals regression based error (bad performance) # ys_pred = self.mdl.predict(self.X_test) # y_pred = self.y_scaler.inverse_transform(ys_pred) # d_mdl['mdl_err'] = self._perf_err(y_test, y_pred) # plain avg based error y_avg = x_test.mean(axis=1) d_mdl['avg_err'] = self._perf_err(y_test, y_avg.values) # IC based for ic in ['aic', 'aicc', 'bic', 'hqic']: _arr = np.exp((-0.5) * np.array([ self.IC(y_test, self.f_test[c].values, self.df, method=ic) for c in self.features_ ])) weights = _arr / np.sum(_arr) _avg = np.average(x_test, axis=1, weights=weights) d_mdl[ic + '_err'] = self._perf_err(y_test, _avg) # Regression on adj_y_shifted # variables need to be centered and scaled? lm = l_mdl.LinearRegression(normalize=False).fit( x_test, self.f_test['adj_y_shifted'].values) y_pred = lm.predict(x_test) d_mdl['mse_err'] = self._perf_err(y_test, y_pred) # adj err: use adj_y_shifted as forecast d_mdl['adj_err'] = self._perf_err( y_test, self.f_test['adj_y_shifted'].values) return d_mdl
def table_load(dr_, cutoff_date_, m_adj=1.0): # load to adjusted data to the table # read table files '~/Forecasts/par/' + 'table_output_' gcols = [ 'dim_business_unit', 'dim_language', 'dim_tier', 'dim_channel', 'time_interval' ] t_list, max_ds_ = list(), None for d_ in dr_: fname_ = os.path.expanduser('~/Forecasts/par/table_output_' + str(d_.date())) + '.par' s_ut.my_print('rolling date: ' + str(d_.date()) + ' fname: ' + str(fname_)) if os.path.exists(fname_): fx = p_ut.read_df(fname_) p_ut.set_week_start( fx, tcol='fcst_date_inv_endings') # week_starting patch # week_starting patch df_cols__ = fx.columns if 'ds_week_ending' in df_cols__ and 'ds_week_starting' not in df_cols__: fx['ds_week_ending'] = pd.to_datetime(fx['ds_week_ending']) fx['ds_week_starting'] = fx[ 'ds_week_ending'] - pd.to_timedelta(6, unit='D') fv = process_w_df(fx, cutoff_date_, 'fcst_date_inv_ending', gcols + ['run_date_inv_ending']) max_ds_ = fv['fcst_date_inv_ending'].max( ) if max_ds_ is None else min(max_ds_, fv['fcst_date_inv_ending'].max()) t_list.append(fv) tdf = pd.concat(t_list, axis=0) t_fdf = tdf.groupby(gcols + ['fcst_date_inv_ending']).apply( lambda x: x['ticket_count'].mean()).reset_index() t_fdf.rename(columns={0: 'ticket_count'}, inplace=True) avg_tdf = t_fdf[t_fdf['fcst_date_inv_ending'] <= max_ds_].copy() avg_tdf['run_date_inv_ending'] = str(cutoff_date_.date()) avg_tdf.reset_index(inplace=True) avg_tdf.rename(columns={'index': 'fcst_horizon'}, inplace=True) avg_tdf['fcst_date_inv_ending'] = avg_tdf[ 'fcst_date_inv_ending'].dt.date.astype(str) avg_tdf['ticket_count'] *= m_adj print( '******* saving data to load to sup.dim_cx_ticket_forecast >>>>>>>>>>>>>>' ) p_ut.save_df(avg_tdf, '~/my_tmp/tab_data_' + str(cutoff_date_.date())) print(888888888888888888888888) print('---------------- SKIPPING TABLE ---------------------') ret = -1 # ret = t2t.to_table(avg_tdf, str(cutoff_date_.date()), 'sup.dim_cx_ticket_forecast') # 'josep.dim_ticket_facst_test # 'sup.dim_cx_ticket_forecast' if ret == -1: s_ut.my_print('ERROR: table push failed') return ret
def merge_adj(df, adj_df, k_col, ts_cols, master_ts): # keeps coherence at master_ts mf = pd.concat([ set_factor(ds, kval, k_col, adj_df, ts_cols) for (ds, kval), f in df.groupby(['ds', k_col]) ], axis=0) p_ut.save_df(mf, '~/my_tmp/mf') fx = df.merge(mf, on=['ds', k_col], how='left') gx = fx.apply(row_adjust, ts_list=ts_cols, axis=1) gx[master_ts + '_tilde'] = gx[[c + '_tilde' for c in ts_cols]].sum(axis=1) return gx.copy()
def cfg_results(ada_cfg, X_train, y_train, X_pred, X_idx, y_res, xf_obj, y_col, p_col): # best AdaBoost regressor results topN = ada_cfg['topN'] obj = ada_cfg['obj'] ab_reg = AdaBoostRegressor(n_estimators=int(ada_cfg['adb_estimators']), base_estimator=DecisionTreeRegressor( max_depth=int(ada_cfg['max_depth']), min_samples_split=int( ada_cfg['min_samples_split'])), loss=ada_cfg['loss'], learning_rate=ada_cfg['learning_rate']) try: ab_reg.fit(X_train, y_train) except ValueError as e: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: could not fit: ' + str(e)) p_ut.save_df(X_train, '~/my_tmp/X_train') p_ut.save_df(y_train, '~/my_tmp/y_train') return dict() y_pred = ab_reg.predict(X_pred) # predict (transformed) y_col lf_out = pd.concat([ pd.DataFrame(X_idx, columns=['cfg_idx']), pd.DataFrame(y_pred, columns=[y_col + '_pred']) ], axis=1) # X_train y_train and X_test have same index lf_out.drop_duplicates(inplace=True) if y_res is not None: lf_out = y_res.merge(lf_out, on=['cfg_idx'], how='left') l_cfg = lf_out.nsmallest( n=topN, columns=[y_col + '_pred'] ) # can use transformed values because transforms are monotonic increasing d_best = { 'xform': xf_obj.name, 'topN': topN, 'obj': obj, 'yobj': y_col, 'yhat': l_cfg[y_col + '_pred'].mean(), 'cfg_idx': [list(l_cfg['cfg_idx'].values)] } if y_res is not None: d_best['y'] = l_cfg[y_col].mean() if p_col in l_cfg.columns: d_best[p_col] = l_cfg[p_col].mean() d_best[p_col + '_max'] = l_cfg[p_col].max() d_best.update(ada_cfg) return d_best
def prepare_data(self, data_df): # clean input data and build holiday DF # gcols_ = ['ds', 'language'] if 'language' in data_df.columns: data_df['language'].replace( ['Mandarin_Onshore', 'Mandarin_Offshore'], ['Mandarin', 'Mandarin'], inplace=True) else: data_df['language'] = 'NULL' dl_groups = data_df.groupby(['ds'] + self.gcols).agg( self.agg_dict).reset_index() df_dict, hf_dict = dict(), dict() for tpl, gf in dl_groups.groupby(self.gcols): if isinstance(tpl, str): tpl = (tpl, ) s_ut.my_print('************* data checks and hols for group: ' + str(tpl)) mf = None if self.ycol is None else d_proc.data_check( gf[['ds', self.ycol]].copy(), self.ycol, 'ds', self.cutoff_date, self.init_date, max_int=self.max_int, name=str(tpl), unit=self.time_scale) if mf is None: s_ut.my_print('WARNING: data_check failed for label ' + str(tpl)) p_ut.save_df(gf, '~/my_tmp/gf_' + str(tpl)) continue else: lang = tpl[ self.lang_idx] if self.lang_idx is not None else None d_df = self.set_demand(mf) h_df = self.get_holidays(lang) if lang is not None else None hf_dict[lang] = h_df # dict of hols DF by language (key) t_df = self.trim_outliers(d_df, h_df, self.outlier_coef, ['y'], lbl_dict=tpl) # trim outliers if t_df is not None: for ix in range(len(self.gcols)): # add the grouping cols t_df[self.gcols[ix]] = tpl[ix] if lang not in df_dict.keys(): df_dict[lang] = t_df # dict of trimmed DF by language else: zx = pd.concat([df_dict[lang].copy(), t_df], axis=0) df_dict[lang] = zx return df_dict, hf_dict
def rs_to_excel(df, pcol, ycol): # all, all - (safety + claims), all - directly, all - (safety + claims + directly) def to_spreadsheet(adf, tcol, ycol_): # top DF adf_tier = pd.pivot_table(adf, values=ycol_, index=['business_unit', 'channel', 'language'], columns=[tcol], aggfunc=sum).reset_index() adf_tier_all = pd.pivot_table(adf, values=ycol_, index=['business_unit', 'language'], columns=[tcol], aggfunc=sum).reset_index() adf_tier_all['channel'] = 'all' output_ = pd.concat([adf_tier_all, adf_tier], sort=True) output_['key'] = output_['business_unit'] + '-' + output_['channel'] + '-' + output_['language'] # spacer DF sf = pd.DataFrame(columns=output_.columns, index=range(3)) sf.loc[0] = [''] * len(output_.columns) sf.loc[2] = [''] * len(output_.columns) sf.loc[1] = [c for c in output_.columns] sf.loc[1, 'key'] = 'FORMULAS' # formulas DF pdf_ = pd.pivot_table(adf, values=ycol_, index=['channel', 'language'], columns=[tcol], aggfunc=sum).reset_index() pdf_['key'] = pdf_['channel'] + '-' + pdf_['language'] qdf = pd.DataFrame(pdf_.sum(axis=0)).transpose() qdf['key'] = 'All' fall = pd.concat([output_, sf, qdf, pdf_], axis=0, sort=True) fall.reset_index(inplace=True, drop=True) cols = fall.columns.values.tolist() _ = [cols.remove(c) for c in ['business_unit', 'channel', 'language', 'key']] cols.insert(0, 'key') return fall[cols] df[pcol] = df[pcol].dt.date.astype(str) output = dict() output['all'] = to_spreadsheet(df, pcol, ycol) df_nodirectly = df[df['service_tier'] != 'directly'] output['all - directly'] = to_spreadsheet(df_nodirectly, pcol, ycol) df_ = df[~((df['service_tier'] == 'safety') | (df['service_tier'] == 'claims'))] output['all - (safety&claims)'] = to_spreadsheet(df_, pcol, ycol) df_ = df[~((df['service_tier'] == 'safety') | (df['service_tier'] == 'claims') | (df['service_tier'] == 'directly'))] output['all - (safety&claims&directly)'] = to_spreadsheet(df_, pcol, ycol) for tr in df['service_tier'].unique(): if tr in ['Community Education', 'Resolutions 1']: print(11111111111111111) print(ycol) print(pcol) z = df.groupby(['ds', 'service_tier']).sum().reset_index() zz = z[(z['ds'] >= '2020-02-01') & (z['ds'] <= '2020-04-10')] print(zz[zz['service_tier'] == tr]) p_ut.save_df(df[df['service_tier'] == tr], '~/my_tmp/sh_df_' + tr) output[tr] = to_spreadsheet(df[df['service_tier'] == tr], pcol, ycol) return output
def fcast_errors(argv): if len(argv) == 3: ts_name, run_date = argv[ 1:] # at least 3 days after last Saturday with actual data else: print('invalid args: ' + str(sys.argv)) sys.exit() # data cfg data_cfg = FILE_PATH + '/config/data_cfg.json' with open(os.path.expanduser(data_cfg), 'r') as fp: d_cfg = json.load(fp) ts_cfg = d_cfg[ts_name] ts_cfg['name'] = ts_name ts_cfg['time_scale'] = time_scale cutoff_date = tm_ut.get_last_sat( run_date ) # set to saturday prior run_date or the run_date if a saturday e_date = cutoff_date + pd.to_timedelta( upr_horizon, unit='D') # error check end date (included) s_date = cutoff_date + pd.to_timedelta( lwr_horizon, unit='D') # error check start date (included) # get actuals actuals_df = get_actuals_(ts_name, ts_cfg, e_date) a_df_dict = dict() for w in ts_cfg['outlier_coef']: ts_cfg_ = copy.deepcopy(ts_cfg) ts_cfg_['outlier_coef'] = w a_df_dict[w] = prep_actuals(actuals_df, ts_cfg_, e_date) # get fcasts f_df = get_fcast_(ts_name, cutoff_date, e_date) # get language level errors and save err data e_list = list() for w, a_df in a_df_dict.items(): ts_cfg_ = copy.deepcopy(ts_cfg) ts_cfg_['outlier_coef'] = w e_lang_ = fcast_errors_(a_df, f_df, e_date, s_date, ts_cfg_['ycol']) e_list.append(e_lang_) e_lang = pd.concat(e_list) e_lang['ts_name'] = ts_name e_lang['cutoff'] = cutoff_date froot = '~/my_tmp/fbp/' fname = froot + 'lang_cfgs_' p_ut.save_df(e_lang, fname + ts_name + '_' + str(cutoff_date.date())) return
def fcast_regressors(lang, rf_, tcol, ycol, i_dt, f_days, do_fcast): # all regressors are forecasted using the default cfg s_ut.my_print('pid: ' + str(os.getpid()) + ' starting regressor fcast for language ' + str(lang) + ' cols: ' + str(list(rf_.columns))) if do_fcast.get(ycol, True) is True: rg = rf_[rf_['ds'] <= i_dt].copy() r = None if len(rg) > 0: if ycol == 'contact_rate': if rg[ycol].min() == 0.0: # would mean no inbound tickets zmin = rg[rg[ycol] > 0.0][ycol].min() / 10.0 rg[ycol].replace(0.0, zmin, inplace=True) fcfg = {'xform': 'logistic'} pdict = dict() for c in ['ceiling', 'floor']: if c in rg.columns: pdict[c] = rg.loc[rg.index[0], c] else: fcfg = None pdict = dict() rg.rename(columns={tcol: 'ds', ycol: 'y'}, inplace=True) f_obj = one_fcast.OneForecast(rg[['ds', 'y']], None, pdict, fcfg, f_days, time_scale='D', verbose=False) r = f_obj.forecast() if r is not None: r.rename(columns={'yhat': ycol}, inplace=True) r['language'] = lang if any(np.isinf(r[ycol])): s_ut.my_print('ERROR in reg forecast for ' + ycol + ' and language ' + str(lang)) p_ut.save_df(rg, '~/my_tmp/rg_' + lang + '_' + ycol) print(r.describe()) sys.exit() else: # nothing to fcast if rf_[tcol].max() < i_dt + pd.to_timedelta(f_days, unit='D'): print('WARNING: no forecast set but missing forecast values for ' + str(ycol) + ' Ignoring') r = None else: r = rf_[rf_[tcol] <= i_dt + pd.to_timedelta(f_days, unit='D')].copy() r['language'] = lang return r
def to_table(to_db, table, cutoff_date, ts_name, if_exists, df_out): cu_dt = str(cutoff_date.date()) df_out['cutoff'] = cutoff_date df_out['ts_name'] = ts_name file_out = p_ut.save_df(df_out, '~/my_tmp/fcast_df_' + cu_dt + '_' + ts_name) if to_db is True: partition = {'cutoff': cu_dt, 'ts_name': ts_name} df_out['ds'] = df_out['ds'].dt.date.astype(str) df_out.drop(['cutoff', 'ts_name'], axis=1, inplace=True) s_ut.my_print('Loading data to ' + table + ' for partition: ' + str(partition)) try: # presto does not work with a partition argument ap.hive.push(df_out, table=table, if_exists=if_exists, partition=partition, table_props={ 'abb_retention_days': '-1', 'abb_retention_days_reason': 'fact table. No pii' }) except: s_ut.my_print('ERROR: push to ' + table + ' failed for partition: ' + str(partition)) sys.exit() return file_out
def main(in_df, k_col, ts_list, top_ts, adjust_date): if k_col in ts_list: s_ut.my_print('ERROR: invalid k_col: ' + str(k_col) + ' cannot one of ' + str(ts_list) + ' has to be a DF column') sys.exit() adj_df_list = fcast_adj(k_col, adjust_date) if len(adj_df_list) > 0: fout = in_df.copy() for adj_df in adj_df_list: fout = merge_adj(fout, adj_df, k_col, ts_list, top_ts) p_ut.save_df(fout, '~/my_tmp/fout_' + k_col) return fout else: s_ut.my_print('WARNING: nothing to adjust for ' + k_col + ' <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') return in_df
def get_cfg_data(ts_name, cfg_cols, p_col): # read all the cfgs and set the cfg_idx t_name = 'sup.fct_cx_forecast_config' qry = 'select * from ' + t_name + ';' q_file = '/tmp/read_cfg_' + ts_name + '.hql' with open(q_file, 'w') as f: f.write(qry) s_ut.my_print('pid: ' + str(os.getpid()) + ' in query file: ' + q_file) fout = None ret = hql.run_hql((q_file, q_file), fout) if ret == -1: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: Query failed. No configs found') sys.exit() s_ut.my_print('pid: ' + str(os.getpid()) + ' fcast cfg file: ' + ret) cfg_df = p_ut.read_df(ret, sep='\t') p_ut.set_week_start(cfg_df, tcol='ds') # week_starting patch if cfg_df is None or len(cfg_df) == 0: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no data for query: ' + str(qry)) sys.exit() dcol = { x: x.replace(t_name.split('.')[-1] + '.', '') for x in cfg_df.columns } cfg_df.rename(columns=dcol, inplace=True) cfg_df['cutoff'] = pd.to_datetime(cfg_df['cutoff']) cfg_df = cfg_df[(cfg_df['ts_name'] == ts_name)] cfg_df = cfg_df[cfg_df[p_col] > 0.0].copy() cfg_df.fillna('None', inplace=True) cfg_df['cfg_str'] = cfg_df.apply( lambda x: json.dumps(x[cfg_cols].to_dict()), axis=1) z = cfg_df['cfg_str'].drop_duplicates() zf = pd.DataFrame(z) zf.reset_index(inplace=True, drop=True) zf.reset_index(inplace=True) zf.columns = ['cfg_idx', 'cfg_str'] df = cfg_df.merge(zf, on=['cfg_str'], how='left') df['language'].replace(['Mandarin_Offshore', 'Mandarin_Onshore'], 'Mandarin', inplace=True) # Mandarin need to be fixed later df.drop_duplicates(inplace=True) p_ut.save_df(df, '~/my_tmp/rk_df_' + ts_name) # df = p_ut.read_df('~/my_tmp/rk_df_' + ts_name) return df
def prepare_regressors(data_cfg, _cfg, d_cfg, cutoff_date, fcast_days, init_date='2016-01-01'): s_ut.my_print('************* reading regressors ********************') reg_cfg = data_cfg.get('regressors', None) if reg_cfg is None: return None arg_list = [[rname, rcfg, cutoff_date, fcast_days, init_date] for rname, rcfg in reg_cfg.items()] rf_list = s_ut.do_mp(prepare_regs, arg_list, is_mp=True, cpus=None, do_sigkill=True) arg_list, rcol_list = fcast_prep(rf_list, reg_cfg, cutoff_date, fcast_days, pd.to_datetime(init_date)) r_list = s_ut.do_mp(fcast_regressors, arg_list, is_mp=True, cpus=None, do_sigkill=True) r_list = list(filter(lambda x: x is not None, r_list)) # drop all Nones if any reg_fdf = merge_regressors( r_list, rcol_list) # merge all regressors in a single DF fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D') if reg_fdf is not None: p_ut.save_df(reg_fdf, '~/my_tmp/reg_df') s_ut.my_print('final predicted regressors: fcast date: ' + str(fcast_date.date()) + ' cutoff rate: ' + str(cutoff_date.date()) + ' fcast_days: ' + str(fcast_days) + ' gap: ' + str( max([ reg_fdf[reg_fdf['language'] == l] ['ds'].diff().dt.days.max() for l in reg_fdf['language'].unique() ])) + ' nulls: ' + str(sum([reg_fdf[c].isnull().sum() for c in rcol_list]))) else: s_ut.my_print('WARNING: no regressors available') return reg_fdf
def resi_fcast_(self, f_df, y_vals): # forecast the residuals from self by calling OneForecast again # prepare: first, inverse transform actuals and forecasted actuals back to the natural scale y = self.xform_obj.inverse_transform(y_vals['y'], 0.0, lbl='y') # inverse transformed actuals y_hat_x = f_df[(f_df['ds'] >= y_vals['ds'].min()) & (f_df['ds'] <= y_vals['ds'].max())].copy() # transformed forecasted actuals y_var = self.xform_obj.fcast_var(y_hat_x[['yhat_upper', 'yhat_lower']], self.prophet_obj.interval_width) # use the same var for upper and lower yhat = self.xform_obj.inverse_transform(y_hat_x['yhat'], y_var, lbl='yhat') # inverse transformed forecasted actuals try: res_df = pd.DataFrame({'ds': y_vals['ds'].values, 'y': y - yhat}) # residuals DF to forecast in natural scale except TypeError as msg: s_ut.my_print('WARNING: resi_fcast fails with msg: ' + str(msg)) print(y_hat_x.head()) print(y_vals.head()) return None if res_df['y'].isnull().sum() > 0.25 * len(res_df): # more than pct nulls s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: resi_fcast failed: too many NaNs in res_df: ' + str(res_df['y'].isnull().sum()) + ' nulls on ' + str(len(res_df)) + ' rows') print(res_df.head(10)) print(res_df.isnull().sum()) p_ut.save_df(res_df, '~/my_tmp/res_df') return None else: res_df['y'].interpolate(inplace=True, limit_direction='both') # ################################################################################################## # ################################### white noise test ############################################## # lags = min(10.0, len(y_vals) / 5.0) if self.f_cfg['y_mode'] is None else int(min(2.0 * 365 + 1.0, len(y_vals) / 5.0)) # is_wn = st_ut.white_noise_test(res_df['y'].values, p_thres=0.05, lags=int(lags), verbose=False) # white noise test # if is_wn is False: # residuals are not white noise # ################################################################################################## # ################################################################################################## f_cfg = copy.deepcopy(self.f_cfg) # use current cfg but adjust xform,do_res and regs f_cfg['xform'] = 'yeo-johnson' # residuals will be negative! f_cfg['do_res'] = False # no infinite recursion! f_cfg['r_mode'] = None # regs will probably not help here s_ut.my_print('pid: ' + str(os.getpid()) + ' resi_fcast: start recursive call: residual forecast') res_obj = OneForecast(res_df.copy(), None, copy.deepcopy(self.prophet_dict), f_cfg, self.horizon, time_scale=self.time_scale, verbose=self.verbose) ret = res_obj.forecast() # residuals forecast in natural scale if ret is None: ret = [None] s_ut.my_print('pid: ' + str(os.getpid()) + ' resi_fcast: end recursive call: residual forecast') return ret[0] # n_df
def lang_perf(lg, f_data, a_perf, this_cutoff, upr, lwr): p_ut.save_df(a_perf, '~/my_tmp/a_perf_' + lg) p_ut.save_df(f_data, '~/my_tmp/f_data_' + lg) d_list = list() a_perf.sort_values(by='a_err', inplace=True) a_perf.reset_index(inplace=True, drop=True) cfg, b_err = a_perf.loc[a_perf.index[0], ] # OMP # reg_omp = OMP('OMP', f_data, this_cutoff, upr, lwr) # _ = gen_output(reg_omp, d_list) # lasso reg_lasso = LassoRegressor('lasso', f_data, this_cutoff, upr, lwr) lasso_mask = gen_output(reg_lasso, d_list, b_err, append=False) # ENET verbose = False reg_enet = EnetOpt('enet', f_data, this_cutoff, upr, lwr, max_evals=100, verbose=verbose) enet_mask = gen_output(reg_enet, d_list, b_err, append=False) f_mask = enet_mask if np.sum(enet_mask) < len(f_data) else lasso_mask # Ridge reg_ridge = RidgeOpt('ridge', f_data, this_cutoff, upr, lwr, f_mask, reg_enet.ridge_par) _ = gen_output(reg_ridge, d_list, b_err, append=True) # 'XGBRegressor', 'AdaBoostRegressor', 'BaggingRegressor', 'GradientBoostingRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor' # ensembles with ENET selected features # for rname in ['XGBRegressor', 'AdaBoostRegressor']: # for loss_type in ['rel', 'abs']: # reg = EnsOpt(rname, fl, cutoff_date, upr, lwr, mask, loss_type=loss_type, max_evals=200) # d_err = reg.perf_err() # d_err['ens'] += '-' + loss_type # d_list.append(d_err) return d_list
def check_ratios(ts, a_df, f_df, is_dl, col): s_ut.my_print('ratio check to ' + str(ts) + ' and DL: ' + str(is_dl)) b_df = a_df[a_df['channel'] == 'directly'].copy( ) if is_dl is True else a_df[a_df['channel'] != 'directly'].copy() a_r = get_ratios(b_df, 'ticket_count', [col]) if len(b_df) > 0 else None g_df = f_df[f_df['channel'] == 'directly'].copy( ) if is_dl is True else f_df[f_df['channel'] != 'directly'].copy() f_r = get_ratios(g_df, ts + '_tilde', [col]) if len(g_df) > 0 else None p_ut.save_df(a_df, '~/my_tmp/a_df_' + ts + '_' + str(is_dl) + '_' + col) p_ut.save_df(f_df, '~/my_tmp/f_df_' + ts + '_' + str(is_dl) + '_' + col) if a_r is not None and f_r is not None: raf = a_r.merge(f_r, on=[col, 'language'], how='left') raf.columns = [col, 'language', 'a_ratio', 'f_ratio'] raf['diff'] = np.abs(raf['f_ratio'] - raf['a_ratio']) p_ut.save_df(raf, '~/my_tmp/raf_' + ts + '_' + str(is_dl) + '_' + col) if raf['diff'].min() > 1.0e-3: raf['ts_name'] = ts raf['is_dl'] = is_dl s_ut.my_print('WARNING: ' + col + ' ratios off <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') print(raf[raf['diff'] > 0]) elif (a_r is None and f_r is not None) or (a_r is not None and f_r is None): s_ut.my_print('ERROR: error in ratios for ' + ts) sys.exit() else: # nothing to check pass
def cfg_perf(fl, upr, lwr, cu_sun): fl.reset_index(inplace=True) lg, dim_cfg, ws = fl.loc[fl.index[0], ['language', 'dim_cfg', 'opt_shift']] fx = f_ok(fl, cu_sun - pd.to_timedelta(upr, unit='W'), cu_sun + pd.to_timedelta(upr, unit='W'), 'yhat') # check fcast OK at the real cutoff date if fx is None: s_ut.my_print('WARNING: invalid cfg ' + str(dim_cfg) + ' config for cutoff ' + str(cu_sun.date()) + ' and language ' + str(lg)) p_ut.save_df(fl, '~/my_tmp/fl_' + lg + '_' + str(dim_cfg)) return None else: # actual performance for this cfg fa = fx[(fx['ds'] >= cu_sun + pd.to_timedelta(lwr, unit='W')) & (fx['ds'] <= cu_sun + pd.to_timedelta(upr, unit='W'))].copy() a_err = st_ut.wmape(fa[['y', 'yhat']]) # training performance: use last year's data adjusted g_train = fx[(fx['ds'] >= cu_sun + pd.to_timedelta(lwr, unit='W')) & (fx['ds'] <= cu_sun + pd.to_timedelta(upr, unit='W'))].copy() t_err = st_ut.wmape(g_train[['adj_y_shifted', 'yhat']], y_col='adj_y_shifted') if len(g_train) > 0 else np.nan # "training" performance for this fcast cfg if np.isnan(t_err): s_ut.my_print('WARNING: cannot get training performance of config ' + str(dim_cfg) + ' for cutoff ' + str(cu_sun.date()) + ' and language ' + str(lg)) else: s_ut.my_print('OK: get performance of config ' + str(dim_cfg) + ' for cutoff ' + str(cu_sun.date()) + ' and language ' + str(lg)) fx['train_err'] = t_err fx['a_err'] = a_err return fx
def lognorm_qfit(a): # _ts_name, _ts_key = a[3] _ts_name, _ts_key = a[2] qd, avg, std = set_fit(a) # get quantile dict and data avg dict_out = _lognorm_qfit(qd, avg, std) if ( len(qd) > 0 and min(qd.values()) > 0.0 and avg > 0.0) else None if dict_out is None: s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: lognorm fit failed for ts_name: ' + str(_ts_name) + ' and ts_key: ' + str(_ts_key)) _ = p_ut.save_df( a[0], TempUtils.tmpfile('fit_df_' + _ts_name + '_' + _ts_key)) return None else: # fill output DF dict_out['ts_name'] = _ts_name dict_out['ts_key'] = _ts_key dict_out['dist_name'] = 'lognorm' dict_out['probs'] = [[1.0]] dict_out['pars'] = [[dict_out['sh'][0], dict_out['sc'][0]]] _ = [dict_out.pop(x, None) for x in ['sc', 'sh', 'prob']] return pd.DataFrame(dict_out)
def basic_perf(ts_name, cutoff_date, upr, lwr, init_date='2016-01-01', time_scale='W'): ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale) # actuals actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols) actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True) actuals_df.drop_duplicates(inplace=True) # not sure why there are dups. Table problem? # forecasts f_df = get_lang_fcast(ts_cfg, cutoff_date, eq=True) f_df.drop_duplicates(inplace=True) # not sure why there are dups. Table problem? f_idx = fcast_idx(f_df) f_df = f_df.merge(f_idx, on='dim_cfg', how='left') f_df.drop('dim_cfg', axis=1, inplace=True) f_df.rename(columns={'index': 'dim_cfg'}, inplace=True) df = f_df.merge(actuals_df, on=['ds', 'language'], how='left') df.drop('cutoff', axis=1, inplace=True) df.drop_duplicates(inplace=True) # not sure why there are dups p_ut.save_df(df, '~/my_tmp/df_all') # perf cu_sun = cutoff_date - pd.to_timedelta(6, unit='D') sf = df[['ds', 'language', 'y']].drop_duplicates() sf.dropna(inplace=True) f_shift = sf.groupby('language').apply(get_shift, cu_sun=cu_sun, upr=upr, lwr=lwr).reset_index() # find best shift nf = df.merge(f_shift, on=['ds', 'language', 'y'], how='left') nf.set_index(['language', 'dim_cfg'], inplace=True) # avoids drop of nuisance cols nf.dropna(inplace=True) nf.drop_duplicates(inplace=True) zperf = nf.groupby(['language', 'dim_cfg']).apply(cfg_perf, upr=upr, lwr=lwr, cu_sun=cu_sun).reset_index(drop=True) # do not groupby df.index zperf.dropna(inplace=True) p_ut.save_df(zperf, '~/my_tmp/zperf_' + ts_name + '_' + str(cutoff_date.date())) nf.reset_index(inplace=True) p_ut.save_df(nf, '~/my_tmp/nf') fout = nf.merge(zperf, on=list(nf.columns), how='left') return actuals_df, fout
np.sum }).reset_index() g_df['cx_yhat'] = g_df['w_ib_aht'] / g_df[ 'IB Offered'] if 'inbound' in ts_name else g_df['w_ob_aht'] / g_df[ 'OB Offered'] else: print('invalid ts_name: ' + str(ts_name)) # language level errors weekly wf = g_df.merge(a_df, on=['ds_week_ending', 'language'], how='inner') wf['cx_err'] = 2.0 * np.abs(wf['cx_yhat'] - wf['y']) / (wf['cx_yhat'] + wf['y']) wf['ts_name'] = ts_name wf['cutoff_date'] = cutoff_date wf['horizon_date'] = str(horizon_date.date()) p_ut.save_df(wf, '~/my_tmp/cx_fcast_weekly_err_' + ts_name + '_' + cutoff_date) # summary by language wf['x'] = wf['cx_err'] * wf[w_col] lf = wf.groupby('language').agg({'x': np.sum, w_col: np.sum}).reset_index() lf['cx_err'] = lf['x'] / lf[w_col] all_err = (lf[w_col] * lf['cx_err']).sum() / lf[w_col].sum() adf = pd.DataFrame({'language': ['All'], 'cx_err': [all_err]}) lf = lf[['language', 'cx_err']].copy() fdf = pd.concat([lf, adf], axis=0) fdf['ts_name'] = ts_name fdf['cutoff_date'] = cutoff_date fdf['horizon_date'] = str(horizon_date.date()) p_ut.save_df(fdf, '~/my_tmp/cx_fcast_err_' + ts_name + '_' + cutoff_date) print('horizon date: ' + str(horizon_date)) print(fdf)
def main(argv): # ########################### # parameters # ########################### time_scale = 'W' # forecasting time scale reset for daily ticket data init_date = pd.to_datetime('2016-01-01') upr = 12 lwr = 8 evals = 50 by_lang = False # ########################### # ########################### # ########################### print(argv) if len(argv[1:]) == 1: ts_name = argv[-1] cutoff_date = pd.to_datetime('today') to_table = False elif len(argv[1:]) == 2: ts_name, cutoff_date = argv[1:] try: cutoff_date = pd.to_datetime(cutoff_date) to_table = False except ValueError: s_ut.my_print( 'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' + str(argv)) sys.exit() elif len(argv[1:]) == 3: ts_name, cutoff_date, to_table = argv[1:] try: cutoff_date = pd.to_datetime(cutoff_date) to_table = bool(int(to_table)) except ValueError: s_ut.my_print( 'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' + str(argv)) sys.exit() else: s_ut.my_print( 'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' + str(argv)) sys.exit() ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale) # actuals actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols) actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True) # forecasts f_df = fp.get_lang_fcast(ts_cfg, cutoff_date) fcast_date = cutoff_date + pd.to_timedelta(upr, unit=time_scale) perf_list = list() for xens in [ 'XGBRegressor', 'AdaBoostRegressor', 'BaggingRegressor', 'GradientBoostingRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor', 'lasso' ]: fcast_df = ep.make_fcast(ts_name, f_df, actuals_df, cutoff_date, fcast_date, xens, evals, by_lang, (lwr, upr), lwr=lwr, upr=upr) perf_df = perf.fcast_perf(fcast_df, actuals_df, cutoff_date, lwr, upr, time_scale, xens) if perf_df is None: s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: forecast performance detail failed for ' + ts_name + ' ,cutoff date ' + str(cutoff_date.date()) + ' and ensemble: ' + str(xens)) else: perf_df['ts_name'] = ts_name perf_list.append(perf_df) if len(perf_list) > 0: pf = pd.concat(perf_list, axis=0) p_ut.save_df( pf, '~/my_tmp/perf/fcast_perf_detail_' + ts_name + '_' + str(cutoff_date.date())) if to_table is True: tab_cols = ['language', 'y', 'yhat', 'err', 'lwr', 'upr', 'ens'] partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_name} ret = hql.to_tble(pf, tab_cols, 'sup.cx_language_forecast_performance_detail', partition) if ret != 0: s_ut.my_print( 'pid: ' + str(os.getpid()) + ' ERROR: forecast performance detail failed for ' + ts_name + ' and cutoff date ' + str(cutoff_date.date())) sys.exit() print('DONE') else: s_ut.my_print( 'pid: ' + str(os.getpid()) + ' ERROR: no data for forecast performance detail failed for ' + ts_name + ' and cutoff date ' + str(cutoff_date.date())) sys.exit()
def main(argv): print(argv) if len(argv) == 2: p_name = argv[ 1] # at least 3 days after last Saturday with actual data if 'not-' in p_name: print('invalid series name: ' + str(argv)) sys.exit() else: print('invalid args: ' + str(argv)) sys.exit() # ########################### # parameters # ########################### lwr = 9 upr = 12 time_scale = 'W' # forecasting time scale reset for daily ticket data init_date = pd.to_datetime('2016-01-01') froot = os.path.expanduser('~/my_tmp/fbp/') # max_int = 2 if time_scale == 'W' else (5 if time_scale == 'D' else None) # if max_int is None: # s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: unsupported time scale: ' + str(time_scale)) # sys.exit() # ########################### # ########################### # set complementary TS vname = p_name.split('_') # eg ts, bu if len(vname) == 2: b_name = vname[0] bu = vname[1] n_name = b_name + '_not-' + bu ts_list = [p_name, n_name, b_name] elif len(vname) == 1: ts_list = [p_name] b_name, n_name = None, None else: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: invalid ts name: ' + p_name) sys.exit() # forecasts fcast_df = get_fcasts(ts_list, froot) p_ut.save_df(fcast_df, '~/my_tmp/fcast_df') if fcast_df is None: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no fcast data for ' + p_name) sys.exit() else: # actuals a_df = get_actuals(ts_list, init_date, time_scale) if a_df is not None: ts_adf = a_df[a_df['ts_name'] == p_name].copy() p_ut.save_df(ts_adf, '~/my_tmp/ts_adf') p_list = list() for cu, f_cu in fcast_df.groupby('cutoff'): fcast_date = f_cu['ds'].max() cu -= pd.to_timedelta( (1 + cu.weekday()) % 7, unit='D') # move cutoff to week starting Sunday # performance bounds upr_horizon = min(fcast_date, cu + pd.to_timedelta(upr, unit=time_scale)) lwr_horizon = cu + pd.to_timedelta(lwr, unit=time_scale) f_cu.drop(['cutoff', 'ts_name'], axis=1, inplace=True) m_fcast = merge_neg(f_cu, b_name, p_name, n_name, 'yhat') m_fcast.drop_duplicates(inplace=True) perf_df = fcast_perf(m_fcast, ts_adf, lwr_horizon, upr_horizon) perf_df['ts_name'] = p_name perf_df['cutoff'] = cu p_list.append(perf_df) if len(p_list) > 0: perf_out = pd.concat(p_list, axis=0) fname = froot + 'ens_perf_' p_ut.save_df(perf_out, fname + p_name) else: s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: perf_df not available') else: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no actuals data for ' + p_name) sys.exit() print('DONE')
def main(argv): # ########################### # parameters # ########################### time_scale = 'W' # forecasting time scale reset for daily ticket data init_date = pd.to_datetime('2016-01-01') froot = '~/my_tmp/fbp/' # ########################### # ########################### print(argv) if len(argv) == 2: ts_name = argv[-1] to_table = False run_date = pd.to_datetime('today') elif len(argv) == 3: ts_name, run_date = argv[-2:] try: run_date = pd.to_datetime(run_date) to_table = False except ValueError: s_ut.my_print( 'ERROR: invalid arguments (ts_name, run_date, to_table): ' + str(argv)) sys.exit() elif len(argv) == 4: ts_name, run_date, to_table = argv[1:] try: run_date = pd.to_datetime(run_date) to_table = bool(int(to_table)) except ValueError: s_ut.my_print( 'ERROR: invalid arguments (ts_name, run_date, to_table): ' + str(argv)) sys.exit() else: s_ut.my_print( 'ERROR: invalid arguments (ts_name, run_date, to_table): ' + str(argv)) sys.exit() # data cfg cutoff_date = tm_ut.get_last_sat( run_date ) # set to last saturday before run_date or the run_date if a saturday ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale) FCAST_DICT['outlier_coef'] = ts_cfg.get('outlier_coef', [3.0]) fcast_days = ts_cfg.get('fcast_days', None) if fcast_days is None: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR" fcast_days must be specified in data_cfg') sys.exit() else: fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D') if time_scale == 'W': fcast_date = fcast_date - pd.to_timedelta( 1 + fcast_date.weekday(), unit='D') # set to week starting Sunday cu = cutoff_date - pd.to_timedelta( 1 + cutoff_date.weekday(), unit='D') # set to week starting Sunday fcast_days = (fcast_date - cu).days # multiple of 7 upr_horizon = int(fcast_days / 7) # in time scale units elif time_scale == 'D': upr_horizon = int(fcast_days) # in time scale units else: s_ut.my_print('pid: ' + str(os.getpid()) + ' invalid time scale: ' + str(time_scale)) sys.exit() s_ut.my_print('pid: ' + str(os.getpid()) + ' ------------------------ start language forecast for ' + str(ts_name) + ' from cutoff date ' + str(cutoff_date.date()) + ' (excluded) to forecast date ' + str(fcast_date.date()) + ' (included) -----------------------') # get actuals actuals_df = dp.ts_actuals( ts_name, ts_cfg, cols) # may have data past cutoff for accuracy checking if actuals_df['ds'].max() < cutoff_date: s_ut.my_print( 'ERROR: no actuals available for forecast from cutoff date: ' + str(cutoff_date.date())) sys.exit() f_actuals_df = actuals_df[actuals_df['ds'] <= cutoff_date].copy( ) # actuals for forecast: only use up to cutoff date # adjust FCAST_DICT if len(FCAST_DICT['do_res']) == 2: # True, False FCAST_DICT['do_res'] = [ True ] # MUST overwrite: the False care is always included and otherwise we double count. if len(ts_cfg.get('regressors', list())) == 0: FCAST_DICT['r_mode'] = [None] reg_dict = dict() else: reg_dict = regs.ens_fcast( ts_name, ts_cfg['regressors'], cutoff_date, time_scale, fcast_days, init_date, f_actuals_df) # stored by cutoff date on last Sat of the month # update init_date init_date = max([f_actuals_df['ds'].min()] + [f['ds'].min() for f in reg_dict.values()]) f_actuals_df = f_actuals_df[f_actuals_df['ds'] >= init_date].copy() reg_dict = { lx: f[f['ds'] >= init_date].copy() for lx, f in reg_dict.items() } ts_cfg['init_date'] = init_date # set the list of fcast cfgs tlist = get_f_cfg(FCAST_DICT, cutoff_date, init_date, time_scale) # list of fcast cfg's fix_pars = [ f_actuals_df, ts_name, reg_dict, fcast_date, cutoff_date, ts_cfg, time_scale, upr_horizon ] arg_list = [ fix_pars + [tlist[ix]] for ix in range(len(tlist)) ] # 2 fcasts are done per input cfg (do_res = true and do_res = false) n_fcfg = 2 * len(arg_list) s_ut.my_print('pid: ' + str(os.getpid()) + ' ++++++++ there are ' + str(n_fcfg) + ' fcast configs per language **********') # ############################################################################### # ############################################################################### # ############################################################################### if is_test: df_list_ = s_ut.do_mp(fcast_lang, arg_list, is_mp=False, cpus=None, do_sigkill=True) else: df_list_ = s_ut.do_mp(fcast_lang, arg_list, is_mp=True, cpus=None, do_sigkill=True) # ############################################################################### # ############################################################################### # ############################################################################### # join all the fcasted data into a flat list df_list = [f for f in df_list_ if f is not None] if len(df_list) > 0: ylist, alist = list(), list() for fl in df_list: if fl is not None: fl = set_cfg(fl.copy(), CFG_COLS) ylist.append(fl[[ 'ds', 'language', 'yhat', 'ts_name', 'cutoff', 'dim_cfg', 'fcast_date' ]].copy()) alist.append(fl) # save basic fcast data fcast_df = pd.concat( ylist, axis=0) # now all the list elements have the same columns fcast_df.reset_index(inplace=True, drop=True) ok_cfg = fcast_df['dim_cfg'].unique() s_ut.my_print('pid: ' + str(os.getpid()) + str(len(ok_cfg)) + ' forecasts cfgs available for ' + str(ts_name) + ' from cutoff date ' + str(cutoff_date.date()) + ' (excluded) to forecast date ' + str(fcast_date.date()) + ' (included) -----------------------') # fcast_df = fcast_df[fcast_df['dim_cfg'].isin(ok_cfg)].copy() fname = froot + 'lang_fcast_' p_ut.save_df(fcast_df, fname + ts_name + '_' + str(cutoff_date.date())) if to_table is True: tab_cols = ['ds', 'language', 'dim_cfg', 'yhat'] partition = { 'cutoff': str(cutoff_date.date()), 'ts_name': ts_cfg['ts_key'] } ret = hql.to_tble(fcast_df, tab_cols, 'sup.cx_language_forecast', partition) if ret != 0: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no forecasts loaded to table for ' + str(ts_cfg['ts_key']) + ' and cutoff date ' + str(cutoff_date.date())) sys.exit() # save all fcast data (y_upr, y_lwr, ...) all_df = pd.concat( alist, axis=0) # now all the list elements have the same columns all_cols = list(set([c for c in all_df.columns if c not in CFG_COLS])) all_df.reset_index(inplace=True, drop=True) all_df = all_df[all_cols].copy() all_df = all_df[all_df['dim_cfg'].isin(ok_cfg)].copy() fname = froot + 'fcast_all_' p_ut.save_df(all_df, fname + ts_name + '_' + str(cutoff_date.date())) if to_table is True: all_df.drop(['cutoff', 'ts_name'], axis=1, inplace=True) mf = pd.melt(all_df, id_vars=['ds', 'language', 'dim_cfg'], var_name='key', value_name='value') mf.dropna(subset=['value'], inplace=True) mf = mf[mf['value'] != 0.0].copy() partition = { 'cutoff': str(cutoff_date.date()), 'ts_name': ts_cfg['ts_key'] } ret = hql.to_tble(mf, list(mf.columns), 'sup.cx_language_forecast_detail', partition) if ret != 0: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no forecasts loaded to table for ' + str(ts_cfg['ts_key']) + ' and cutoff date ' + str(cutoff_date.date())) sys.exit() print('DONE') else: s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no forecasts available for ' + str(ts_cfg['ts_key']) + ' from cutoff date ' + str(cutoff_date.date()) + ' (excluded) to forecast date ' + str(fcast_date.date()) + ' (included) -----------------------')
def to_excel_(xl_file, cutoff_date_, curr_adj_obj, prev_adj_obj, target_year, lang_errs, tier_errs, lact_df): """ build excel output and save it to xls file :param xl_file: out file :param cutoff_date_: current cutoff date :param curr_adj_obj: current fcast adjusted obj :param prev_adj_obj: prior fcast adjusted obj :param lang_errs: lang errs of the fcast 3 months back :param tier_errs: tier errs between the fcast 3 months back :param lact_df: lang level actuals 16 weeks back :return: _ """ # prepare excel output: xls output is adjusted. par output is Not fx = os.path.expanduser(xl_file) s_ut.my_print('xl file: ' + fx) xlwriter = pd.ExcelWriter(fx, engine='xlsxwriter') # pull out actuals date indicator act_df = curr_adj_obj.data[['ds', 'is_actual']].copy() act_df['ds'] = act_df['ds'].dt.date.astype(str) act_df.set_index('ds', inplace=True) p_ut.save_df(act_df, '~/my_tmp/act_df') act_dates = list(act_df[act_df['is_actual'] == 1].index) curr_fcast = curr_adj_obj.data # current forecast prev_fcast = prev_adj_obj.data # previous forecast # fcast delta fcast_delta(xlwriter, curr_fcast, prev_fcast, 'Forecast Delta') for bu in ['Homes', 'Experiences', 'China']: c_fcast = curr_fcast[curr_fcast['business_unit'] == bu].copy() p_fcast = prev_fcast[prev_fcast['business_unit'] == bu].copy() fcast_delta(xlwriter, c_fcast, p_fcast, bu + ' Forecast Delta') # target year totals by language-tier-channel fcast_totals(xlwriter, curr_fcast, target_year, ' Totals') for bu in ['Homes', 'Experiences', 'China']: c_fcast = curr_fcast[curr_fcast['business_unit'] == bu].copy() c_fcast = c_fcast[(c_fcast['ds'] >= '2019-12-28') & (c_fcast['ds'] <= '2021-01-03')].copy() fcast_totals(xlwriter, c_fcast, target_year, ' ' + bu + ' Totals') # language errors wrt actuals if lang_errs is not None: lang_errs.fillna(0, inplace=True) # lang_errs.rename(columns={'language': 'language'}, inplace=True) lang_errs['actual_count'] = np.round(lang_errs['actual_count'], 0) lang_errs['forecasted_count'] = np.round(lang_errs['forecasted_count'], 0) lang_errs.to_excel(xlwriter, 'Language Accuracy', index=False) workbook = xlwriter.book bold = workbook.add_format({'bold': True}) worksheet = xlwriter.sheets['Language Accuracy'] worksheet.set_column('A:ZZ', 20) worksheet.set_row(0, None, bold) format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'}) worksheet.conditional_format(0, 0, 0, len(lang_errs.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'}) else: s_ut.my_print('No data for ' + 'language accuracy') # tier errors wrt actuals if tier_errs is not None: tier_errs.fillna(0, inplace=True) # tier_errs.rename(columns={'language': 'language', 'service_tier': 'service tier'}, inplace=True) tier_errs['actual_count'] = np.round(tier_errs['actual_count'], 0) tier_errs['forecasted_count'] = np.round(tier_errs['forecasted_count'], 0) tier_errs.to_excel(xlwriter, 'Tier Accuracy', index=False) workbook = xlwriter.book bold = workbook.add_format({'bold': True}) worksheet = xlwriter.sheets['Tier Accuracy'] worksheet.set_column('A:ZZ', 20) worksheet.set_row(0, None, bold) format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'}) worksheet.conditional_format(0, 0, 0, len(tier_errs.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'}) else: s_ut.my_print('No data for ' + 'tier accuracy') # channel mix (not very practical to transpose dates as there are 4 values per date) cdf = curr_adj_obj.data[curr_adj_obj.data['ds'] > cutoff_date_].copy() if cdf is not None: cdf['ds'] = cdf['ds'].dt.date.astype(str) cdf['ticket_count'] = np.round(cdf['ticket_count'].astype(float).values, 0) p_ut.save_df(cdf, '~/my_tmp/cdf') channels = cdf['channel'].unique() pmix_df = pd.pivot_table(cdf, index=['business_unit', 'language', 'service_tier', 'ds'], columns='channel', values='ticket_count') qmix = pmix_df.div(pmix_df.sum(axis=1), axis=0).reset_index() qmix.fillna(0, inplace=True) for c in channels: qmix[c] = qmix[c].apply(lambda x: np.round(100.0 * x, 1) if isinstance(x, (float, np.float64, int, np.int64)) else x) dcol = {c: c + '(%)' for c in channels} qmix.rename(columns=dcol, inplace=True) qmix.to_excel(xlwriter, 'Channel Mix', index=False) workbook = xlwriter.book bold = workbook.add_format({'bold': True}) worksheet = xlwriter.sheets['Channel Mix'] worksheet.set_column('A:ZZ', 20) worksheet.set_row(0, None, bold) format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'}) worksheet.conditional_format(0, 0, 0, len(qmix.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'}) else: s_ut.my_print('No data for ' + 'channel mix') # language actuals if lact_df is not None: lact_df['actual_count'] = np.round(lact_df['actual_count'].astype(float).values, 0) # language level actuals lact_df['ds'] = lact_df['ds'].dt.date lact_df.to_excel(xlwriter, 'Language Level Actuals', index=False) workbook = xlwriter.book bold = workbook.add_format({'bold': True}) worksheet = xlwriter.sheets['Language Level Actuals'] worksheet.set_column('A:Z', 20) worksheet.set_row(0, None, bold) format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'}) worksheet.conditional_format(0, 0, 0, len(lact_df.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'}) else: s_ut.my_print('No data for ' + ' language level actuals') # add the rest of the tabs if curr_adj_obj.data is not None: counts_dict = rs_to_excel(curr_adj_obj.data, 'ds') for ky, sht in counts_dict.items(): sht.fillna(0, inplace=True) sht = sht.applymap(lambda x: np.round(x, 0) if isinstance(x, (float, np.float64, int, np.int64)) else x) tab_name = ky.replace('/', '-') sht.to_excel(xlwriter, tab_name, index=False) set_colors(xlwriter, tab_name, sht, act_dates) workbook = xlwriter.book bold = workbook.add_format({'bold': True}) worksheet = xlwriter.sheets[tab_name] worksheet.set_column('B:ZZ', 20, bold) worksheet.set_column('A:A', 20, bold) else: s_ut.my_print('No data for ' + 'forecasts') s_ut.my_print('saving xls forecast to ' + fx) xlwriter.save()
# ######################### actuals_weeks = 12 target_year = 2020 init_date = pd.to_datetime('2016-01-01') time_scale = 'W' ts_name = 'ticket_count' # ######################### # get actuals ts_cfg, _ = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale) adf = dp.ts_actuals(ts_name, ts_cfg, ['language', 'business_unit', 'channel', 'service_tier'], drop_cols=False, use_cache=False) start, end = tm_ut.iso_dates(target_year - 1) adf = adf[adf['ds'] >= start] # use for YoY comparison adf.rename(columns={'ticket_count': 'y'}, inplace=True) p_ut.save_df(adf, '~/my_tmp/a_df') # current forecast s_ut.my_print('current forecast') cf_df = pd.concat([get_fcast(ts_name, ts_cfg, cutoff_date, bu, use_cache=False) for bu in ts_cfg['business_units']], axis=0) cf_df = cf_df[cf_df['ds'] > cutoff_date].copy() p_ut.save_df(cf_df, '~/my_tmp/cf_df') # target year summary xl_ut.year_summary(adf, cf_df, target_year, cutoff_date) # get 90 day old forecast (last Sat of a month) s_ut.my_print('3 months old forecast') date = pd.to_datetime(str(cutoff_date.year) + '-' + str(cutoff_date.month) + '-01') cu90 = tm_ut.last_saturday_month(date - pd.to_timedelta(3, unit='M')) f90_df = pd.concat([get_fcast(ts_name, ts_cfg, cu90, bu, use_cache=False) for bu in ts_cfg['business_units']], axis=0)