def lasso_vble_sel_results(fdf, a_ddf, cutoff_date_, normalize, reg_mdl):
    """Find an initial cfg with Lasso and improve it with variable selection.

    A Lasso selection is run per language on the data up to ``cutoff_date_``.
    Its output is then used as the starting point of a forward ('fwd') and a
    backward ('bwd') variable selection pass.

    :param fdf: forecasts DF; the columns 'ds', 'yhat', 'cfg_idx' and
        'language' are read here
    :param a_ddf: actuals DF; the columns 'ds', 'language' and 'y' are read here
    :param cutoff_date_: rows with ``ds`` after this date are excluded from
        the Lasso step
    :param normalize: passed through to ``lasso_selection``
    :param reg_mdl: passed through to ``process_mdl_args``
    :return: DF built from the non-None results of both variable selection
        passes, with rows containing nulls dropped and a fresh index
    """
    # one worker per language in every parallel step below
    pool_opts = {'is_mp': True, 'cpus': fdf['language'].nunique(), 'do_sigkill': True}

    # --- step 1: Lasso per language on data up to the cutoff date
    train_fcasts = fdf[fdf['ds'] <= cutoff_date_].copy()
    train_actuals = a_ddf[a_ddf['ds'] <= cutoff_date_][['ds', 'language', 'y']].copy()
    lasso_args = list()
    for lang, lang_fcasts in train_fcasts[['ds', 'yhat', 'cfg_idx', 'language']].groupby('language'):
        lang_actuals = train_actuals[train_actuals['language'] == lang].copy()
        lasso_args.append([lang, lang_fcasts, lang_actuals, normalize])
    lasso_out = s_ut.do_mp(lasso_selection, lasso_args, **pool_opts)

    # {lang: {'alpha': ..., 'cfg_list': [...]}, ..}
    lasso_dict = pd.DataFrame(lasso_out).set_index('language').to_dict(orient='index')

    # --- step 2: improve the best Lasso cfg by variable selection, starting from the initial Lasso cfg
    base_args = process_mdl_args(fdf.copy(), a_ddf.copy(), cutoff_date_, lasso_dict, reg_mdl)
    score_rows = list()
    for direction in ('fwd', 'bwd'):
        dir_args = [[direction] + a for a in base_args]
        nested = s_ut.do_mp(variable_selection, dir_args, **pool_opts)  # list of lists
        for batch in nested:  # flatten, skipping empty results
            score_rows.extend(d for d in batch if d is not None)

    return pd.DataFrame(score_rows).dropna().reset_index(drop=True)
def ens_fcast(fdf, adf, cutoff_date, g_cols, b_cols, normalize=True):
    """Build ensemble forecasts per group with ``t_ut.lasso_selection``.

    The forecasts and actuals are split at ``cutoff_date``: rows up to the
    cutoff (and not earlier than the first date present in both inputs) are
    handed to the workers together with the forecast rows after the cutoff.
    One worker call is made for each value of the grouping columns
    ``b_cols + g_cols``.

    :param fdf: forecasts DF; must contain 'ds', 'ticket_count' and the
        grouping columns ('cfg_idx' is expected from ``t_ut.set_cfg_idx``)
    :param adf: actuals DF with 'ds', 'ticket_count' and the grouping columns
    :param cutoff_date: last date (included) of the training window
    :param g_cols: list of grouping columns
    :param b_cols: list of grouping columns, placed before ``g_cols``
    :param normalize: passed through to ``t_ut.lasso_selection``
    :return: DF concatenated from the workers' 'res' entries, without nulls,
        restricted to positive 'y_pred' values rounded to whole numbers
    :raises ValueError: (from ``pd.concat``) when no worker returns a result
    """
    fdf_idx = t_ut.set_cfg_idx(fdf.copy())
    t_start = max(adf['ds'].min(), fdf_idx['ds'].min())
    cols = b_cols + g_cols

    t_fdf = fdf_idx[(fdf_idx['ds'] <= cutoff_date) & (fdf_idx['ds'] >= t_start)].copy()
    t_adf = adf[(adf['ds'] <= cutoff_date) & (adf['ds'] >= t_start)][['ds', 'ticket_count'] + cols].copy()
    v_fdf = fdf_idx[fdf_idx['ds'] > cutoff_date].copy()

    g_adf_dict = {gc: fgc for gc, fgc in t_adf.groupby(cols)}
    g_fdf_dict = {gc: fgc for gc, fgc in v_fdf.groupby(cols)}
    arg_list = [
        [l, lf, g_adf_dict.get(l, None), g_fdf_dict.get(l, None), 'ticket_count', cols, normalize]
        for l, lf in t_fdf[['ds', 'ticket_count', 'cfg_idx'] + cols].groupby(cols)
    ]
    d_list = s_ut.do_mp(t_ut.lasso_selection, arg_list, is_mp=True, cpus=None, do_sigkill=True)

    # a worker that produced nothing must not take the whole run down
    f_all = pd.concat([d['res'] for d in d_list if d is not None], axis=0)
    f_all.dropna(inplace=True)
    # copy the filtered frame: assigning into a boolean-mask slice is chained assignment
    f_all = f_all[f_all['y_pred'] > 0].copy()
    f_all['y_pred'] = np.round(f_all['y_pred'].values, 0)
    return f_all
def prepare_regressors(data_cfg, _cfg, d_cfg, cutoff_date, fcast_days, init_date='2016-01-01'):
    """Read, forecast and merge the regressors listed in ``data_cfg['regressors']``.

    :param data_cfg: dict; only its 'regressors' entry ({name: cfg}) is read
    :param _cfg: not used in this function
    :param d_cfg: not used in this function
    :param cutoff_date: cutoff timestamp (``.date()`` is called on it for logging)
    :param fcast_days: forecast horizon in days
    :param init_date: initial date, handed as given to ``prepare_regs`` and
        as a timestamp to ``fcast_prep``
    :return: the merged regressors DF, or None when no regressors are
        configured or the merge yields nothing
    """
    s_ut.my_print('************* reading regressors ********************')
    reg_cfg = data_cfg.get('regressors', None)
    if reg_cfg is None:
        return None

    pool_opts = {'is_mp': True, 'cpus': None, 'do_sigkill': True}

    # read each configured regressor
    read_args = list()
    for reg_name, one_cfg in reg_cfg.items():
        read_args.append([reg_name, one_cfg, cutoff_date, fcast_days, init_date])
    raw_regs = s_ut.do_mp(prepare_regs, read_args, **pool_opts)

    # forecast the regressors and keep only the ones that returned something
    fcast_args, rcol_list = fcast_prep(raw_regs, reg_cfg, cutoff_date, fcast_days, pd.to_datetime(init_date))
    fcast_out = s_ut.do_mp(fcast_regressors, fcast_args, **pool_opts)
    usable = [r for r in fcast_out if r is not None]

    reg_fdf = merge_regressors(usable, rcol_list)  # all regressors in a single DF
    fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    if reg_fdf is None:
        s_ut.my_print('WARNING: no regressors available')
        return reg_fdf

    p_ut.save_df(reg_fdf, '~/my_tmp/reg_df')

    # summary for the log: largest date gap (in days) per language and total nulls
    lang_gaps = list()
    for lang in reg_fdf['language'].unique():
        lang_ds = reg_fdf[reg_fdf['language'] == lang]['ds']
        lang_gaps.append(lang_ds.diff().dt.days.max())
    null_count = sum(reg_fdf[c].isnull().sum() for c in rcol_list)
    s_ut.my_print(''.join([
        'final predicted regressors: fcast date: ', str(fcast_date.date()),
        ' cutoff rate: ', str(cutoff_date.date()),
        ' fcast_days: ', str(fcast_days),
        ' gap: ', str(max(lang_gaps)),
        ' nulls: ', str(null_count),
    ]))
    return reg_fdf
def best_regression_cfg(X_train, y_train, X_test, y_test, y_perf, n_good, topN_list, obj_list, used_cpus=0):
    """Pick the most common AdaBoost cfg among the best ones for each objective.

    Every combination of the AdaBoost hyper-parameters below is evaluated by
    ``ab_func`` (in parallel). For each objective in ``obj_list`` the
    ``n_good`` rows with the smallest objective value are kept and the mode of
    each hyper-parameter (and of 'topN') among them is reported.

    :param X_train, y_train, X_test, y_test, y_perf: passed through to ``ab_func``
    :param n_good: nbr of 'good' AdaBoost cfgs to take the mode over
    :param topN_list: passed through to ``ab_func``
    :param obj_list: loss functions (result columns) to select on
    :param used_cpus: passed through to ``s_ut.do_mp``
    :return: list of dicts, one per usable objective, with the keys in
        ``ab_cols`` plus 'topN' and 'obj'. Objectives whose best value is
        infinite, or with no rows, are skipped.
    """
    ab_cols = ['adb_estimators', 'max_depth', 'learning_rate', 'loss', 'min_samples_split']  # AdaBoost cfg
    estimators_list = [25, 50, 100, 200]
    depth_list = [4, 8, 12, 16]
    learn_rate = [0.5, 1, 1.5, 2]
    min_samples_split_list = [2, 4, 8, 12]
    loss_list = ['linear', 'square', 'exponential']

    fixed_args = [X_train, y_train, X_test, y_test, y_perf, topN_list]
    ab_cfgs = [
        list(x) + fixed_args
        for x in itertools.product(estimators_list, depth_list, learn_rate, min_samples_split_list, loss_list)
    ]
    f_list_ = s_ut.do_mp(ab_func, ab_cfgs, is_mp=True, cpus=None, do_sigkill=True, verbose=False, used_cpus=used_cpus)

    # flatten the per-worker lists, dropping empty results
    f = pd.DataFrame([x for l in f_list_ for x in l if len(x) > 0])
    if f.empty:  # no worker returned anything: nothing to select from
        return list()

    d_list = list()
    for obj in obj_list:  # for each obj func choose the best regressor
        fad = f.nsmallest(n=n_good, columns=[obj])
        # np.isinf returns numpy.bool_, so it must be tested by truth value:
        # an identity check against True never matches
        if fad.empty or np.isinf(fad[obj].min()):
            continue
        d_adb = {c: fad[c].mode().values[0] for c in ab_cols}  # take the most common among the top n_good
        d_adb['topN'] = fad['topN'].mode().values[0]
        d_adb['obj'] = obj
        d_list.append(d_adb)
    return d_list
def prepare_regressors(data_cfg, _cfg, d_cfg, cutoff_date, fcast_days, int_type, init_date='2016-01-01'):
    """Prepare the regressors listed in ``data_cfg['regressors']`` and merge them.

    :param data_cfg: dict; only its 'regressors' entry ({name: cfg}) is read
    :param _cfg: not used in this function
    :param d_cfg: not used in this function
    :param cutoff_date: passed through to ``prepare_regs``
    :param fcast_days: passed through to ``prepare_regs``
    :param int_type: passed through to ``prepare_regs``
    :param init_date: initial date; converted with ``pd.to_datetime`` before use
    :return: whatever ``regressors.Regressor.merge_regressors`` returns for
        the prepared regressors (None means no regressors are available), or
        None when no regressors are configured
    """
    s_ut.my_print('************* reading regressors ********************')
    reg_cfg = data_cfg.get('regressors', None)
    if reg_cfg is None:
        return None

    start_ts = pd.to_datetime(init_date)
    worker_args = list()
    for reg_name, one_cfg in reg_cfg.items():
        worker_args.append([reg_name, one_cfg, cutoff_date, fcast_days, int_type, start_ts])

    # the workers return the list of regressor obj's
    reg_objs = s_ut.do_mp(prepare_regs, worker_args, is_mp=True, cpus=None, do_sigkill=True)

    # merge all regressors in a single DF
    merged = regressors.Regressor.merge_regressors(reg_objs)
    if merged is None:
        s_ut.my_print('WARNING: no regressors available')
    return merged
def main(ts_name, cutoff_date, cfg_cols, to_db=True, df_cfg=None, is_mp=True):
    """Run the configured forecasts for every language of a time series and save them.

    For each language in the time series object, the forecast cfgs found in
    the performance DF are run through ``dtp.tf`` and combined with
    ``cfg_fcast``. The per-language results are concatenated and handed to
    ``to_table``.

    All languages are processed: an earlier debug counter that stopped the
    loop after three languages has been removed.

    :param ts_name: time series name
    :param cutoff_date: forecast cutoff date (converted with ``pd.to_datetime``
        before saving)
    :param cfg_cols: columns of the performance DF that define a forecast cfg
    :param to_db: passed through to ``to_table``
    :param df_cfg: optional DF with the forecast cfgs (needs a 'language'
        column and ``cfg_cols``); when None, ``get_fcast_cfg`` is used
    :param is_mp: run the forecasts in parallel
    :return: the value returned by ``to_table``, or None when no forecast
        cfgs or no forecast data are available
    """
    cfg_file = get_fcast_cfg_file()
    with open(os.path.expanduser(cfg_file), 'r') as fp:
        d_cfg = json.load(fp)

    perf_df = df_cfg.copy() if df_cfg is not None else get_fcast_cfg(ts_name, cutoff_date)
    if perf_df is None:  # nothing to forecast with: fail with a message instead of a TypeError below
        s_ut.my_print('ERROR: cannot forecast without fcast configs')
        return None

    if_exists = d_cfg['if_exists']
    upr_horizon, lwr_horizon = d_cfg['upr_horizon_days'], d_cfg['lwr_horizon_days']

    # set up (ts, regressors, ...)
    ts_obj, reg_dict, cfg_dict, _ = dtp.initialize(cfg_file, cutoff_date, ts_name, False, is_mp=is_mp, init_date='2016-01-01')

    # get fcasts
    fcast_list = list()
    for l, l_df in ts_obj.df_dict.items():  # by language
        if l_df is None or len(l_df) == 0:
            s_ut.my_print('WARNING: no data for language: ' + str(l))
            continue

        pl = perf_df[perf_df['language'] == l].copy()
        if len(pl) == 0:
            s_ut.my_print('WARNING: no fcast cfg data for language: ' + str(l))
            continue

        cfgs = pl[cfg_cols].copy().to_dict(orient='records')
        print('\n')
        s_ut.my_print('********************************* starting forecast for language: ' + str(l))
        for d in cfgs:
            print('++ config: ' + str(d))

        arg_list = dtp.prophet_prep(ts_obj, l, reg_dict.get(l, None), cfg_dict, upr_horizon, lwr_horizon, cfgs, False)
        f_list = s_ut.do_mp(dtp.tf, arg_list, is_mp=is_mp, cpus=None, do_sigkill=True)
        n_fcasts = 0 if f_list is None else len(f_list)  # tolerate a None result
        s_ut.my_print('********************************* actual forecasts completed for language: ' + str(l) + ': ' + str(n_fcasts))

        if n_fcasts > 0:
            fl = cfg_fcast(f_list, pl, cfg_cols)
            fl['language'] = l
            fcast_list.append(fl)
        else:
            s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no forecast DF for ' + str(l))

    table = 'sup.fct_ds_interaction_based_forecasts'
    if len(fcast_list) > 0:
        df_out = pd.concat(fcast_list, axis=0)
        file_out = to_table(to_db, table, pd.to_datetime(cutoff_date), ts_name, if_exists, df_out)
    else:
        s_ut.my_print('ERROR: no forecast data generated')
        file_out = None
    return file_out
def _parse_lang_fcast_args(argv):
    """Return (ts_name, run_date, to_table) from ``[prog, ts_name[, run_date[, to_table]]]``; exit on bad input."""
    err_msg = 'ERROR: invalid arguments (ts_name, run_date, to_table): ' + str(argv)
    if len(argv) not in (2, 3, 4):
        s_ut.my_print(err_msg)
        sys.exit()
    ts_name = argv[1]
    run_date = argv[2] if len(argv) >= 3 else 'today'  # default: run as of today
    to_table = argv[3] if len(argv) == 4 else 0        # default: do not push to the tables
    try:
        return ts_name, pd.to_datetime(run_date), bool(int(to_table))
    except ValueError:
        s_ut.my_print(err_msg)
        sys.exit()


def main(argv):
    """Run all the forecast cfgs for every language of a time series and save the results.

    The cutoff date is derived from the run date with ``tm_ut.get_last_sat``.
    Forecasts are produced by ``fcast_lang`` for each cfg returned by
    ``get_f_cfg``, then saved to file and, when requested, pushed to
    'sup.cx_language_forecast' and 'sup.cx_language_forecast_detail'.

    Side effects: updates the module-level ``FCAST_DICT`` ('outlier_coef',
    'do_res', 'r_mode') and terminates the process with ``sys.exit()`` on
    invalid arguments, missing configuration, missing actuals or a failed
    table load.

    :param argv: ``[prog, ts_name]``, ``[prog, ts_name, run_date]`` or
        ``[prog, ts_name, run_date, to_table]`` where ``to_table`` is an
        integer flag (0/1). ``run_date`` defaults to today and ``to_table``
        to False.
    :return: None
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = '~/my_tmp/fbp/'
    # ###########################

    print(argv)
    ts_name, run_date, to_table = _parse_lang_fcast_args(argv)

    # data cfg
    cutoff_date = tm_ut.get_last_sat(run_date)  # set to last saturday before run_date or the run_date if a saturday
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    FCAST_DICT['outlier_coef'] = ts_cfg.get('outlier_coef', [3.0])
    fcast_days = ts_cfg.get('fcast_days', None)
    if fcast_days is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: fcast_days must be specified in data_cfg')
        sys.exit()
    fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    if time_scale == 'W':
        fcast_date = fcast_date - pd.to_timedelta(1 + fcast_date.weekday(), unit='D')  # set to week starting Sunday
        cu = cutoff_date - pd.to_timedelta(1 + cutoff_date.weekday(), unit='D')        # set to week starting Sunday
        fcast_days = (fcast_date - cu).days  # multiple of 7
        upr_horizon = int(fcast_days / 7)    # in time scale units
    elif time_scale == 'D':
        upr_horizon = int(fcast_days)        # in time scale units
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' invalid time scale: ' + str(time_scale))
        sys.exit()

    s_ut.my_print('pid: ' + str(os.getpid()) + ' ------------------------ start language forecast for ' + str(ts_name) +
                  ' from cutoff date ' + str(cutoff_date.date()) +
                  ' (excluded) to forecast date ' + str(fcast_date.date()) + ' (included) -----------------------')

    # get actuals
    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)  # may have data past cutoff for accuracy checking
    if actuals_df['ds'].max() < cutoff_date:
        s_ut.my_print('ERROR: no actuals available for forecast from cutoff date: ' + str(cutoff_date.date()))
        sys.exit()
    f_actuals_df = actuals_df[actuals_df['ds'] <= cutoff_date].copy()  # actuals for forecast: only use up to cutoff date

    # adjust FCAST_DICT
    if len(FCAST_DICT['do_res']) == 2:  # True, False
        # MUST overwrite: the False case is always included and otherwise we double count.
        FCAST_DICT['do_res'] = [True]
    if len(ts_cfg.get('regressors', list())) == 0:
        FCAST_DICT['r_mode'] = [None]
        reg_dict = dict()
    else:
        # stored by cutoff date on last Sat of the month
        reg_dict = regs.ens_fcast(ts_name, ts_cfg['regressors'], cutoff_date, time_scale, fcast_days, init_date, f_actuals_df)

    # update init_date: start where both the actuals and every regressor have data
    init_date = max([f_actuals_df['ds'].min()] + [f['ds'].min() for f in reg_dict.values()])
    f_actuals_df = f_actuals_df[f_actuals_df['ds'] >= init_date].copy()
    reg_dict = {lx: f[f['ds'] >= init_date].copy() for lx, f in reg_dict.items()}
    ts_cfg['init_date'] = init_date

    # set the list of fcast cfgs
    tlist = get_f_cfg(FCAST_DICT, cutoff_date, init_date, time_scale)  # list of fcast cfg's
    fix_pars = [f_actuals_df, ts_name, reg_dict, fcast_date, cutoff_date, ts_cfg, time_scale, upr_horizon]
    arg_list = [fix_pars + [t_cfg] for t_cfg in tlist]
    n_fcfg = 2 * len(arg_list)  # 2 fcasts are done per input cfg (do_res = true and do_res = false)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ++++++++ there are ' + str(n_fcfg) + ' fcast configs per language **********')

    # run serially when testing, in parallel otherwise
    df_list_ = s_ut.do_mp(fcast_lang, arg_list, is_mp=not is_test, cpus=None, do_sigkill=True)

    # join all the fcasted data into a flat list
    df_list = [f for f in df_list_ if f is not None]
    if len(df_list) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no forecasts available for ' + str(ts_cfg['ts_key']) +
                      ' from cutoff date ' + str(cutoff_date.date()) +
                      ' (excluded) to forecast date ' + str(fcast_date.date()) + ' (included) -----------------------')
        return

    ylist, alist = list(), list()
    for fl in df_list:
        fl = set_cfg(fl.copy(), CFG_COLS)
        ylist.append(fl[['ds', 'language', 'yhat', 'ts_name', 'cutoff', 'dim_cfg', 'fcast_date']].copy())
        alist.append(fl)

    partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_cfg['ts_key']}
    load_err = 'pid: ' + str(os.getpid()) + ' ERROR: no forecasts loaded to table for ' + \
               str(ts_cfg['ts_key']) + ' and cutoff date ' + str(cutoff_date.date())

    # save basic fcast data
    fcast_df = pd.concat(ylist, axis=0)  # now all the list elements have the same columns
    fcast_df.reset_index(inplace=True, drop=True)
    ok_cfg = fcast_df['dim_cfg'].unique()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ' + str(len(ok_cfg)) + ' forecasts cfgs available for ' + str(ts_name) +
                  ' from cutoff date ' + str(cutoff_date.date()) +
                  ' (excluded) to forecast date ' + str(fcast_date.date()) + ' (included) -----------------------')
    p_ut.save_df(fcast_df, froot + 'lang_fcast_' + ts_name + '_' + str(cutoff_date.date()))
    if to_table is True:
        tab_cols = ['ds', 'language', 'dim_cfg', 'yhat']
        ret = hql.to_tble(fcast_df, tab_cols, 'sup.cx_language_forecast', partition)
        if ret != 0:
            s_ut.my_print(load_err)
            sys.exit()

    # save all fcast data (y_upr, y_lwr, ...)
    all_df = pd.concat(alist, axis=0)  # now all the list elements have the same columns
    # de-duplicate the column names while keeping their order, so the saved layout is stable across runs
    all_cols = list(dict.fromkeys(c for c in all_df.columns if c not in CFG_COLS))
    all_df.reset_index(inplace=True, drop=True)
    all_df = all_df[all_cols].copy()
    all_df = all_df[all_df['dim_cfg'].isin(ok_cfg)].copy()
    p_ut.save_df(all_df, froot + 'fcast_all_' + ts_name + '_' + str(cutoff_date.date()))
    if to_table is True:
        all_df.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
        mf = pd.melt(all_df, id_vars=['ds', 'language', 'dim_cfg'], var_name='key', value_name='value')
        mf.dropna(subset=['value'], inplace=True)
        mf = mf[mf['value'] != 0.0].copy()
        ret = hql.to_tble(mf, list(mf.columns), 'sup.cx_language_forecast_detail', partition)
        if ret != 0:
            s_ut.my_print(load_err)
            sys.exit()
    print('DONE')
def cfg_selection(ts_name, cutoff_date, cfg_cols, p_col):
    """Find the best forecast cfgs for each language and save them.

    The cfg data is read with ``get_cfg_data``, ranked with ``set_rank`` and
    passed to ``cfg_selection_`` for every combination of the parameter lists
    below. The selections, the best selection per language (smallest
    ``p_col``) and the cfg index are saved under '~/my_tmp/cfg_sel/'.
    The process exits when no data is read or no selection is returned.

    :param ts_name: time series name (string, used in the output file names)
    :param cutoff_date: cutoff date (string, used in the output file names)
    :param cfg_cols: list of columns that define a forecast cfg
    :param p_col: performance column used to select the best cfgs
    :return: None
    """
    def _month_steps(delta):
        # months elapsed since the previous cutoff (0 for the first one)
        return 0 if pd.isna(delta) else delta.n

    def _encode_idx(val):
        # keep only single-element lists, serialized as a JSON list of ints
        if isinstance(val, list) and len(val) == 1:
            return json.dumps([int(y) for y in val[0]])
        return np.nan

    def _decode_idx(val):
        return json.loads(val) if isinstance(val, str) else np.nan

    def _best_rows(grp):
        return grp[grp[p_col] == grp[p_col].min()]

    pid_tag = 'pid: ' + str(os.getpid())

    df = get_cfg_data(ts_name, cfg_cols, p_col)  # read all the cfgs and set the cfg_idx
    if df is None or len(df) == 0:
        s_ut.my_print(pid_tag + ' ERROR: no data return')
        s_ut.my_print(pid_tag + ': ERROR')
        sys.exit()

    # prepare df for cfg regression: only numerical vals in the cols
    for flag_col in ('h_mode', 'do_res'):
        df[flag_col] = df[flag_col].astype(int)

    # set the cutoff dates as increasing integers for the regression
    cutoffs = pd.Series(df['cutoff'].unique()).sort_values()
    steps = cutoffs.dt.to_period('M').diff().apply(_month_steps)
    df = df.merge(pd.DataFrame({'cutoff': cutoffs, 'n_ds': steps.cumsum()}), on='cutoff', how='left')
    df = set_rank(df, p_col)  # set the rank of each cfg for each language and cutoff_date

    q_list = [0.1, 1.0, 0.25]
    y_list = [p_col, 'rank_log', 'rank', 'rank_logistic']
    x_list = ['box-cox', None, 'logistic']
    o_list = ['a_loss', 'w_loss', 'p_loss', 'r_loss']
    n_list = [10, 15, 20]
    m_list = [3, 4]
    cpus = len(q_list) * len(y_list) * len(x_list) * len(m_list)
    arg_list = list()
    for combo in itertools.product(q_list, y_list, x_list, m_list):
        arg_list.append(list(combo) + [o_list, n_list, p_col, cutoff_date, df.copy(), cpus])
    f_list = s_ut.do_mp(cfg_selection_, arg_list, is_mp=True, cpus=None, do_sigkill=True, verbose=False)

    try:
        f_sel = pd.concat([x for x in f_list if x is not None])
    except ValueError as e:
        s_ut.my_print('ERROR: No data returned: ' + str(e))
        sys.exit()

    # cfg_idx is serialized to a string for update_df/drop_duplicates and restored afterwards
    f_sel['cfg_idx'] = f_sel['cfg_idx'].apply(_encode_idx)
    f_sel.dropna(subset=['cfg_idx'], inplace=True)
    update_df(f_sel, ts_name, cutoff_date)
    f_sel.drop_duplicates(inplace=True)
    f_sel['cfg_idx'] = f_sel['cfg_idx'].apply(_decode_idx)
    f_sel.dropna(subset=['cfg_idx'], inplace=True)

    out_dir = '~/my_tmp/cfg_sel/'
    file_tag = ts_name + '_' + cutoff_date
    p_ut.save_df(f_sel, out_dir + 'cfg_sel_' + file_tag)

    # select the best for each language by p_col: optimal but unknown
    f_best = f_sel.groupby('language').apply(_best_rows).reset_index(drop=True)
    best_cols = ['xform', 'topN', 'obj', 'yobj', 'cfg_idx', 'f_err', 'f_err_max', 'qtile', 'language', 'ts_name', 'cutoff']
    f_best = f_best[best_cols].copy()
    p_ut.save_df(f_best, out_dir + 'cfg_best_' + file_tag)

    f_idx = df[['ts_name', 'language', 'cfg_idx', p_col, 'cfg_str'] + cfg_cols].copy()
    f_idx['cutoff'] = cutoff_date
    p_ut.save_df(f_idx, out_dir + 'cfg_idx_' + file_tag)
    s_ut.my_print(pid_tag + ': DONE')
def main(argv):
    """Train the forecast cfgs for every language of a time series and save them.

    For each language and each number of training days, the forecast cfgs
    from ``dtp.get_f_cfgs`` are run through ``dtp.tf``. Results are saved per
    language and, all together, to file and to the hive table
    'sup.fct_cx_forecast_config_v3'.

    :param argv: two-element sequence ``(ts_name, cutoff_date)``
    :return: None
    """
    print('usage: python lang_fcast.py <ts_name> <cutoff_date>')
    print(argv)
    ts_name, cutoff_date = argv
    this_file = os.path.basename(__file__)
    cfg_dir = '/'.join(FILE_PATH.split('/')[:-1])
    cfg_file = os.path.join(cfg_dir, 'config/' + this_file[:-3] + '_cfg.json')

    # validate the data, prepare regressors, holidays DF
    ts_obj, reg_dict, cfg_dict, train_days = dtp.initialize(cfg_file, cutoff_date, ts_name, True, init_date='2016-01-01')
    upr_horizon, lwr_horizon = cfg_dict['upr_horizon_days'], cfg_dict['lwr_horizon_days']
    if_exists = cfg_dict['if_exists']
    cutoff_date = ts_obj.cutoff_date
    pid_tag = 'pid: ' + str(os.getpid())

    out_list = list()
    cu = cutoff_date + pd.to_timedelta(upr_horizon, unit='D')  # actual cutoff date for training
    ds = str(cu.date())

    for l, t_df in ts_obj.df_dict.items():
        s_ut.my_print('\n\n****************************** starting language: ' + str(l))
        lang_list = list()
        if t_df is None:
            s_ut.my_print(pid_tag + ' WARNING: no training DF for ' + str(l))
        else:
            for tdays in train_days:
                tlist = dtp.get_f_cfgs(t_df, l, cutoff_date, tdays, upr_horizon, cfg_dict, is_train=True)
                if tlist is None:
                    s_ut.my_print('WARNING: language ' + str(l) + ' and training cutoff date ' + str(cutoff_date.date()) +
                                  ' and training days ' + str(tdays) + ' has NO fcast configs')
                    continue

                arg_list = dtp.prophet_prep(ts_obj, l, reg_dict.get(l, None), cfg_dict, upr_horizon, lwr_horizon, tlist, True)
                s_ut.my_print(pid_tag + ' ************* forecasts for ' + str(l) + ' with ' + str(tdays) +
                              ' train days and ' + str(len(arg_list)) + ' configs')
                f_list = s_ut.do_mp(dtp.tf, arg_list, is_mp=True, cpus=len(arg_list), do_sigkill=True)
                if f_list is None:
                    s_ut.my_print(pid_tag + ': No results with ' + str(tdays) + ' training days')
                    f_list = list()
                else:
                    s_ut.my_print(pid_tag + ': ' + str(len(f_list)) + ' results with ' + str(tdays) + ' training days')

                # save the fcast configs
                if len(f_list) > 0:
                    s_ut.my_print(pid_tag + ' concatenating ' + str(len(f_list)) + ' DFs for ' + str(l))
                    l_df = pd.concat(f_list, axis=0)
                    l_df['language'] = l
                    s_ut.my_print(pid_tag + ' Language ' + str(l) + ' has ' + str(len(l_df)) +
                                  ' fcast cfgs with ' + str(tdays) + ' training days')
                    l_df.reset_index(inplace=True, drop=True)
                    l_df['ds'] = ds  # here we only save cfg's not fcasts. Use ds for partition
                    l_df['ts_name'] = ts_name
                    l_df['cutoff'] = ds
                    lang_list.append(l_df)
                    out_list.append(l_df)
                else:
                    s_ut.my_print(pid_tag + ' WARNING: no DF for ' + str(l))

        if len(lang_list) > 0:  # save language level results
            fl = pd.concat(lang_list, axis=0)
            p_ut.save_df(fl, '~/my_tmp/fcast_cfg_v2_' + ds + '_' + ts_name + '_' + l)

    # all training done for this TS. Save data
    if len(out_list) == 0:
        s_ut.my_print('ERROR: no output')
        return

    s_ut.my_print(pid_tag + ' *************** saving training data ***********')
    df_all = pd.concat(out_list, axis=0)
    df_all.drop_duplicates(inplace=True)
    df_all.reset_index(inplace=True, drop=True)
    df_all['ds'] = ds  # here we only save cfg's not fcasts
    df_all['ts_name'] = ts_name
    df_all['cutoff'] = ds
    p_ut.save_df(df_all, '~/my_tmp/fcast_cfg/fcast_cfg_v2_' + ds + '_' + ts_name)
    df_all.drop(['ds', 'ts_name'], inplace=True, axis=1)  # not needed to push: they are the partition
    partition_ = {'ds': ds, 'ts_name': ts_name}
    table = 'sup.fct_cx_forecast_config_v3'
    try:  # only hive works with the partition argument
        with s_ut.suppress_stdout_stderr():
            import airpy as ap
        ap.hive.push(df_all, table=table, if_exists=if_exists, partition=partition_,
                     table_props={'abb_retention_days': '-1', 'abb_retention_days_reason': 'fact table. No pii'})
        s_ut.my_print('data saved to table ' + table + ' for ' + ts_name + ' and ds ' + ds)
        print('DONE')
    except Exception as e:
        # best-effort push: report the reason, but let KeyboardInterrupt/SystemExit propagate
        s_ut.my_print('ERROR: could not save to table ' + table + ' for ' + ts_name + ': ' + str(e))