Example #1
0
def lasso_vble_sel_results(fdf, a_ddf, cutoff_date_, normalize, reg_mdl):
    """Seed a cfg set per language with Lasso, then refine it by variable selection.

    Steps:
      1. Restrict forecasts (``fdf``) and actuals (``a_ddf``) to ``ds <= cutoff_date_``.
      2. Run ``lasso_selection`` once per language (in parallel) on the restricted data.
      3. Build the variable-selection tasks with ``process_mdl_args`` from the
         *unrestricted* copies of ``fdf`` / ``a_ddf`` and the Lasso output.
      4. Run ``variable_selection`` in forward ('fwd') and then backward ('bwd')
         mode and stack every non-None result row into one DataFrame.

    :param fdf: forecast DataFrame; must have 'ds', 'yhat', 'cfg_idx', 'language'.
    :param a_ddf: actuals DataFrame; must have 'ds', 'language', 'y'.
    :param cutoff_date_: last date (inclusive) used for the Lasso step.
    :param normalize: forwarded unchanged to ``lasso_selection``.
    :param reg_mdl: forwarded unchanged to ``process_mdl_args``.
    :return: DataFrame of scores with rows containing nulls dropped and a fresh index.
    """
    lang_count = fdf['language'].nunique()      # also used as the worker count
    train_fdf = fdf[fdf['ds'] <= cutoff_date_].copy()
    train_adf = a_ddf[a_ddf['ds'] <= cutoff_date_][['ds', 'language', 'y']].copy()

    # one Lasso task per language: [language, language forecasts, language actuals, normalize]
    lasso_tasks = list()
    for lang, lang_fdf in train_fdf[['ds', 'yhat', 'cfg_idx', 'language']].groupby('language'):
        lang_adf = train_adf[train_adf['language'] == lang].copy()
        lasso_tasks.append([lang, lang_fdf, lang_adf, normalize])
    lasso_rows = s_ut.do_mp(lasso_selection, lasso_tasks, is_mp=True, cpus=lang_count, do_sigkill=True)

    # keyed by language, e.g. {lang: {'alpha': ..., 'cfg_list': [...]}, ..}
    lasso_dict = pd.DataFrame(lasso_rows).set_index('language').to_dict(orient='index')

    # refine the Lasso cfg by variable selection, first forward and then backward
    base_tasks = process_mdl_args(fdf.copy(), a_ddf.copy(), cutoff_date_, lasso_dict, reg_mdl)
    score_rows = list()
    for direction in ('fwd', 'bwd'):
        dir_tasks = [[direction] + task for task in base_tasks]
        nested = s_ut.do_mp(variable_selection, dir_tasks, is_mp=True, cpus=lang_count, do_sigkill=True)
        for chunk in nested:                    # each worker returns a list of rows
            score_rows.extend(row for row in chunk if row is not None)

    return pd.DataFrame(score_rows).dropna().reset_index(drop=True)
Example #2
0
def ens_fcast(fdf, adf, cutoff_date, g_cols, b_cols, normalize=True):
    """Build ensemble forecasts per group by running Lasso selection in parallel.

    The forecast frame is first indexed with ``t_ut.set_cfg_idx``. Training data
    is the window ``[t_start, cutoff_date]`` where ``t_start`` is the later of
    the first actuals date and the first forecast date; rows after
    ``cutoff_date`` are the forecasts to be combined.

    :param fdf: forecast DataFrame with 'ds', 'ticket_count' and the group columns.
    :param adf: actuals DataFrame with 'ds', 'ticket_count' and the group columns.
    :param cutoff_date: last training date (inclusive).
    :param g_cols: list of grouping columns.
    :param b_cols: list of grouping columns, placed before ``g_cols``.
    :param normalize: forwarded unchanged to ``t_ut.lasso_selection``.
    :return: concatenation of each worker's 'res' frame, with null rows and
             non-positive 'y_pred' rows removed and 'y_pred' rounded to 0 decimals.
    """
    idx_fdf = t_ut.set_cfg_idx(fdf.copy())
    t_start = max(adf['ds'].min(), idx_fdf['ds'].min())
    grp_cols = b_cols + g_cols

    fdf_in_window = (idx_fdf['ds'] <= cutoff_date) & (idx_fdf['ds'] >= t_start)
    train_fdf = idx_fdf[fdf_in_window].copy()
    adf_in_window = (adf['ds'] <= cutoff_date) & (adf['ds'] >= t_start)
    train_adf = adf[adf_in_window][['ds', 'ticket_count'] + b_cols + g_cols].copy()
    future_fdf = idx_fdf[(idx_fdf['ds'] > cutoff_date)].copy()

    # per-group lookups for the training actuals and the post-cutoff forecasts
    actuals_by_grp = dict()
    for key, frame in train_adf.groupby(grp_cols):
        actuals_by_grp[key] = frame
    future_by_grp = dict()
    for key, frame in future_fdf.groupby(grp_cols):
        future_by_grp[key] = frame

    # one task per group present in the training forecasts; missing lookups become None
    tasks = list()
    for key, grp_fdf in train_fdf[['ds', 'ticket_count', 'cfg_idx'] + grp_cols].groupby(grp_cols):
        tasks.append([
            key, grp_fdf,
            actuals_by_grp.get(key, None),
            future_by_grp.get(key, None),
            'ticket_count', grp_cols, normalize
        ])
    results = s_ut.do_mp(t_ut.lasso_selection, tasks, is_mp=True, cpus=None, do_sigkill=True)

    f_all = pd.concat([res['res'] for res in results], axis=0)
    f_all.dropna(inplace=True)
    f_all = f_all[f_all['y_pred'] > 0]
    f_all['y_pred'] = np.round(f_all['y_pred'].values, 0)
    return f_all
Example #3
0
def prepare_regressors(data_cfg,
                       _cfg,
                       d_cfg,
                       cutoff_date,
                       fcast_days,
                       init_date='2016-01-01'):
    """Load, forecast and merge all regressors declared in ``data_cfg``.

    Each regressor listed under ``data_cfg['regressors']`` is prepared in
    parallel with ``prepare_regs``; the prepared data is turned into forecast
    tasks by ``fcast_prep``, forecast in parallel with ``fcast_regressors`` and
    finally merged into a single DataFrame by ``merge_regressors``.

    :param data_cfg: dict; only the optional 'regressors' entry is read here.
    :param _cfg: not used by this function.
    :param d_cfg: not used by this function.
    :param cutoff_date: timestamp-like with a ``.date()`` method; last date of actual data.
    :param fcast_days: forecast horizon in days.
    :param init_date: first date to consider; passed as given to ``prepare_regs``
                      and as a timestamp to ``fcast_prep``.
    :return: the merged regressors DataFrame, or None when no regressors are
             configured or none could be merged. A non-None result is also
             saved to '~/my_tmp/reg_df'.
    """
    s_ut.my_print('************* reading regressors ********************')
    reg_cfg = data_cfg.get('regressors', None)
    if reg_cfg is None:
        return None

    arg_list = [[rname, rcfg, cutoff_date, fcast_days, init_date]
                for rname, rcfg in reg_cfg.items()]
    rf_list = s_ut.do_mp(prepare_regs,
                         arg_list,
                         is_mp=True,
                         cpus=None,
                         do_sigkill=True)
    arg_list, rcol_list = fcast_prep(rf_list, reg_cfg, cutoff_date, fcast_days,
                                     pd.to_datetime(init_date))
    r_list = s_ut.do_mp(fcast_regressors,
                        arg_list,
                        is_mp=True,
                        cpus=None,
                        do_sigkill=True)
    r_list = [r for r in r_list if r is not None]  # drop all Nones if any
    reg_fdf = merge_regressors(r_list, rcol_list)  # merge all regressors in a single DF

    if reg_fdf is None:
        s_ut.my_print('WARNING: no regressors available')
        return reg_fdf

    p_ut.save_df(reg_fdf, '~/my_tmp/reg_df')

    # Summary log: largest day gap between consecutive rows of any language and
    # the total number of nulls in the regressor columns.
    fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')
    gaps = [
        reg_fdf[reg_fdf['language'] == lang]['ds'].diff().dt.days.max()
        for lang in reg_fdf['language'].unique()
    ]
    # default=None: an empty merged frame must not crash the run from inside a log line
    max_gap = max(gaps, default=None)
    n_nulls = sum([reg_fdf[c].isnull().sum() for c in rcol_list])
    s_ut.my_print('final predicted regressors: fcast date: ' +
                  str(fcast_date.date()) + ' cutoff date: ' +
                  str(cutoff_date.date()) + ' fcast_days: ' +
                  str(fcast_days) + ' gap: ' + str(max_gap) + ' nulls: ' +
                  str(n_nulls))
    return reg_fdf
Example #4
0
def best_regression_cfg(X_train,
                        y_train,
                        X_test,
                        y_test,
                        y_perf,
                        n_good,
                        topN_list,
                        obj_list,
                        used_cpus=0):
    """Grid-search AdaBoost settings and pick a consensus cfg per objective.

    Every combination of the hard-coded hyper-parameter grids below is
    evaluated in parallel by ``ab_func``. For each objective in ``obj_list``
    the ``n_good`` rows with the smallest objective value are kept, and the
    most frequent value (mode) of each AdaBoost setting and of 'topN' among
    those rows is reported.

    :param X_train, y_train, X_test, y_test, y_perf: forwarded unchanged to ``ab_func``.
    :param n_good: number of 'good' AdaBoost cfgs to take the mode over.
    :param topN_list: forwarded unchanged to ``ab_func``.
    :param obj_list: names of the loss columns produced by ``ab_func``.
    :param used_cpus: forwarded to ``s_ut.do_mp``.
    :return: list with one dict per usable objective, holding the keys in
             ``ab_cols`` plus 'topN' and 'obj'. Objectives whose best value is
             infinite are skipped.
    """
    ab_cols = [
        'adb_estimators', 'max_depth', 'learning_rate', 'loss',
        'min_samples_split'
    ]  # AdaBoost cfg

    estimators_list = [25, 50, 100, 200]
    depth_list = [4, 8, 12, 16]
    learn_rate = [0.5, 1, 1.5, 2]
    min_samples_split_list = [2, 4, 8, 12]
    loss_list = ['linear', 'square', 'exponential']
    grid = itertools.product(estimators_list, depth_list, learn_rate,
                             min_samples_split_list, loss_list)
    # each task: the 5 hyper-parameters followed by the shared data arguments
    ab_cfgs = [
        list(x) + [X_train, y_train, X_test, y_test, y_perf, topN_list]
        for x in grid
    ]
    f_list_ = s_ut.do_mp(ab_func,
                         ab_cfgs,
                         is_mp=True,
                         cpus=None,
                         do_sigkill=True,
                         verbose=False,
                         used_cpus=used_cpus)
    # flatten the per-task result lists, dropping empty records
    f_list = [x for l in f_list_ for x in l if len(x) > 0]
    f = pd.DataFrame(f_list)

    d_list = list()
    for obj in obj_list:  # for each obj func choose the best regressor
        fad = f.nsmallest(n=n_good, columns=[obj])
        # np.isinf returns a numpy bool, which is never identical to the True
        # singleton, so it must be tested by truth value and not with `is True`.
        if np.isinf(fad[obj].min()):
            continue
        d_adb = {c: fad[c].mode().values[0]
                 for c in ab_cols}  # take the most common among the top n_good
        d_adb['topN'] = fad['topN'].mode().values[0]
        d_adb['obj'] = obj
        d_list.append(d_adb)
    return d_list
Example #5
0
def prepare_regressors(data_cfg,
                       _cfg,
                       d_cfg,
                       cutoff_date,
                       fcast_days,
                       int_type,
                       init_date='2016-01-01'):
    """Prepare every configured regressor in parallel and merge them into one DataFrame.

    :param data_cfg: dict; only the optional 'regressors' entry is read here.
    :param _cfg: not used by this function.
    :param d_cfg: not used by this function.
    :param cutoff_date, fcast_days, int_type: forwarded unchanged to ``prepare_regs``.
    :param init_date: converted with ``pd.to_datetime`` before being forwarded.
    :return: the result of ``regressors.Regressor.merge_regressors`` (a warning
             is logged when it is None), or None when no regressors are configured.
    """
    s_ut.my_print('************* reading regressors ********************')
    reg_cfg = data_cfg.get('regressors', None)
    if reg_cfg is None:
        return None

    start_ts = pd.to_datetime(init_date)
    tasks = list()
    for rname, rcfg in reg_cfg.items():
        tasks.append([rname, rcfg, cutoff_date, fcast_days, int_type, start_ts])

    # one regressor object per configured regressor
    reg_objs = s_ut.do_mp(prepare_regs, tasks, is_mp=True, cpus=None, do_sigkill=True)

    # merge all regressors in a single DF
    merged = regressors.Regressor.merge_regressors(reg_objs)
    if merged is None:
        s_ut.my_print('WARNING: no regressors available')
    return merged
Example #6
0
def main(ts_name,
         cutoff_date,
         cfg_cols,
         to_db=True,
         df_cfg=None,
         is_mp=True,
         max_langs=None):
    """Forecast every language of a time series with its stored forecast configs.

    The forecast configs come from ``df_cfg`` when given, otherwise from
    ``get_fcast_cfg``. For each language in ``ts_obj.df_dict`` the rows of the
    config table for that language are turned into forecast tasks with
    ``dtp.prophet_prep`` and run through ``dtp.tf``; the results are combined
    with ``cfg_fcast`` and written out with ``to_table``.

    :param ts_name: name of the time series.
    :param cutoff_date: cutoff date, forwarded to ``dtp.initialize`` and
                        (as a timestamp) to ``to_table``.
    :param cfg_cols: columns of the config table that define one forecast cfg.
    :param to_db: forwarded to ``to_table``.
    :param df_cfg: optional DataFrame of forecast configs with a 'language' column.
    :param is_mp: run the forecasts with multiprocessing when True.
    :param max_langs: optional cap on the number of languages visited (counting
                      skipped ones). None, the default, visits all of them.
                      Intended for debugging runs.
    :return: whatever ``to_table`` returns, or None when nothing was forecast.
    """
    cfg_file = get_fcast_cfg_file()
    with open(os.path.expanduser(cfg_file), 'r') as fp:
        d_cfg = json.load(fp)

    perf_df = df_cfg.copy() if df_cfg is not None else get_fcast_cfg(
        ts_name, cutoff_date)
    if perf_df is None:
        s_ut.my_print('ERROR: cannot forecast without fcast configs')
        return None

    if_exists = d_cfg['if_exists']
    upr_horizon, lwr_horizon = d_cfg['upr_horizon_days'], d_cfg[
        'lwr_horizon_days']

    # set up (ts, regressors, ...)
    ts_obj, reg_dict, cfg_dict, _ = dtp.initialize(cfg_file,
                                                   cutoff_date,
                                                   ts_name,
                                                   False,
                                                   is_mp=is_mp,
                                                   init_date='2016-01-01')

    # get fcasts
    fcast_list = list()
    for n_seen, (l, l_df) in enumerate(ts_obj.df_dict.items()):  # by language
        if max_langs is not None and n_seen >= max_langs:
            s_ut.my_print('WARNING: stopping after ' + str(max_langs) +
                          ' languages (max_langs is set)')
            break
        if l_df is None or len(l_df) == 0:
            s_ut.my_print('WARNING: no data for language: ' + str(l))
            continue
        pl = perf_df[perf_df['language'] == l].copy()
        if len(pl) == 0:
            s_ut.my_print('WARNING: no fcast cfg data for language: ' + str(l))
            continue
        cfgs = pl[cfg_cols].to_dict(orient='records')  # one dict per fcast cfg

        print('\n')
        s_ut.my_print(
            '********************************* starting forecast for language: '
            + str(l))
        for d in cfgs:
            print('++ config: ' + str(d))

        arg_list = dtp.prophet_prep(ts_obj, l, reg_dict.get(l, None), cfg_dict,
                                    upr_horizon, lwr_horizon, cfgs, False)
        f_list = s_ut.do_mp(dtp.tf,
                            arg_list,
                            is_mp=is_mp,
                            cpus=None,
                            do_sigkill=True)
        if f_list is None:  # treat "no results" like an empty result list
            f_list = list()

        s_ut.my_print(
            '********************************* actual forecasts completed for language: '
            + str(l) + ': ' + str(len(f_list)))
        if len(f_list) > 0:
            fl = cfg_fcast(f_list, pl, cfg_cols)
            fl['language'] = l
            fcast_list.append(fl)
        else:
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' ERROR: no forecast DF for ' + str(l))

    table = 'sup.fct_ds_interaction_based_forecasts'
    if len(fcast_list) > 0:
        df_out = pd.concat(fcast_list, axis=0)
        file_out = to_table(to_db, table, pd.to_datetime(cutoff_date), ts_name,
                            if_exists, df_out)
    else:
        s_ut.my_print('ERROR: no forecast data generated')
        file_out = None
    return file_out
Example #7
0
def main(argv):
    """Command-line entry point: forecast all languages of one time series.

    Accepted argument vectors (``argv[0]`` is ignored):
        [prog, ts_name]                       run_date = today, no table upload
        [prog, ts_name, run_date]             no table upload
        [prog, ts_name, run_date, to_table]   to_table is parsed with int() (0 / 1)

    Outline:
      1. Derive the cutoff date from the run date (``tm_ut.get_last_sat``) and
         the forecast date / horizon from ``fcast_days`` in the ts config.
      2. Load actuals and, when configured, regressor forecasts; trim both so
         they start on a common ``init_date``.
      3. Build the list of forecast cfgs and run ``fcast_lang`` on each one.
      4. Save the basic forecasts ('lang_fcast_*') and the detailed forecasts
         ('fcast_all_*') under ``froot`` and, when ``to_table`` is set, push
         them to the two hive tables.

    Module-level names used: ``FCAST_DICT`` (mutated in place), ``CFG_COLS`` and
    ``is_test`` (when truthy the forecasts run without multiprocessing).

    The process exits through ``sys.exit()`` on invalid arguments, missing
    ``fcast_days``, missing actuals at the cutoff date, or a failed table load.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = '~/my_tmp/fbp/'

    print(argv)
    usage_err = 'ERROR: invalid arguments (ts_name, run_date, to_table): ' + str(argv)
    if len(argv) not in (2, 3, 4):
        s_ut.my_print(usage_err)
        sys.exit()
    ts_name = argv[1]
    raw_run_date = argv[2] if len(argv) >= 3 else 'today'
    raw_to_table = argv[3] if len(argv) == 4 else 0
    try:
        run_date = pd.to_datetime(raw_run_date)
        to_table = bool(int(raw_to_table))
    except ValueError:
        s_ut.my_print(usage_err)
        sys.exit()

    # data cfg
    cutoff_date = tm_ut.get_last_sat(
        run_date
    )  # set to last saturday before run_date or the run_date if a saturday
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    FCAST_DICT['outlier_coef'] = ts_cfg.get('outlier_coef', [3.0])

    fcast_days = ts_cfg.get('fcast_days', None)
    if fcast_days is None:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: fcast_days must be specified in data_cfg')
        sys.exit()
    fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    if time_scale == 'W':
        fcast_date = fcast_date - pd.to_timedelta(
            1 + fcast_date.weekday(), unit='D')  # set to week starting Sunday
        cu = cutoff_date - pd.to_timedelta(
            1 + cutoff_date.weekday(), unit='D')  # set to week starting Sunday
        fcast_days = (fcast_date - cu).days  # multiple of 7
        upr_horizon = int(fcast_days / 7)  # in time scale units
    elif time_scale == 'D':
        upr_horizon = int(fcast_days)  # in time scale units
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' invalid time scale: ' +
                      str(time_scale))
        sys.exit()

    s_ut.my_print('pid: ' + str(os.getpid()) +
                  ' ------------------------ start language forecast for ' +
                  str(ts_name) + ' from cutoff date ' +
                  str(cutoff_date.date()) + ' (excluded) to forecast date ' +
                  str(fcast_date.date()) +
                  '  (included) -----------------------')

    # get actuals
    actuals_df = dp.ts_actuals(
        ts_name, ts_cfg,
        cols)  # may have data past cutoff for accuracy checking
    if actuals_df['ds'].max() < cutoff_date:
        s_ut.my_print(
            'ERROR: no actuals available for forecast from cutoff date: ' +
            str(cutoff_date.date()))
        sys.exit()
    # actuals for forecast: only use up to cutoff date
    f_actuals_df = actuals_df[actuals_df['ds'] <= cutoff_date].copy()

    # adjust FCAST_DICT
    if len(FCAST_DICT['do_res']) == 2:  # True, False
        # MUST overwrite: the False case is always included and otherwise we double count.
        FCAST_DICT['do_res'] = [True]
    if len(ts_cfg.get('regressors', list())) == 0:
        FCAST_DICT['r_mode'] = [None]
        reg_dict = dict()
    else:
        reg_dict = regs.ens_fcast(
            ts_name, ts_cfg['regressors'], cutoff_date, time_scale, fcast_days,
            init_date,
            f_actuals_df)  # stored by cutoff date on last Sat of the month

    # update init_date: latest of the first dates of the actuals and every regressor
    init_date = max([f_actuals_df['ds'].min()] +
                    [f['ds'].min() for f in reg_dict.values()])
    f_actuals_df = f_actuals_df[f_actuals_df['ds'] >= init_date].copy()
    reg_dict = {
        lx: f[f['ds'] >= init_date].copy()
        for lx, f in reg_dict.items()
    }
    ts_cfg['init_date'] = init_date

    # set the list of fcast cfgs
    tlist = get_f_cfg(FCAST_DICT, cutoff_date, init_date,
                      time_scale)  # list of fcast cfg's
    fix_pars = [
        f_actuals_df, ts_name, reg_dict, fcast_date, cutoff_date, ts_cfg,
        time_scale, upr_horizon
    ]
    # 2 fcasts are done per input cfg (do_res = true and do_res = false)
    arg_list = [fix_pars + [cfg] for cfg in tlist]
    n_fcfg = 2 * len(arg_list)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ++++++++ there are ' +
                  str(n_fcfg) + ' fcast configs per language **********')

    # run serially in test mode, with multiprocessing otherwise
    df_list_ = s_ut.do_mp(fcast_lang,
                          arg_list,
                          is_mp=not is_test,
                          cpus=None,
                          do_sigkill=True)

    # join all the fcasted data into a flat list
    df_list = [f for f in df_list_ if f is not None]
    if len(df_list) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: no forecasts available for ' +
                      str(ts_cfg['ts_key']) + ' from cutoff date ' +
                      str(cutoff_date.date()) +
                      ' (excluded) to forecast date ' +
                      str(fcast_date.date()) +
                      '  (included) -----------------------')
        return

    ylist, alist = list(), list()
    for fl in df_list:
        fl = set_cfg(fl.copy(), CFG_COLS)
        ylist.append(fl[[
            'ds', 'language', 'yhat', 'ts_name', 'cutoff', 'dim_cfg',
            'fcast_date'
        ]].copy())
        alist.append(fl)

    # save basic fcast data
    fcast_df = pd.concat(
        ylist, axis=0)  # now all the list elements have the same columns
    fcast_df.reset_index(inplace=True, drop=True)

    ok_cfg = fcast_df['dim_cfg'].unique()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ' + str(len(ok_cfg)) +
                  ' forecasts cfgs available for ' + str(ts_name) +
                  ' from cutoff date ' + str(cutoff_date.date()) +
                  ' (excluded) to forecast date ' +
                  str(fcast_date.date()) +
                  '  (included) -----------------------')
    fname = froot + 'lang_fcast_'
    p_ut.save_df(fcast_df, fname + ts_name + '_' + str(cutoff_date.date()))
    if to_table is True:
        tab_cols = ['ds', 'language', 'dim_cfg', 'yhat']
        partition = {
            'cutoff': str(cutoff_date.date()),
            'ts_name': ts_cfg['ts_key']
        }
        ret = hql.to_tble(fcast_df, tab_cols, 'sup.cx_language_forecast',
                          partition)
        if ret != 0:
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' ERROR: no forecasts loaded to table for ' +
                          str(ts_cfg['ts_key']) + ' and cutoff date ' +
                          str(cutoff_date.date()))
            sys.exit()

    # save all fcast data  (y_upr, y_lwr, ...)
    all_df = pd.concat(
        alist, axis=0)  # now all the list elements have the same columns
    # unique non-cfg columns, kept in their original order so the saved frame
    # has a deterministic column layout
    all_cols = list(
        dict.fromkeys(c for c in all_df.columns if c not in CFG_COLS))
    all_df.reset_index(inplace=True, drop=True)
    all_df = all_df[all_cols].copy()
    all_df = all_df[all_df['dim_cfg'].isin(ok_cfg)].copy()
    fname = froot + 'fcast_all_'
    p_ut.save_df(all_df, fname + ts_name + '_' + str(cutoff_date.date()))
    if to_table is True:
        all_df.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
        # long format: one (key, value) row per remaining column
        mf = pd.melt(all_df,
                     id_vars=['ds', 'language', 'dim_cfg'],
                     var_name='key',
                     value_name='value')
        mf.dropna(subset=['value'], inplace=True)
        mf = mf[mf['value'] != 0.0].copy()
        partition = {
            'cutoff': str(cutoff_date.date()),
            'ts_name': ts_cfg['ts_key']
        }
        ret = hql.to_tble(mf, list(mf.columns),
                          'sup.cx_language_forecast_detail', partition)
        if ret != 0:
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' ERROR: no forecasts loaded to table for ' +
                          str(ts_cfg['ts_key']) + ' and cutoff date ' +
                          str(cutoff_date.date()))
            sys.exit()
    print('DONE')
Example #8
0
def cfg_selection(ts_name, cutoff_date, cfg_cols, p_col):
    """Find the best forecast cfgs for each language and save the results.

    The cfg data read by ``get_cfg_data`` is made numeric, given a month-step
    counter ('n_ds') per cutoff and ranked with ``set_rank``. A grid of
    selection settings is then evaluated in parallel by ``cfg_selection_``; the
    combined results are passed to ``update_df`` and three frames are saved
    under '~/my_tmp/cfg_sel/': all selections ('cfg_sel_'), the rows with the
    smallest ``p_col`` per language ('cfg_best_') and the cfg index ('cfg_idx_').

    :param ts_name: time series name (string; used in the output file names).
    :param cutoff_date: cutoff date as a string (concatenated into file names).
    :param cfg_cols: list of cfg columns to keep in the cfg index frame.
    :param p_col: name of the performance column.
    :return: None. Exits the process when no cfg data or no results are available.
    """
    df = get_cfg_data(ts_name, cfg_cols, p_col)  # read all the cfgs and set the cfg_idx
    if df is None or len(df) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no data return')
        s_ut.my_print('pid: ' + str(os.getpid()) + ': ERROR')
        sys.exit()

    # prepare df for cfg regression: only numerical vals in the cols
    for flag_col in ('h_mode', 'do_res'):
        df[flag_col] = df[flag_col].astype(int)

    # set the cutoff dates as increasing integers for the regression
    cutoffs = pd.Series(df['cutoff'].unique())
    cutoffs.sort_values(inplace=True)

    def _months_step(delta):
        # the first diff is null: it counts as 0 months
        return 0 if pd.isna(delta) else delta.n

    month_steps = cutoffs.dt.to_period('M').diff().apply(_months_step)
    step_df = pd.DataFrame({'cutoff': cutoffs, 'n_ds': month_steps.cumsum()})
    df = df.merge(step_df, on='cutoff', how='left')
    df = set_rank(df, p_col)  # set the rank of each cfg for each language and cutoff_date

    q_list = [0.1, 1.0, 0.25]
    y_list = [p_col, 'rank_log', 'rank', 'rank_logistic']
    x_list = ['box-cox', None, 'logistic']
    o_list = ['a_loss', 'w_loss', 'p_loss', 'r_loss']
    n_list = [10, 15, 20]
    m_list = [3, 4]
    cpus = len(q_list) * len(y_list) * len(x_list) * len(m_list)

    # one task per grid point; every task gets its own copy of df
    tasks = list()
    for combo in itertools.product(q_list, y_list, x_list, m_list):
        tasks.append(list(combo) + [o_list, n_list, p_col, cutoff_date, df.copy(), cpus])
    results = s_ut.do_mp(cfg_selection_,
                         tasks,
                         is_mp=True,
                         cpus=None,
                         do_sigkill=True,
                         verbose=False)
    try:
        f_sel = pd.concat([res for res in results if res is not None])
    except ValueError as e:
        s_ut.my_print('ERROR: No data returned: ' + str(e))
        sys.exit()

    def _encode_idx(val):
        # single-element list of index lists -> JSON string; anything else -> NaN
        if isinstance(val, list) and len(val) == 1:
            return json.dumps([int(y) for y in val[0]])
        return np.nan

    def _decode_idx(val):
        # JSON string -> list; anything else -> NaN
        if isinstance(val, str):
            return json.loads(val)
        return np.nan

    # cfg_idx is held as a JSON string while update_df and drop_duplicates run
    f_sel['cfg_idx'] = f_sel['cfg_idx'].apply(_encode_idx)
    f_sel.dropna(subset=['cfg_idx'], inplace=True)
    update_df(f_sel, ts_name, cutoff_date)
    f_sel.drop_duplicates(inplace=True)
    f_sel['cfg_idx'] = f_sel['cfg_idx'].apply(_decode_idx)
    f_sel.dropna(subset=['cfg_idx'], inplace=True)

    out_dir = '~/my_tmp/cfg_sel/'

    def _out_path(prefix):
        return out_dir + prefix + ts_name + '_' + cutoff_date

    p_ut.save_df(f_sel, _out_path('cfg_sel_'))

    # select the best for each language (by p_col: optimal but unknown)
    def _min_rows(grp):
        return grp[grp[p_col] == grp[p_col].min()]

    f_best = f_sel.groupby('language').apply(_min_rows).reset_index(drop=True)
    best_cols = [
        'xform', 'topN', 'obj', 'yobj', 'cfg_idx', 'f_err', 'f_err_max',
        'qtile', 'language', 'ts_name', 'cutoff'
    ]
    f_best = f_best[best_cols].copy()
    p_ut.save_df(f_best, _out_path('cfg_best_'))

    idx_cols = ['ts_name', 'language', 'cfg_idx', p_col, 'cfg_str'] + cfg_cols
    f_idx = df[idx_cols].copy()
    f_idx['cutoff'] = cutoff_date
    p_ut.save_df(f_idx, _out_path('cfg_idx_'))

    s_ut.my_print('pid: ' + str(os.getpid()) + ': DONE')
Example #9
0
def main(argv):
    """Command-line entry point: train forecast configs for every language of a time series.

    ``argv`` must hold exactly two items, ``ts_name`` and ``cutoff_date`` (the
    program name is not included). For each language in ``ts_obj.df_dict`` and
    each value in ``train_days`` the forecast cfgs from ``dtp.get_f_cfgs`` are
    run through ``dtp.tf``. Results are saved per language under '~/my_tmp/',
    then combined, saved under '~/my_tmp/fcast_cfg/' and pushed to the hive
    table 'sup.fct_cx_forecast_config_v3'.

    A failed table push is logged (with its cause) and does not raise.
    """
    print('usage: python lang_fcast.py <ts_name> <cutoff_date>')
    print(argv)
    ts_name, cutoff_date = argv
    this_file = os.path.basename(__file__)
    cfg_dir = '/'.join(FILE_PATH.split('/')[:-1])
    cfg_file = os.path.join(cfg_dir, 'config/' + this_file[:-3] + '_cfg.json')

    # validate the data, prepare regressors, holidays DF
    ts_obj, reg_dict, cfg_dict, train_days = dtp.initialize(
        cfg_file, cutoff_date, ts_name, True, init_date='2016-01-01')
    upr_horizon, lwr_horizon = cfg_dict['upr_horizon_days'], cfg_dict[
        'lwr_horizon_days']
    if_exists = cfg_dict['if_exists']
    cutoff_date = ts_obj.cutoff_date

    out_list = list()
    cu = cutoff_date + pd.to_timedelta(
        upr_horizon, unit='D')  # actual cutoff date for training
    ds = str(cu.date())
    for l, t_df in ts_obj.df_dict.items():
        s_ut.my_print(
            '\n\n****************************** starting language: ' + str(l))
        if t_df is None:
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' WARNING: no training DF for ' + str(l))
            continue

        lang_list = list()
        for tdays in train_days:
            tlist = dtp.get_f_cfgs(t_df,
                                   l,
                                   cutoff_date,
                                   tdays,
                                   upr_horizon,
                                   cfg_dict,
                                   is_train=True)
            if tlist is None:
                s_ut.my_print('WARNING: language ' + str(l) +
                              ' and training cutoff date ' +
                              str(cutoff_date.date()) +
                              ' and training days ' + str(tdays) +
                              ' has NO fcast configs')
                continue

            arg_list = dtp.prophet_prep(ts_obj, l, reg_dict.get(l, None),
                                        cfg_dict, upr_horizon, lwr_horizon,
                                        tlist, True)
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' ************* forecasts for ' + str(l) +
                          ' with ' + str(tdays) + ' train days and ' +
                          str(len(arg_list)) + ' configs')
            f_list = s_ut.do_mp(dtp.tf,
                                arg_list,
                                is_mp=True,
                                cpus=len(arg_list),
                                do_sigkill=True)
            if f_list is None:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ': No results with ' + str(tdays) +
                              ' training days')
                f_list = list()
            else:
                s_ut.my_print('pid: ' + str(os.getpid()) + ': ' +
                              str(len(f_list)) + ' results with ' +
                              str(tdays) + ' training days')

            # save the fcast configs
            if len(f_list) == 0:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' WARNING: no DF for ' + str(l))
                continue

            s_ut.my_print('pid: ' + str(os.getpid()) + ' concatenating ' +
                          str(len(f_list)) + ' DFs for ' + str(l))
            l_df = pd.concat(f_list, axis=0)
            l_df['language'] = l
            s_ut.my_print('pid: ' + str(os.getpid()) + ' Language ' + str(l) +
                          ' has ' + str(len(l_df)) + ' fcast cfgs with ' +
                          str(tdays) + ' training days')
            l_df.reset_index(inplace=True, drop=True)
            l_df['ds'] = ds  # here we only save cfg's not fcasts. Use ds for partition
            l_df['ts_name'] = ts_name
            l_df['cutoff'] = ds
            lang_list.append(l_df)
            out_list.append(l_df)

        if len(lang_list) > 0:  # save language level results
            fl = pd.concat(lang_list, axis=0)
            p_ut.save_df(
                fl, '~/my_tmp/fcast_cfg_v2_' + ds + '_' + ts_name + '_' + l)

    # all training done for this TS. Save data
    if len(out_list) == 0:
        s_ut.my_print('ERROR: no output')
        return

    s_ut.my_print('pid: ' + str(os.getpid()) +
                  ' *************** saving training data ***********')
    df_all = pd.concat(out_list, axis=0)
    df_all.drop_duplicates(inplace=True)
    df_all.reset_index(inplace=True, drop=True)
    df_all['ds'] = ds  # here we only save cfg's not fcasts
    df_all['ts_name'] = ts_name
    df_all['cutoff'] = ds
    p_ut.save_df(df_all,
                 '~/my_tmp/fcast_cfg/fcast_cfg_v2_' + ds + '_' + ts_name)
    # the partition columns are passed separately, so they are not needed to push
    df_all.drop(['ds', 'ts_name'], inplace=True, axis=1)
    partition_ = {'ds': ds, 'ts_name': ts_name}
    table = 'sup.fct_cx_forecast_config_v3'
    try:  # only hive works with the partition argument
        with s_ut.suppress_stdout_stderr():
            import airpy as ap
        ap.hive.push(df_all,
                     table=table,
                     if_exists=if_exists,
                     partition=partition_,
                     table_props={
                         'abb_retention_days': '-1',
                         'abb_retention_days_reason': 'fact table. No pii'
                     })
        s_ut.my_print('data saved to table ' + table + ' for ' + ts_name +
                      ' and ds ' + ds)
        print('DONE')
    except Exception as e:
        # Best-effort push: report the cause instead of hiding it, and let
        # KeyboardInterrupt / SystemExit propagate.
        s_ut.my_print('ERROR: could not save to table ' + table + ' for ' +
                      ts_name + ': ' + str(e))