Esempio n. 1
0
def ens_fcast(ts_name, regs, cutoff_date, time_scale, fcast_days, init_date,
              a_df):
    """Build per-language regressor frames for the ensemble forecast of ts_name.

    For each regressor name in regs:
      * if its cfg has do_fcast True, load its stored ensemble forecast from
        sup.cx_ens_forecast for this cutoff and prepend its actuals up to
        cutoff_date so the regressor covers the full history;
      * otherwise treat it as a static regressor built by a module-level
        function with the same name as the regressor.
    The collected frames are merged per language (merge_regressors_) and then
    filtered per language through selector() against the actuals a_df.

    Returns a dict {language: regressor DataFrame} (languages whose selection
    returned None are dropped), or an empty dict when no regressor data is
    available.  Exits the process on an invalid regressor name or a missing
    stored forecast.
    """
    r_list = list()
    for rname in regs:
        # per-regressor config; None means the regressor name is unknown
        r_cfg, _ = dp.ts_setup(rname, cutoff_date, init_date, time_scale)
        if r_cfg is None:
            s_ut.my_print('ERROR: invalid regressor name: ' + rname)
            sys.exit()

        if r_cfg['do_fcast'] is True:
            # dynamic regressor: read its stored ensemble forecast for this cutoff
            qry = 'select * from sup.cx_ens_forecast where cutoff = \'' + str(
                cutoff_date.date()) + '\' and ts_name = \'' + rname + '\';'
            rdf = hql.from_tble(qry, ['ds'], use_cache=USE_CACHE, renew=RENEW)
            if rdf is None:  # no ens fcast file found
                s_ut.my_print('ERROR: no forecast for regressor: ' + rname)
                sys.exit()
            else:
                # forecast table may or may not be split by language
                cols = ['ds', 'language', 'yhat'
                        ] if 'language' in rdf.columns else ['ds', 'yhat']
                # future part from the stored forecast ...
                rdf = rdf[rdf['ds'] > cutoff_date][cols].copy()
                # ... prepended with actuals up to the cutoff
                # NOTE(review): assumes get_actuals() returns a 'language'
                # column whenever the forecast table has one — confirm
                adf = get_actuals(r_cfg, init_date='2016-01-01')
                adf = adf[adf['ds'] <= cutoff_date].copy()
                adf.rename(columns={r_cfg['ycol']: 'yhat'}, inplace=True)
                rdf = pd.concat([adf[cols].copy(), rdf], axis=0)
        else:  # static regressors
            s_ut.my_print(rname + ' is a static regressor')
            try:
                reg_func = getattr(
                    sys.modules[__name__],
                    rname)  # function to set up the static regressor
                args = [cutoff_date, init_date, fcast_days, time_scale]
                rdf = reg_func(*args)
            except AttributeError as e:
                # no module-level function matches the regressor name
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' No static regressor with name ' + rname +
                              ': ' + str(e))
                rdf = None

            if rdf is not None:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' found static regressor: ' + str(rname))
            else:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' WARNING: regressor ' + rname +
                              ' not found <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        if rdf is not None and len(rdf) > 0:
            # downstream code expects the regressor's value column named after it
            rdf.rename(columns={'yhat': rname}, inplace=True)
            r_list.append(rdf)
    if len(r_list) > 0:
        r_dict = merge_regressors_(r_list, init_date)  # {language: merged regressor frame}
        r_dict = {
            lx: selector(ts_name, lx, a_df[a_df['language'] == lx].copy(), rl,
                         cutoff_date)
            for lx, rl in r_dict.items()
        }
        # drop languages where selection failed
        s_dict = {lx: fl for lx, fl in r_dict.items() if fl is not None}
        return s_dict
    else:
        return dict()
Esempio n. 2
0
def get_year_ticket(yyyy, cutoff_date, ts_name):
    """Return actuals plus stored forecasts for ts_name over the iso-year yyyy.

    Actuals are restricted to [iso-year start, cutoff_date]; forecasts are
    read from sup.cx_weekly_forecasts for this cutoff, kept up to the
    iso-year end, and their 'yhat' column is renamed to ts_name.  The two
    frames are concatenated and returned.
    """
    cutoff_date = pd.to_datetime(cutoff_date)
    start, end = xl_ut.iso_dates(yyyy)

    # actuals for the series, tagged with the series name
    ts_cfg, _ = dp.ts_setup(ts_name, cutoff_date, pd.to_datetime('2016-01-01'), 'W')
    dim_cols = ['language', 'business_unit', 'channel', 'service_tier']
    actuals = dp.ts_actuals(ts_name, ts_cfg, dim_cols, drop_cols=False)
    actuals['ts_name'] = ts_name
    actuals['ds'] = pd.to_datetime(actuals['ds'].values)
    in_window = (actuals['ds'] >= start) & (actuals['ds'] <= cutoff_date)
    actuals = actuals[in_window].copy()

    # forecasts stored for this cutoff; hive prefixes columns with the table name
    qry = 'select * from sup.cx_weekly_forecasts where ts_name = \'' + ts_name + '\' and cutoff =\'' + str(cutoff_date.date()) + '\';'
    fcast = ap.hive.query(qry)
    fcast.columns = [c.split('.')[1] for c in fcast.columns]
    fcast['ds'] = pd.to_datetime(fcast['ds'].values)
    fcast = fcast[fcast['ds'] <= end].copy()
    fcast.rename(columns={'yhat': ts_name}, inplace=True)

    return pd.concat([actuals, fcast], axis=0)
Esempio n. 3
0
def basic_perf(ts_name, cutoff_date, upr, lwr, init_date='2016-01-01', time_scale='W'):
    """Score every forecast cfg of ts_name against actuals for a cutoff.

    Loads deduplicated actuals and per-config language forecasts, replaces the
    raw dim_cfg labels by a compact index, aligns actuals with the best
    per-language shift and computes per-(language, dim_cfg) performance.
    Intermediate frames are saved under ~/my_tmp for inspection.

    Returns (actuals_df, fout) where fout is the forecast frame merged with
    the per-config performance.
    """
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)

    # actuals; dedup because the source table contains duplicate rows
    adf = dp.ts_actuals(ts_name, ts_cfg, cols)
    adf.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)
    adf.drop_duplicates(inplace=True)

    # forecasts: swap the dim_cfg label for a compact integer index
    fdf = get_lang_fcast(ts_cfg, cutoff_date, eq=True).drop_duplicates()
    fdf = fdf.merge(fcast_idx(fdf), on='dim_cfg', how='left')
    fdf = fdf.drop('dim_cfg', axis=1).rename(columns={'index': 'dim_cfg'})
    merged = fdf.merge(adf, on=['ds', 'language'], how='left')
    merged = merged.drop('cutoff', axis=1).drop_duplicates()
    p_ut.save_df(merged, '~/my_tmp/df_all')

    # perf: find the best shift per language, then score each cfg
    cu_sun = cutoff_date - pd.to_timedelta(6, unit='D')
    sf = merged[['ds', 'language', 'y']].drop_duplicates().dropna()
    f_shift = sf.groupby('language').apply(get_shift, cu_sun=cu_sun, upr=upr, lwr=lwr).reset_index()  # find best shift
    nf = merged.merge(f_shift, on=['ds', 'language', 'y'], how='left')
    nf.set_index(['language', 'dim_cfg'], inplace=True)     # avoids drop of nuisance cols
    nf = nf.dropna().drop_duplicates()
    zperf = nf.groupby(['language', 'dim_cfg']).apply(cfg_perf, upr=upr, lwr=lwr, cu_sun=cu_sun).reset_index(drop=True)  # do not groupby df.index
    zperf = zperf.dropna()
    p_ut.save_df(zperf, '~/my_tmp/zperf_' + ts_name + '_' + str(cutoff_date.date()))
    nf = nf.reset_index()
    p_ut.save_df(nf, '~/my_tmp/nf')
    fout = nf.merge(zperf, on=list(nf.columns), how='left')
    return adf, fout
Esempio n. 4
0
def main(argv):
    """Compute detailed forecast performance for every ensemble model.

    argv: [prog, ts_name[, cutoff_date[, to_table]]] where cutoff_date parses
    with pd.to_datetime (default: today) and to_table is an int-like flag
    (default: False).  For each ensemble model the forecast is rebuilt and
    scored against actuals; results are saved to ~/my_tmp/perf/ and,
    when to_table is set, loaded to
    sup.cx_language_forecast_performance_detail.  Exits on invalid arguments
    or when no performance data is produced.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    upr = 12            # upper horizon bound (time-scale units)
    lwr = 8             # lower horizon bound (time-scale units)
    evals = 50          # hyperopt evaluations for the ensemble fit
    by_lang = False     # fit one ensemble across languages
    # ###########################
    # ###########################
    # ###########################
    print(argv)
    # CLI parsing: 1, 2 or 3 user args after the program name
    if len(argv[1:]) == 1:
        ts_name = argv[-1]
        cutoff_date = pd.to_datetime('today')
        to_table = False
    elif len(argv[1:]) == 2:
        ts_name, cutoff_date = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = False
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
                str(argv))
            sys.exit()
    elif len(argv[1:]) == 3:
        ts_name, cutoff_date, to_table = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
                str(argv))
            sys.exit()
    else:
        s_ut.my_print(
            'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
            str(argv))
        sys.exit()
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)

    # actuals
    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)
    actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)

    # forecasts
    f_df = fp.get_lang_fcast(ts_cfg, cutoff_date)
    fcast_date = cutoff_date + pd.to_timedelta(upr, unit=time_scale)

    # score each candidate ensemble model independently
    perf_list = list()
    for xens in [
            'XGBRegressor', 'AdaBoostRegressor', 'BaggingRegressor',
            'GradientBoostingRegressor', 'RandomForestRegressor',
            'ExtraTreesRegressor', 'lasso'
    ]:
        fcast_df = ep.make_fcast(ts_name,
                                 f_df,
                                 actuals_df,
                                 cutoff_date,
                                 fcast_date,
                                 xens,
                                 evals,
                                 by_lang, (lwr, upr),
                                 lwr=lwr,
                                 upr=upr)
        perf_df = perf.fcast_perf(fcast_df, actuals_df, cutoff_date, lwr, upr,
                                  time_scale, xens)
        if perf_df is None:
            # a failed ensemble is logged but does not abort the others
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' WARNING: forecast performance detail failed for ' +
                          ts_name + ' ,cutoff date ' +
                          str(cutoff_date.date()) + ' and ensemble: ' +
                          str(xens))
        else:
            perf_df['ts_name'] = ts_name
            perf_list.append(perf_df)
    if len(perf_list) > 0:
        pf = pd.concat(perf_list, axis=0)
        p_ut.save_df(
            pf, '~/my_tmp/perf/fcast_perf_detail_' + ts_name + '_' +
            str(cutoff_date.date()))
        if to_table is True:
            tab_cols = ['language', 'y', 'yhat', 'err', 'lwr', 'upr', 'ens']
            partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_name}
            ret = hql.to_tble(pf, tab_cols,
                              'sup.cx_language_forecast_performance_detail',
                              partition)
            if ret != 0:  # non-zero return signals a failed DB load
                s_ut.my_print(
                    'pid: ' + str(os.getpid()) +
                    ' ERROR: forecast performance detail failed for ' +
                    ts_name + ' and cutoff date ' + str(cutoff_date.date()))
                sys.exit()
        print('DONE')
    else:
        s_ut.my_print(
            'pid: ' + str(os.getpid()) +
            ' ERROR: no data for forecast performance detail failed for ' +
            ts_name + ' and cutoff date ' + str(cutoff_date.date()))
        sys.exit()
Esempio n. 5
0
        _, cutoff_date = sys.argv
        cutoff_date = pd.to_datetime(cutoff_date)
    else:
        print('invalid args: ' + str(sys.argv))
        sys.exit()

    # #########################
    actuals_weeks = 12
    target_year = 2020
    init_date = pd.to_datetime('2016-01-01')
    time_scale = 'W'
    ts_name = 'ticket_count'
    # #########################

    # get actuals
    ts_cfg, _ = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    adf = dp.ts_actuals(ts_name, ts_cfg, ['language', 'business_unit', 'channel', 'service_tier'], drop_cols=False, use_cache=False)
    start, end = tm_ut.iso_dates(target_year - 1)
    adf = adf[adf['ds'] >= start]   # use for YoY comparison
    adf.rename(columns={'ticket_count': 'y'}, inplace=True)
    p_ut.save_df(adf, '~/my_tmp/a_df')

    # current forecast
    s_ut.my_print('current forecast')
    cf_df = pd.concat([get_fcast(ts_name, ts_cfg, cutoff_date, bu, use_cache=False) for bu in ts_cfg['business_units']], axis=0)
    cf_df = cf_df[cf_df['ds'] > cutoff_date].copy()
    p_ut.save_df(cf_df, '~/my_tmp/cf_df')

    # target year summary
    xl_ut.year_summary(adf, cf_df, target_year, cutoff_date)
Esempio n. 6
0
def main(argv):
    """Run the per-language forecast pipeline for a time series.

    argv: [prog, ts_name[, run_date[, to_table]]] where run_date parses with
    pd.to_datetime (default: today) and to_table is an int-like flag
    (default: False).  The cutoff is the last Saturday at or before run_date.
    Builds the forecast cfg list, forecasts each language (in parallel unless
    is_test), saves the results under froot and optionally loads them to
    sup.cx_language_forecast / sup.cx_language_forecast_detail.
    Exits the process on invalid arguments, missing cfg entries or failed
    DB loads.

    Fixes vs previous version (log messages only, no behavior change):
      * 'ERROR"' typo corrected to 'ERROR:' in the fcast_days message;
      * missing separator between the pid and the cfg count in the
        "forecasts cfgs available" message.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = '~/my_tmp/fbp/'
    # ###########################
    # ###########################

    print(argv)
    # CLI parsing: 1, 2 or 3 user args after the program name
    if len(argv) == 2:
        ts_name = argv[-1]
        to_table = False
        run_date = pd.to_datetime('today')
    elif len(argv) == 3:
        ts_name, run_date = argv[-2:]
        try:
            run_date = pd.to_datetime(run_date)
            to_table = False
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, run_date, to_table): ' +
                str(argv))
            sys.exit()
    elif len(argv) == 4:
        ts_name, run_date, to_table = argv[1:]
        try:
            run_date = pd.to_datetime(run_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, run_date, to_table): ' +
                str(argv))
            sys.exit()
    else:
        s_ut.my_print(
            'ERROR: invalid arguments (ts_name, run_date, to_table): ' +
            str(argv))
        sys.exit()

    # data cfg
    cutoff_date = tm_ut.get_last_sat(
        run_date
    )  # set to last saturday before run_date or the run_date if a saturday
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    FCAST_DICT['outlier_coef'] = ts_cfg.get('outlier_coef', [3.0])

    fcast_days = ts_cfg.get('fcast_days', None)
    if fcast_days is None:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: fcast_days must be specified in data_cfg')
        sys.exit()
    else:
        fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    # express the horizon in time-scale units
    if time_scale == 'W':
        fcast_date = fcast_date - pd.to_timedelta(
            1 + fcast_date.weekday(), unit='D')  # set to week starting Sunday
        cu = cutoff_date - pd.to_timedelta(
            1 + cutoff_date.weekday(), unit='D')  # set to week starting Sunday
        fcast_days = (fcast_date - cu).days  # multiple of 7
        upr_horizon = int(fcast_days / 7)  # in time scale units
    elif time_scale == 'D':
        upr_horizon = int(fcast_days)  # in time scale units
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' invalid time scale: ' +
                      str(time_scale))
        sys.exit()

    s_ut.my_print('pid: ' + str(os.getpid()) +
                  ' ------------------------ start language forecast for ' +
                  str(ts_name) + ' from cutoff date ' +
                  str(cutoff_date.date()) + ' (excluded) to forecast date ' +
                  str(fcast_date.date()) +
                  '  (included) -----------------------')

    # get actuals
    actuals_df = dp.ts_actuals(
        ts_name, ts_cfg,
        cols)  # may have data past cutoff for accuracy checking
    if actuals_df['ds'].max() < cutoff_date:
        s_ut.my_print(
            'ERROR: no actuals available for forecast from cutoff date: ' +
            str(cutoff_date.date()))
        sys.exit()
    f_actuals_df = actuals_df[actuals_df['ds'] <= cutoff_date].copy(
    )  # actuals for forecast: only use up to cutoff date
Esempio n. 7
0
def main(argv):
    """Produce service-tier/channel ratio forecasts for a top-level series.

    argv: [prog, ts_name[, to_table]] where to_table is an int-like flag
    (default: False); ts_name must not be a BU series (Homes/Experiences/
    China).  Dates come from ratio_forecast_cfg.json: data_date is the cutoff
    the tables are read from; adjust_date (optional, defaults to data_date)
    is the cutoff written out.  Coherent language-level forecasts (hts.main)
    are adjusted, split by channel/service_tier ratios per bottom series,
    re-aggregated to the top series and saved to ~/my_tmp/fbp/ and,
    optionally, to sup.cx_weekly_forecasts.  Exits on invalid arguments,
    missing cfg file or failed DB loads.
    """
    print(argv)
    # CLI parsing: 1 or 2 user args after the program name
    if len(argv) == 2:
        ts_name = argv[-1]
        to_table = False
    elif len(argv) == 3:
        ts_name, to_table = argv[1:]
        try:
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print('ERROR: invalid arguments (ts_name, to_table): ' +
                          str(argv))
            sys.exit()
    else:
        s_ut.my_print('ERROR: invalid arguments (ts_name, to_table): ' +
                      str(argv))
        sys.exit()

    # only top-level series are allowed here
    if any([bu in ts_name for bu in ['Homes', 'Experiences', 'China']]):
        s_ut.my_print('ERROR: time series cannot be a BU time series: ' +
                      str(ts_name))
        sys.exit()

    # run configuration (dates and ratio windows) from a JSON file
    data_cfg = os.path.expanduser(
        '~/my_repos/capacity_planning/forecast/config/ratio_forecast_cfg.json')
    if os.path.isfile(data_cfg):
        with open(data_cfg, 'r') as fptr:
            rf_dict = json.load(fptr)
    else:
        s_ut.my_print('ERROR: ' + data_cfg + ' file not found')
        sys.exit()

    d_date = rf_dict.get('data_date', None)
    if d_date is None:
        s_ut.my_print('ERROR: data_date cannot be null')
        sys.exit()
    data_date = pd.to_datetime(
        d_date)  # this is the cutoff date we get data from tables

    a_date = rf_dict.get(
        'adjust_date',
        None)  # if None, nothing to adjust and adj_date = data_date
    adjust_date = data_date if a_date is None else pd.to_datetime(
        a_date)  # this is the actual cutoff date

    # ratio windows: default to the 6 weeks ending at adjust_date
    window = rf_dict.get('ratio_windows', dict())
    if len(window) == 0:  # not set
        window = {
            'default': {
                'start': adjust_date - pd.to_timedelta(6, unit='W'),
                'end': adjust_date
            }
        }
    else:
        # parse the configured window boundaries into timestamps
        for k, v in window.items():
            for kk, vv in v.items():
                v[kk] = pd.to_datetime(vv)

    s_ut.my_print('************************* read table date: ' +
                  str(data_date.date()) +
                  ' ********************************************')
    s_ut.my_print('************************* write table date: ' +
                  str(adjust_date.date()) +
                  ' *******************************************')

    # ###############################
    # ###############################
    time_scale = 'W'
    init_date = pd.to_datetime('2016-01-01')
    # ###############################
    # ###############################

    df_tilde, bottom_ts = hts.main(
        ts_name, data_date, do_cov=True
    )  # coherent forecasts at language level + language level adjustments
    f_df = adj.main(
        df_tilde, 'language', bottom_ts, ts_name, adjust_date
    )  # must adjust at language level before service level ratios
    fr_list, fr_cols = list(), list()
    a_list = list()
    ts_list = bottom_ts  # ratios only on bottom_ts then aggregate to top TS
    for ts in ts_list:
        s_ut.my_print('============= starting ' + str(ts))
        ts_cfg, _ = dp.ts_setup(ts, data_date, init_date, time_scale)
        a_df = dp.ts_actuals(
            ts,
            ts_cfg, ['language', 'business_unit', 'channel', 'service_tier'],
            drop_cols=False)
        b_df = filter_actuals(a_df, window)
        # ratios of the bottom series forecast across channel/service_tier
        fr = ts_ratio(ts, b_df.copy(), f_df[['ds', 'language',
                                             ts + '_tilde']].copy(), window,
                      data_date)
        fr_list.append(fr)
        fr_cols.append(fr.columns)
        a_list.append(b_df)
        # sanity checks on the computed ratios, both directions and both dims
        check_ratios(ts, b_df, fr, True, 'service_tier')
        check_ratios(ts, b_df, fr, False, 'service_tier')
        check_ratios(ts, b_df, fr, True, 'channel')
        check_ratios(ts, b_df, fr, False, 'channel')

    # must adjust together to ensure coherence
    # NOTE(review): if fr_list were empty, fr would be None and fillna below
    # would raise AttributeError — relies on bottom_ts being non-empty
    fr = reduce(
        lambda x, y: x.merge(
            y, on=['ds', 'language', 'channel', 'service_tier'], how='outer'),
        fr_list) if len(fr_list) > 0 else None
    fr.fillna(0, inplace=True)
    for k_col in ['channel',
                  'service_tier']:  # language adj must be done before ratios
        fr = adj.main(fr, k_col, bottom_ts, ts_name, adjust_date)

    # save data
    f_list = list()
    for idx in range(len(ts_list)):
        ts = ts_list[idx]
        fx = fr[fr_cols[idx]].copy()
        fx.rename(columns={ts + '_tilde': 'yhat'}, inplace=True)
        fx['yhat'] = np.round(
            fx['yhat'].values,
            0)  # this makes input totals and output totals to be a bit off
        fx = fx[fx['yhat'] > 0]
        fx['ts_name'] = ts
        fx['cutoff'] = adjust_date
        f_list.append(fx)

    # get the aggregate series
    fall = pd.concat(f_list, axis=0)
    gall = fall.groupby(['ds', 'language', 'channel',
                         'service_tier']).sum(numeric_only=True).reset_index()
    gall['cutoff'] = adjust_date
    gall['ts_name'] = ts_name

    # align cols (ap gets confused otherwise?)
    tcols = [
        'ds', 'language', 'channel', 'service_tier', 'yhat', 'ts_name',
        'cutoff'
    ]
    fall = fall[tcols].copy()
    gall = gall[tcols].copy()

    # final DF to save
    fout = pd.concat([gall, fall], axis=0)
    p_ut.save_df(
        fout,
        '~/my_tmp/fbp/ratios_fcast_' + ts_name + '_' + str(adjust_date.date()))
    # also snapshot ticket_count actuals for reference
    ts_cfg, _ = dp.ts_setup('ticket_count', data_date, init_date, time_scale)
    a_df = dp.ts_actuals(
        'ticket_count',
        ts_cfg, ['language', 'business_unit', 'channel', 'service_tier'],
        drop_cols=False)
    p_ut.save_df(a_df, '~/my_tmp/a_df_ticket_count_' + str(adjust_date.date()))

    # data summary
    s_ut.my_print('**************** Data Summary *******************')
    for c in ['language', 'channel', 'service_tier']:
        s_ut.my_print('unique ' + c + ': ' + str(fout[c].unique()))

    # save to DB
    if to_table is True:
        # table columns: swap cutoff/ts_name (partition keys) for week bounds
        tcols.remove('cutoff')
        tcols.remove('ts_name')
        tcols.insert(1, 'ds_week_starting')
        tcols.insert(2, 'fcst_date_inv_ending')  # ds_week_ending
        # NOTE(review): compares a datetime column to a date object —
        # presumably coerced by pandas; confirm on the pandas version in use
        fout = fout[fout['ds'] > adjust_date.date()].copy(
        )  # only save forecasted values
        fout['ds_week_starting'] = fout['ds']
        fout['fcst_date_inv_ending'] = fout['ds'] + pd.to_timedelta(6,
                                                                    unit='D')
        for ts in fout['ts_name'].unique():
            partition = {'cutoff': str(adjust_date.date()), 'ts_name': ts}
            tb_df = fout[fout['ts_name'] == ts].copy()
            ret = hql.to_tble(tb_df, tcols, 'sup.cx_weekly_forecasts',
                              partition)
            if ret != 0:  # non-zero return signals a failed DB load
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: no forecasts loaded to table for ' +
                              ts_name + ' and cutoff date ' +
                              str(adjust_date.date()))
                sys.exit()
            else:
                s_ut.my_print(
                    '>>>>>>>>>>>>>>> SUCCESS: data saved to table <<<<<<<<<<<<<<<<<<<'
                )
    else:
        s_ut.my_print(
            '>>>>>>>>>>>>>>> WARNING: no data saved to table <<<<<<<<<<<<<<<<<<<'
        )
Esempio n. 8
0
def main(argv):
    """Combine the per-config language forecasts into one ensemble forecast.

    argv: [prog, ts_name, cutoff_date[, to_table]] where cutoff_date parses
    with pd.to_datetime and to_table is an int-like flag (default: False).
    Loads actuals and per-config forecasts, picks the ensemble model for this
    cutoff (get_ens), builds the combined forecast, saves it under froot and,
    when to_table is set, loads it to sup.cx_ens_forecast.  Exits on invalid
    arguments, missing fcast_days or a failed DB load.

    Fix vs previous version (log message only, no behavior change):
    'ERROR"' typo corrected to 'ERROR:' in the fcast_days message.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = os.path.expanduser('~/my_tmp/fbp/')
    evals = 250        # hyperopt evaluations for the ensemble fit
    by_lang = False    # fit one ensemble across languages
    lwr, upr = 9, 12   # horizon bounds in time-scale units
    # ###########################
    # ###########################

    print(argv)
    # CLI parsing: 2 or 3 user args after the program name
    if len(argv) == 3:
        ts_name, cutoff_date = argv[-2:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = False
        except ValueError:
            s_ut.my_print('ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' + str(argv))
            sys.exit()
    elif len(argv) == 4:
        ts_name, cutoff_date, to_table = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print('ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' + str(argv))
            sys.exit()
    else:
        s_ut.my_print('ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' + str(argv))
        sys.exit()

    # data cfg
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    fcast_days = ts_cfg.get('fcast_days', None)
    if fcast_days is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: fcast_days must be specified in data_cfg')
        sys.exit()
    else:
        fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    if time_scale == 'W' and fcast_date.weekday() != 6:                 # set fcast date to week starting Sunday unless it is a Sunday already
        fcast_date = fcast_date - pd.to_timedelta(1 + fcast_date.weekday(), unit='D')

    s_ut.my_print('pid: ' + str(os.getpid()) + ' ------------------------ start ens forecast for ' + str(ts_name) + ' from cutoff date '
                  + str(cutoff_date.date()) + ' (excluded) to forecast date ' + str(fcast_date.date()) + '  (included) -----------------------')

    a_df = dp.ts_actuals(ts_name, ts_cfg, cols)                                  # get actuals
    a_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)
    fcast_df = fp.get_lang_fcast(ts_cfg, cutoff_date)      # get fcasts

    if fcast_df is not None and a_df is not None:
        s_ut.my_print(ts_name + ': combining ' + str(fcast_df['dim_cfg'].nunique()) + ' forecast configs')
        xens_ = get_ens(ts_name, cutoff_date)  # ts_cfg['ens'].get(str(cutoff_date.month), ens_dict['default'])

        s_ut.my_print('aggregation for ' + ts_name + ' done with ' + xens_)
        ts_fcast = ep.make_fcast(ts_name, fcast_df, a_df, cutoff_date, fcast_date, xens_, evals, by_lang, (lwr, upr), lwr=lwr, upr=upr)
        ts_fcast['fcast_date'] = fcast_date

        p_ut.save_df(ts_fcast, froot + 'ens_fcast_' + ts_name + '_' + str(cutoff_date.date()))
        if to_table is True:
            cols = ['ds', 'language', 'ens', 'yhat']
            partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_cfg['ts_key']}
            ret = hql.to_tble(ts_fcast, cols, 'sup.cx_ens_forecast', partition)
            if ret != 0:  # non-zero return signals a failed DB load
                s_ut.my_print('ERROR: DB write for ' + ts_name + ' ens forecast ' + ' at ' + str(cutoff_date.date()) + ' failed')
                sys.exit()
        print('DONE')
    else:
        s_ut.my_print('ERROR: no actuals or no data for errors of ' + ts_name + ' at ' + str(cutoff_date.date()))
Esempio n. 9
0
def main(argv):
    """Compute forecast performance for the stored ensemble forecast.

    argv: [prog, ts_name[, cutoff_date[, to_table]]] where cutoff_date parses
    with pd.to_datetime (default: today) and to_table is an int-like flag
    (default: False).  Scores the ensemble forecast against actuals, saves
    the result to ~/my_tmp/perf/ and, when to_table is set, loads it to
    sup.cx_language_forecast_performance.  Exits on invalid arguments or a
    failed DB load; returns silently when no performance data is produced.

    Bug fix vs previous version: with a single CLI argument, ts_name was set
    to the slice argv[-2:] (a list), which later broke string concatenation;
    it is now argv[-1] as in the other entry points.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    upr = 12  # upper horizon bound (time-scale units)
    lwr = 8   # lower horizon bound (time-scale units)
    # ###########################
    # ###########################
    # ###########################
    print(argv)
    # CLI parsing: 1, 2 or 3 user args after the program name
    if len(argv) == 2:
        ts_name = argv[-1]  # was argv[-2:], which yields a list, not a string
        cutoff_date = pd.to_datetime('today')
        to_table = False
    elif len(argv) == 3:
        ts_name, cutoff_date = argv[-2:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = False
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
                str(argv))
            sys.exit()
    elif len(argv) == 4:
        ts_name, cutoff_date, to_table = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
                str(argv))
            sys.exit()
    else:
        s_ut.my_print(
            'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
            str(argv))
        sys.exit()
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)

    # actuals and the stored ensemble forecast
    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)
    actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)
    fcast_df = fp.get_ens_fcast(ts_name, ts_cfg, cutoff_date)
    ens = fcast_df.loc[fcast_df.index[0], 'ens']  # ensemble model used
    f_df = ep.fcast_filter(fcast_df, actuals_df, ts_name,
                           cutoff_date + pd.to_timedelta(upr, unit=time_scale),
                           cutoff_date, time_scale)
    pf = fcast_perf(f_df, actuals_df, cutoff_date, lwr, upr, time_scale, ens)
    if pf is None:
        return
    else:
        pf['ts_name'] = ts_name
        p_ut.save_df(
            pf, '~/my_tmp/perf/fcast_perf_' + ts_name + '_' +
            str(cutoff_date.date()))
        if to_table is True:
            tab_cols = ['language', 'y', 'yhat', 'err', 'lwr', 'upr', 'ens']
            partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_name}
            ret = hql.to_tble(pf, tab_cols,
                              'sup.cx_language_forecast_performance',
                              partition)
            if ret != 0:  # non-zero return signals a failed DB load
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: forecast performance failed for ' +
                              ts_name + ' and cutoff date ' +
                              str(cutoff_date.date()))
                sys.exit()
    print('DONE')