Esempio n. 1
0
def main(argv):
    """Score each candidate ensemble regressor for one time series.

    Args:
        argv: ``[prog, ts_name]``, ``[prog, ts_name, cutoff_date]`` or
            ``[prog, ts_name, cutoff_date, to_table]``. ``cutoff_date``
            defaults to today; ``to_table`` is read as ``bool(int(...))`` and
            defaults to ``False``.

    For every ensemble name a forecast is built with ``ep.make_fcast`` and
    scored with ``perf.fcast_perf``. The non-empty results are concatenated,
    saved with ``p_ut.save_df`` and, when ``to_table`` is true, loaded into
    ``sup.cx_language_forecast_performance_detail``. The process exits through
    ``sys.exit()`` on bad arguments, on a failed table load, or when no
    ensemble produced a result.
    """
    # ---- fixed run parameters ----
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    upr = 12
    lwr = 8
    evals = 50
    by_lang = False
    ensembles = (
        'XGBRegressor',
        'AdaBoostRegressor',
        'BaggingRegressor',
        'GradientBoostingRegressor',
        'RandomForestRegressor',
        'ExtraTreesRegressor',
        'lasso',
    )

    # ---- command line ----
    print(argv)
    bad_args_msg = (
        'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
        str(argv))
    cli_args = argv[1:]
    n_args = len(cli_args)
    if n_args < 1 or n_args > 3:
        s_ut.my_print(bad_args_msg)
        sys.exit()

    ts_name = cli_args[0]
    to_table = False
    if n_args == 1:
        cutoff_date = pd.to_datetime('today')
    else:
        try:
            cutoff_date = pd.to_datetime(cli_args[1])
            if n_args == 3:
                to_table = bool(int(cli_args[2]))
        except ValueError:
            s_ut.my_print(bad_args_msg)
            sys.exit()

    # ---- inputs: configuration, actuals and language forecasts ----
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)
    actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)
    f_df = fp.get_lang_fcast(ts_cfg, cutoff_date)
    fcast_date = cutoff_date + pd.to_timedelta(upr, unit=time_scale)

    # ---- one performance frame per ensemble ----
    perf_frames = []
    for ens_name in ensembles:
        ens_fcast = ep.make_fcast(
            ts_name, f_df, actuals_df, cutoff_date, fcast_date, ens_name,
            evals, by_lang, (lwr, upr), lwr=lwr, upr=upr)
        ens_perf = perf.fcast_perf(
            ens_fcast, actuals_df, cutoff_date, lwr, upr, time_scale, ens_name)
        if ens_perf is None:
            warning = 'pid: ' + str(os.getpid())
            warning += ' WARNING: forecast performance detail failed for '
            warning += ts_name + ' ,cutoff date ' + str(cutoff_date.date())
            warning += ' and ensemble: ' + str(ens_name)
            s_ut.my_print(warning)
            continue
        ens_perf['ts_name'] = ts_name
        perf_frames.append(ens_perf)

    # ---- nothing scored: report and stop ----
    if not perf_frames:
        failure = 'pid: ' + str(os.getpid())
        failure += ' ERROR: no data for forecast performance detail failed for '
        failure += ts_name + ' and cutoff date ' + str(cutoff_date.date())
        s_ut.my_print(failure)
        sys.exit()

    # ---- persist ----
    pf = pd.concat(perf_frames, axis=0)
    out_path = '~/my_tmp/perf/fcast_perf_detail_' + ts_name + '_'
    out_path += str(cutoff_date.date())
    p_ut.save_df(pf, out_path)

    if to_table is True:
        tab_cols = ['language', 'y', 'yhat', 'err', 'lwr', 'upr', 'ens']
        partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_name}
        ret = hql.to_tble(
            pf, tab_cols, 'sup.cx_language_forecast_performance_detail',
            partition)
        if ret != 0:
            load_err = 'pid: ' + str(os.getpid())
            load_err += ' ERROR: forecast performance detail failed for '
            load_err += ts_name + ' and cutoff date '
            load_err += str(cutoff_date.date())
            s_ut.my_print(load_err)
            sys.exit()
    print('DONE')
Esempio n. 2
0
def main(argv):
    """Run every configured language forecast for one time series and save it.

    Args:
        argv: ``[prog, ts_name]``, ``[prog, ts_name, run_date]`` or
            ``[prog, ts_name, run_date, to_table]``. ``run_date`` defaults to
            today; ``to_table`` is parsed with ``bool(int(...))`` and defaults
            to ``False``.

    Side effects:
        Mutates the module-level ``FCAST_DICT`` (``outlier_coef``, ``do_res``
        and possibly ``r_mode``), writes two data frames under
        ``~/my_tmp/fbp/`` through ``p_ut.save_df`` and, when ``to_table`` is
        true, loads them into ``sup.cx_language_forecast`` and
        ``sup.cx_language_forecast_detail``. Calls ``sys.exit()`` on invalid
        arguments, missing ``fcast_days``, missing actuals or a failed load.

    Relies on the module-level names ``FCAST_DICT``, ``CFG_COLS``,
    ``is_test``, ``get_f_cfg``, ``fcast_lang`` and ``set_cfg``.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = '~/my_tmp/fbp/'

    print(argv)
    usage_err = ('ERROR: invalid arguments (ts_name, run_date, to_table): ' +
                 str(argv))
    if len(argv) == 2:
        ts_name = argv[-1]
        to_table = False
        run_date = pd.to_datetime('today')
    elif len(argv) == 3:
        ts_name, run_date = argv[-2:]
        try:
            run_date = pd.to_datetime(run_date)
            to_table = False
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()
    elif len(argv) == 4:
        ts_name, run_date, to_table = argv[1:]
        try:
            run_date = pd.to_datetime(run_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()
    else:
        s_ut.my_print(usage_err)
        sys.exit()

    # data cfg
    # set to last saturday before run_date or the run_date if a saturday
    cutoff_date = tm_ut.get_last_sat(run_date)
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    FCAST_DICT['outlier_coef'] = ts_cfg.get('outlier_coef', [3.0])

    fcast_days = ts_cfg.get('fcast_days', None)
    if fcast_days is None:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: fcast_days must be specified in data_cfg')
        sys.exit()
    else:
        fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    if time_scale == 'W':
        # set both dates to week starting Sunday
        fcast_date = fcast_date - pd.to_timedelta(
            1 + fcast_date.weekday(), unit='D')
        cu = cutoff_date - pd.to_timedelta(
            1 + cutoff_date.weekday(), unit='D')
        fcast_days = (fcast_date - cu).days  # multiple of 7
        upr_horizon = int(fcast_days / 7)  # in time scale units
    elif time_scale == 'D':
        upr_horizon = int(fcast_days)  # in time scale units
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' invalid time scale: ' +
                      str(time_scale))
        sys.exit()

    s_ut.my_print('pid: ' + str(os.getpid()) +
                  ' ------------------------ start language forecast for ' +
                  str(ts_name) + ' from cutoff date ' +
                  str(cutoff_date.date()) + ' (excluded) to forecast date ' +
                  str(fcast_date.date()) +
                  '  (included) -----------------------')

    # get actuals (may have data past cutoff for accuracy checking)
    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)
    if actuals_df['ds'].max() < cutoff_date:
        s_ut.my_print(
            'ERROR: no actuals available for forecast from cutoff date: ' +
            str(cutoff_date.date()))
        sys.exit()
    # actuals for forecast: only use up to cutoff date
    f_actuals_df = actuals_df[actuals_df['ds'] <= cutoff_date].copy()

    # adjust FCAST_DICT
    if len(FCAST_DICT['do_res']) == 2:  # True, False
        # MUST overwrite: the False case is always included and otherwise we
        # double count.
        FCAST_DICT['do_res'] = [True]
    if len(ts_cfg.get('regressors', [])) == 0:
        FCAST_DICT['r_mode'] = [None]
        reg_dict = dict()
    else:
        # stored by cutoff date on last Sat of the month
        reg_dict = regs.ens_fcast(
            ts_name, ts_cfg['regressors'], cutoff_date, time_scale, fcast_days,
            init_date, f_actuals_df)

    # update init_date: latest of the earliest dates across actuals and
    # regressors, then trim everything to start there
    init_date = max([f_actuals_df['ds'].min()] +
                    [f['ds'].min() for f in reg_dict.values()])
    f_actuals_df = f_actuals_df[f_actuals_df['ds'] >= init_date].copy()
    reg_dict = {
        lx: f[f['ds'] >= init_date].copy()
        for lx, f in reg_dict.items()
    }
    ts_cfg['init_date'] = init_date

    # set the list of fcast cfgs
    tlist = get_f_cfg(FCAST_DICT, cutoff_date, init_date, time_scale)
    fix_pars = [
        f_actuals_df, ts_name, reg_dict, fcast_date, cutoff_date, ts_cfg,
        time_scale, upr_horizon
    ]
    # 2 fcasts are done per input cfg (do_res = true and do_res = false)
    arg_list = [fix_pars + [f_cfg] for f_cfg in tlist]
    n_fcfg = 2 * len(arg_list)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ++++++++ there are ' +
                  str(n_fcfg) + ' fcast configs per language **********')

    # run the forecasts: single process when testing, multiprocess otherwise
    df_list_ = s_ut.do_mp(fcast_lang,
                          arg_list,
                          is_mp=not is_test,
                          cpus=None,
                          do_sigkill=True)

    # join all the fcasted data into a flat list
    df_list = [f for f in df_list_ if f is not None]
    if len(df_list) > 0:
        ylist, alist = list(), list()
        for fl in df_list:
            fl = set_cfg(fl.copy(), CFG_COLS)
            ylist.append(fl[[
                'ds', 'language', 'yhat', 'ts_name', 'cutoff', 'dim_cfg',
                'fcast_date'
            ]].copy())
            alist.append(fl)

        # save basic fcast data
        # now all the list elements have the same columns
        fcast_df = pd.concat(ylist, axis=0)
        fcast_df.reset_index(inplace=True, drop=True)

        ok_cfg = fcast_df['dim_cfg'].unique()
        # the explicit ' ' keeps the pid and the config count from running
        # together in the log line
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ' + str(len(ok_cfg)) +
                      ' forecasts cfgs available for ' + str(ts_name) +
                      ' from cutoff date ' + str(cutoff_date.date()) +
                      ' (excluded) to forecast date ' +
                      str(fcast_date.date()) +
                      '  (included) -----------------------')
        fname = froot + 'lang_fcast_'
        p_ut.save_df(fcast_df, fname + ts_name + '_' + str(cutoff_date.date()))
        if to_table is True:
            tab_cols = ['ds', 'language', 'dim_cfg', 'yhat']
            partition = {
                'cutoff': str(cutoff_date.date()),
                'ts_name': ts_cfg['ts_key']
            }
            ret = hql.to_tble(fcast_df, tab_cols, 'sup.cx_language_forecast',
                              partition)
            if ret != 0:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: no forecasts loaded to table for ' +
                              str(ts_cfg['ts_key']) + ' and cutoff date ' +
                              str(cutoff_date.date()))
                sys.exit()

        # save all fcast data  (y_upr, y_lwr, ...)
        # now all the list elements have the same columns
        all_df = pd.concat(alist, axis=0)
        # dict.fromkeys de-duplicates like set() did but keeps the original
        # column order, so the saved and melted output is deterministic
        all_cols = list(
            dict.fromkeys(c for c in all_df.columns if c not in CFG_COLS))
        all_df.reset_index(inplace=True, drop=True)
        all_df = all_df[all_cols].copy()
        all_df = all_df[all_df['dim_cfg'].isin(ok_cfg)].copy()
        fname = froot + 'fcast_all_'
        p_ut.save_df(all_df, fname + ts_name + '_' + str(cutoff_date.date()))
        if to_table is True:
            all_df.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
            # long format: one (key, value) row per remaining column
            mf = pd.melt(all_df,
                         id_vars=['ds', 'language', 'dim_cfg'],
                         var_name='key',
                         value_name='value')
            mf.dropna(subset=['value'], inplace=True)
            mf = mf[mf['value'] != 0.0].copy()
            partition = {
                'cutoff': str(cutoff_date.date()),
                'ts_name': ts_cfg['ts_key']
            }
            ret = hql.to_tble(mf, list(mf.columns),
                              'sup.cx_language_forecast_detail', partition)
            if ret != 0:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: no forecasts loaded to table for ' +
                              str(ts_cfg['ts_key']) + ' and cutoff date ' +
                              str(cutoff_date.date()))
                sys.exit()
        print('DONE')
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: no forecasts available for ' +
                      str(ts_cfg['ts_key']) + ' from cutoff date ' +
                      str(cutoff_date.date()) +
                      ' (excluded) to forecast date ' +
                      str(fcast_date.date()) +
                      '  (included) -----------------------')
Esempio n. 3
0
def main(argv):
    """Split language-level forecasts by channel/service tier and save them.

    Args:
        argv: ``[prog, ts_name]`` or ``[prog, ts_name, to_table]``, where
            ``to_table`` is parsed with ``bool(int(...))`` and defaults to
            ``False``.

    Dates and ratio windows are read from
    ``~/my_repos/capacity_planning/forecast/config/ratio_forecast_cfg.json``
    (keys ``data_date``, ``adjust_date``, ``ratio_windows``). The combined
    output is saved with ``p_ut.save_df`` and, when ``to_table`` is true,
    loaded into ``sup.cx_weekly_forecasts`` one ``ts_name`` partition at a
    time. The process exits through ``sys.exit()`` on invalid arguments, a
    business-unit series name, a missing config file or ``data_date``, an
    empty list of bottom series, or a failed table load.

    Relies on the module-level helpers ``filter_actuals``, ``ts_ratio`` and
    ``check_ratios``.
    """
    print(argv)
    usage_err = 'ERROR: invalid arguments (ts_name, to_table): ' + str(argv)
    if len(argv) == 2:
        ts_name = argv[-1]
        to_table = False
    elif len(argv) == 3:
        ts_name, to_table = argv[1:]
        try:
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()
    else:
        s_ut.my_print(usage_err)
        sys.exit()

    if any(bu in ts_name for bu in ['Homes', 'Experiences', 'China']):
        s_ut.my_print('ERROR: time series cannot be a BU time series: ' +
                      str(ts_name))
        sys.exit()

    data_cfg = os.path.expanduser(
        '~/my_repos/capacity_planning/forecast/config/ratio_forecast_cfg.json')
    if os.path.isfile(data_cfg):
        with open(data_cfg, 'r') as fptr:
            rf_dict = json.load(fptr)
    else:
        s_ut.my_print('ERROR: ' + data_cfg + ' file not found')
        sys.exit()

    d_date = rf_dict.get('data_date', None)
    if d_date is None:
        s_ut.my_print('ERROR: data_date cannot be null')
        sys.exit()
    # this is the cutoff date we get data from tables
    data_date = pd.to_datetime(d_date)

    # if None, nothing to adjust and adj_date = data_date
    a_date = rf_dict.get('adjust_date', None)
    # this is the actual cutoff date
    adjust_date = data_date if a_date is None else pd.to_datetime(a_date)

    window = rf_dict.get('ratio_windows', dict())
    if len(window) == 0:  # not set: default to the 6 weeks up to adjust_date
        window = {
            'default': {
                'start': adjust_date - pd.to_timedelta(6, unit='W'),
                'end': adjust_date
            }
        }
    else:
        # convert every configured window boundary to a timestamp, in place
        for v in window.values():
            for kk, vv in v.items():
                v[kk] = pd.to_datetime(vv)

    s_ut.my_print('************************* read table date: ' +
                  str(data_date.date()) +
                  ' ********************************************')
    s_ut.my_print('************************* write table date: ' +
                  str(adjust_date.date()) +
                  ' *******************************************')

    # ###############################
    time_scale = 'W'
    init_date = pd.to_datetime('2016-01-01')
    # ###############################

    # coherent forecasts at language level + language level adjustments
    df_tilde, bottom_ts = hts.main(ts_name, data_date, do_cov=True)
    # must adjust at language level before service level ratios
    f_df = adj.main(df_tilde, 'language', bottom_ts, ts_name, adjust_date)
    fr_list, fr_cols = list(), list()
    a_list = list()
    ts_list = bottom_ts  # ratios only on bottom_ts then aggregate to top TS
    for ts in ts_list:
        s_ut.my_print('============= starting ' + str(ts))
        ts_cfg, _ = dp.ts_setup(ts, data_date, init_date, time_scale)
        a_df = dp.ts_actuals(
            ts,
            ts_cfg, ['language', 'business_unit', 'channel', 'service_tier'],
            drop_cols=False)
        b_df = filter_actuals(a_df, window)
        fr = ts_ratio(ts, b_df.copy(),
                      f_df[['ds', 'language', ts + '_tilde']].copy(), window,
                      data_date)
        fr_list.append(fr)
        fr_cols.append(fr.columns)
        a_list.append(b_df)
        check_ratios(ts, b_df, fr, True, 'service_tier')
        check_ratios(ts, b_df, fr, False, 'service_tier')
        check_ratios(ts, b_df, fr, True, 'channel')
        check_ratios(ts, b_df, fr, False, 'channel')

    # Without any bottom series there is nothing to merge; stop with a clear
    # message instead of failing later on a None frame.
    if len(fr_list) == 0:
        s_ut.my_print('ERROR: no bottom time series available for ' +
                      str(ts_name))
        sys.exit()

    # must adjust together to ensure coherence
    fr = reduce(
        lambda x, y: x.merge(
            y, on=['ds', 'language', 'channel', 'service_tier'], how='outer'),
        fr_list)
    fr.fillna(0, inplace=True)
    # language adj must be done before ratios
    for k_col in ['channel', 'service_tier']:
        fr = adj.main(fr, k_col, bottom_ts, ts_name, adjust_date)

    # save data
    f_list = list()
    for ts, ts_cols in zip(ts_list, fr_cols):
        fx = fr[ts_cols].copy()
        fx.rename(columns={ts + '_tilde': 'yhat'}, inplace=True)
        # this makes input totals and output totals to be a bit off
        fx['yhat'] = np.round(fx['yhat'].values, 0)
        # copy so the column assignments below act on an independent frame
        fx = fx[fx['yhat'] > 0].copy()
        fx['ts_name'] = ts
        fx['cutoff'] = adjust_date
        f_list.append(fx)

    # get the aggregate series
    fall = pd.concat(f_list, axis=0)
    gall = fall.groupby(['ds', 'language', 'channel',
                         'service_tier']).sum(numeric_only=True).reset_index()
    gall['cutoff'] = adjust_date
    gall['ts_name'] = ts_name

    # align cols (ap gets confused otherwise?)
    tcols = [
        'ds', 'language', 'channel', 'service_tier', 'yhat', 'ts_name',
        'cutoff'
    ]
    fall = fall[tcols].copy()
    gall = gall[tcols].copy()

    # final DF to save
    fout = pd.concat([gall, fall], axis=0)
    p_ut.save_df(
        fout,
        '~/my_tmp/fbp/ratios_fcast_' + ts_name + '_' + str(adjust_date.date()))
    ts_cfg, _ = dp.ts_setup('ticket_count', data_date, init_date, time_scale)
    a_df = dp.ts_actuals(
        'ticket_count',
        ts_cfg, ['language', 'business_unit', 'channel', 'service_tier'],
        drop_cols=False)
    p_ut.save_df(a_df, '~/my_tmp/a_df_ticket_count_' + str(adjust_date.date()))

    # data summary
    s_ut.my_print('**************** Data Summary *******************')
    for c in ['language', 'channel', 'service_tier']:
        s_ut.my_print('unique ' + c + ': ' + str(fout[c].unique()))

    # save to DB
    if to_table is True:
        # table layout: ds, ds_week_starting, fcst_date_inv_ending, then the
        # dimension columns and yhat; cutoff/ts_name go in the partition
        tcols.remove('cutoff')
        tcols.remove('ts_name')
        tcols.insert(1, 'ds_week_starting')
        tcols.insert(2, 'fcst_date_inv_ending')  # ds_week_ending
        # only save forecasted values
        fout = fout[fout['ds'] > adjust_date.date()].copy()
        fout['ds_week_starting'] = fout['ds']
        fout['fcst_date_inv_ending'] = fout['ds'] + pd.to_timedelta(6,
                                                                    unit='D')
        for ts in fout['ts_name'].unique():
            partition = {'cutoff': str(adjust_date.date()), 'ts_name': ts}
            tb_df = fout[fout['ts_name'] == ts].copy()
            ret = hql.to_tble(tb_df, tcols, 'sup.cx_weekly_forecasts',
                              partition)
            if ret != 0:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: no forecasts loaded to table for ' +
                              ts_name + ' and cutoff date ' +
                              str(adjust_date.date()))
                sys.exit()
            else:
                s_ut.my_print(
                    '>>>>>>>>>>>>>>> SUCCESS: data saved to table <<<<<<<<<<<<<<<<<<<'
                )
    else:
        s_ut.my_print(
            '>>>>>>>>>>>>>>> WARNING: no data saved to table <<<<<<<<<<<<<<<<<<<'
        )
Esempio n. 4
0
def main(argv):
    """Build and save the ensemble forecast for one time series and cutoff.

    Args:
        argv: ``[prog, ts_name, cutoff_date]`` or
            ``[prog, ts_name, cutoff_date, to_table]``. ``to_table`` is parsed
            with ``bool(int(...))`` and defaults to ``False``.

    The ensemble name comes from the module-level ``get_ens``; the forecast
    is produced by ``ep.make_fcast``, saved under ``~/my_tmp/fbp/`` and, when
    ``to_table`` is true, loaded into ``sup.cx_ens_forecast``. The process
    exits through ``sys.exit()`` on invalid arguments, a missing
    ``fcast_days`` setting or a failed table load. When actuals or language
    forecasts are unavailable an error is logged and the function returns.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = os.path.expanduser('~/my_tmp/fbp/')
    evals = 250
    by_lang = False
    lwr, upr = 9, 12

    print(argv)
    usage_err = ('ERROR: invalid arguments (ts_name, cutoff_date, to_table): '
                 + str(argv))
    if len(argv) == 3:
        ts_name, cutoff_date = argv[-2:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = False
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()
    elif len(argv) == 4:
        ts_name, cutoff_date, to_table = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()
    else:
        s_ut.my_print(usage_err)
        sys.exit()

    # data cfg
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    fcast_days = ts_cfg.get('fcast_days', None)
    if fcast_days is None:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: fcast_days must be specified in data_cfg')
        sys.exit()
    else:
        fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    # set fcast date to week starting Sunday unless it is a Sunday already
    if time_scale == 'W' and fcast_date.weekday() != 6:
        fcast_date = fcast_date - pd.to_timedelta(
            1 + fcast_date.weekday(), unit='D')

    s_ut.my_print('pid: ' + str(os.getpid()) +
                  ' ------------------------ start ens forecast for ' +
                  str(ts_name) + ' from cutoff date ' +
                  str(cutoff_date.date()) + ' (excluded) to forecast date ' +
                  str(fcast_date.date()) +
                  '  (included) -----------------------')

    a_df = dp.ts_actuals(ts_name, ts_cfg, cols)  # get actuals
    # Rename only when actuals exist, so a missing frame reaches the error
    # branch below instead of raising AttributeError here.
    if a_df is not None:
        a_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)
    fcast_df = fp.get_lang_fcast(ts_cfg, cutoff_date)  # get fcasts

    if fcast_df is not None and a_df is not None:
        s_ut.my_print(ts_name + ': combining ' +
                      str(fcast_df['dim_cfg'].nunique()) + ' forecast configs')
        xens_ = get_ens(ts_name, cutoff_date)

        s_ut.my_print('aggregation for ' + ts_name + ' done with ' + xens_)
        ts_fcast = ep.make_fcast(ts_name, fcast_df, a_df, cutoff_date,
                                 fcast_date, xens_, evals, by_lang,
                                 (lwr, upr), lwr=lwr, upr=upr)
        ts_fcast['fcast_date'] = fcast_date

        p_ut.save_df(
            ts_fcast,
            froot + 'ens_fcast_' + ts_name + '_' + str(cutoff_date.date()))
        if to_table is True:
            cols = ['ds', 'language', 'ens', 'yhat']
            partition = {
                'cutoff': str(cutoff_date.date()),
                'ts_name': ts_cfg['ts_key']
            }
            ret = hql.to_tble(ts_fcast, cols, 'sup.cx_ens_forecast', partition)
            if ret != 0:
                s_ut.my_print('ERROR: DB write for ' + ts_name +
                              ' ens forecast ' + ' at ' +
                              str(cutoff_date.date()) + ' failed')
                sys.exit()
        print('DONE')
    else:
        s_ut.my_print('ERROR: no actuals or no data for errors of ' + ts_name +
                      ' at ' + str(cutoff_date.date()))
Esempio n. 5
0
def main(argv):
    """Cross-validate language forecasts and save per-language performance.

    Args:
        argv: ``[prog, ts_name]``, ``[prog, ts_name, cutoff_date]``,
            ``[prog, ts_name, cutoff_date, n_features]`` or
            ``[prog, ts_name, cutoff_date, n_features, to_table]``.
            ``cutoff_date`` defaults to today, ``n_features`` to ``25`` and
            ``to_table`` (parsed with ``bool(int(...))``) to ``False``.

    Data comes from ``fp.cross_validation``; each language is scored with
    ``ep.lang_perf``. The combined result is printed, saved under
    ``~/my_tmp/perf/`` and, when ``to_table`` is true, loaded into
    ``sup.cx_language_forecast_performance``. The process exits through
    ``sys.exit()`` on invalid arguments, a failed table load, or when no
    language produced a result.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    upr = 12
    lwr = 9

    print(argv)
    # The usage text lists n_features because it is the third positional
    # argument; to_table only comes fourth.
    usage_err = (
        'ERROR: invalid arguments '
        '(ts_name, cutoff_date, n_features, to_table): ' + str(argv))
    cli_args = argv[1:]
    n_args = len(cli_args)
    if n_args < 1 or n_args > 4:
        s_ut.my_print(usage_err)
        sys.exit()

    ts_name = cli_args[0]
    n_features = 25
    to_table = False
    if n_args == 1:
        cutoff_date = pd.to_datetime('today')
    else:
        try:
            cutoff_date = pd.to_datetime(cli_args[1])
            if n_args >= 3:
                n_features = int(cli_args[2])
            if n_args == 4:
                to_table = bool(int(cli_args[3]))
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()

    # set dates
    # cutoff date in week starting Sunday
    sun_date = cutoff_date - pd.to_timedelta(6, unit='D')
    # horizon for the perf testing at cutoff_date
    upr_date = sun_date + pd.to_timedelta(upr, unit=time_scale)
    # lwr date for perf testing window
    lwr_date = sun_date + pd.to_timedelta(lwr, unit=time_scale)

    # the first element of the returned tuple is not used here
    _, d_ff, cfg_dict = fp.cross_validation(ts_name,
                                            cutoff_date,
                                            upr,
                                            lwr,
                                            n_features,
                                            init_date=init_date,
                                            time_scale=time_scale)
    p_ut.save_df(d_ff, '~/my_tmp/d_ff')
    f_list = list()
    for lg, lang_df in d_ff.groupby('language'):
        # drop() returns a new frame, so the assignments below never write
        # into the frame handed out by groupby
        flf_ = lang_df.drop('language', axis=1)
        flf_['y'] = flf_['y'].astype(float)
        flf_['yhat'] = flf_['yhat'].astype(float)
        a_perf = flf_[['dim_cfg', 'a_err']].drop_duplicates()

        # keep only the forecast cfgs listed first for this language
        flf = flf_[flf_['dim_cfg'].isin(cfg_dict[lg][0])].copy()
        # drop all-null cols (fcast cfgs for other languages)
        flf.dropna(axis=1, inplace=True, how='all')
        # wide layout: one yhat column per dim_cfg, indexed by ds
        p_flf = pd.pivot_table(flf[['ds', 'dim_cfg', 'yhat']].copy(),
                               index=['ds'],
                               columns=['dim_cfg'],
                               values=['yhat']).reset_index()
        # flatten the two-level header: ('yhat', cfg) -> str(cfg), else level 0
        cols = [str(c[1]) if c[0] == 'yhat' else c[0] for c in p_flf.columns]
        p_flf.columns = cols
        p_flf = p_flf.merge(flf[['ds', 'y', 'y_shifted',
                                 'adj_y_shifted']].drop_duplicates(),
                            on='ds',
                            how='left')

        s_ut.my_print(
            '\n\n+++++++++++++++++++++++++ starting aggregation for ' + lg +
            ' ++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
        d_list = ep.lang_perf(lg, p_flf, a_perf, cutoff_date, upr, lwr)
        pl = pd.DataFrame(d_list)
        pl['language'] = lg
        pl['n_forecasts'] = n_features
        f_list.append(pl)

    if len(f_list) > 0:
        pf = pd.concat(f_list, axis=0)
        pf['ts_name'] = ts_name
        pf['upr'] = upr_date
        pf['lwr'] = lwr_date
        pf['cutoff'] = cutoff_date
        print(pf)
        print('overall: ' + str(pf['avg_err'].mean()))
        p_ut.save_df(
            pf, '~/my_tmp/perf/fcast_perf_' + ts_name + '_' +
            str(cutoff_date.date()))
        if to_table is True:
            tab_cols = [
                'language', 'err', 'df', 'lwr', 'upr', 'train_cutoff', 'ens',
                'n_features', 'n_forecasts'
            ]
            partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_name}
            ret = hql.to_tble(pf, tab_cols,
                              'sup.cx_language_forecast_performance',
                              partition)
            if ret != 0:
                s_ut.my_print(
                    'pid: ' + str(os.getpid()) +
                    ' ERROR: forecast performance detail failed for ' +
                    ts_name + ' and cutoff date ' + str(cutoff_date.date()))
                sys.exit()
        print('DONE')
    else:
        s_ut.my_print(
            'pid: ' + str(os.getpid()) +
            ' ERROR: no data for forecast performance detail failed for ' +
            ts_name + ' and cutoff date ' + str(cutoff_date.date()))
        sys.exit()
Esempio n. 6
0
def main(argv):
    """Measure the stored ensemble forecast of a time series against actuals.

    Args:
        argv: ``[prog, ts_name]``, ``[prog, ts_name, cutoff_date]`` or
            ``[prog, ts_name, cutoff_date, to_table]``. ``cutoff_date``
            defaults to today; ``to_table`` is parsed with ``bool(int(...))``
            and defaults to ``False``.

    The ensemble forecast is read with ``fp.get_ens_fcast``, filtered with
    ``ep.fcast_filter`` and scored with the module-level ``fcast_perf``. A
    non-empty result is saved under ``~/my_tmp/perf/`` and, when ``to_table``
    is true, loaded into ``sup.cx_language_forecast_performance``. Returns
    silently (without printing ``DONE``) when ``fcast_perf`` yields ``None``;
    exits through ``sys.exit()`` on invalid arguments or a failed table load.
    """
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    upr = 12
    lwr = 8

    print(argv)
    usage_err = ('ERROR: invalid arguments (ts_name, cutoff_date, to_table): '
                 + str(argv))
    if len(argv) == 2:
        # take the single argument itself; slicing the last two entries would
        # bind a [prog, ts_name] list instead of the series name
        ts_name = argv[-1]
        cutoff_date = pd.to_datetime('today')
        to_table = False
    elif len(argv) == 3:
        ts_name, cutoff_date = argv[-2:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = False
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()
    elif len(argv) == 4:
        ts_name, cutoff_date, to_table = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(usage_err)
            sys.exit()
    else:
        s_ut.my_print(usage_err)
        sys.exit()
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)

    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)
    actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)
    fcast_df = fp.get_ens_fcast(ts_name, ts_cfg, cutoff_date)
    # the ensemble label is read from the first row of the forecast frame
    ens = fcast_df.loc[fcast_df.index[0], 'ens']
    f_df = ep.fcast_filter(fcast_df, actuals_df, ts_name,
                           cutoff_date + pd.to_timedelta(upr, unit=time_scale),
                           cutoff_date, time_scale)
    pf = fcast_perf(f_df, actuals_df, cutoff_date, lwr, upr, time_scale, ens)
    if pf is None:
        return
    else:
        pf['ts_name'] = ts_name
        p_ut.save_df(
            pf, '~/my_tmp/perf/fcast_perf_' + ts_name + '_' +
            str(cutoff_date.date()))
        if to_table is True:
            tab_cols = ['language', 'y', 'yhat', 'err', 'lwr', 'upr', 'ens']
            partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_name}
            ret = hql.to_tble(pf, tab_cols,
                              'sup.cx_language_forecast_performance',
                              partition)
            if ret != 0:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: forecast performance failed for ' +
                              ts_name + ' and cutoff date ' +
                              str(cutoff_date.date()))
                sys.exit()
    print('DONE')