def get_actuals(cutoff_date_):
    """Load, clean and impute the actuals file saved for ``cutoff_date_``.

    Scans ``~/my_tmp/cleaned/`` for file names that contain
    ``str(cutoff_date_.date())`` and ``'tickets_'`` but not ``'old'``.
    When several files match, the first one in sorted name order is used so
    that the choice is reproducible (``os.listdir`` gives no ordering
    guarantee).

    Args:
        cutoff_date_: object exposing ``.date()`` (e.g. a ``pd.Timestamp``).

    Returns:
        The frame returned by ``imputer.impute`` with column ``y`` rounded to
        0 decimals, or ``None`` when no matching file could be loaded.
    """
    fdir = os.path.expanduser('~/my_tmp/cleaned/')  # '~/my_tmp/in_df_data_'
    date_tag = str(cutoff_date_.date())  # loop-invariant, computed once

    # The rolling window is not known here, so it is not part of the match.
    candidates = sorted(
        f for f in os.listdir(fdir)
        if date_tag in f and 'tickets_' in f and 'old' not in f
    )

    adf = None
    if candidates:
        f_path = fdir + candidates[0]
        s_ut.my_print('getting actuals from ' + f_path)
        adf = p_ut.read_df(f_path)

    if adf is None:
        s_ut.my_print('no available actuals data for ' + date_tag)
        return None

    adf.reset_index(inplace=True, drop=True)
    p_ut.clean_cols(
        adf,
        ["language", "service_tier", "channel", "business_unit"],
        '~/my_repos/capacity_planning/data/config/col_values.json',
        check_new=False,
        do_nan=False,
        rename=True)
    adf.rename(columns={'ticket_count': 'y', 'ds_week_starting': 'ds'},
               inplace=True)

    # Values handed to the imputer as i_vals (presumably the markers it
    # treats as missing -- confirm against imputer.impute).
    i_vals = ['nan', 'NULL', None, 'other', np.nan, 'null', 'N/A']
    imp_data = imputer.impute(adf, i_vals=i_vals, ex_cols=['ds'])
    imp_data['y'] = np.round(imp_data['y'].values, 0)
    return imp_data
def _force_directly_channel(df):
    """Set ``channel`` to ``'directly'`` wherever ``service_tier`` is ``'directly'``.

    Vectorised replacement for a row-wise ``df.apply(..., axis=1)``: on an
    empty frame the row-wise form yields a DataFrame instead of a Series,
    which cannot be assigned to a single column.
    """
    # astype(object) keeps the result a plain object column even if the
    # source column is categorical and lacks the 'directly' category.
    df['channel'] = df['channel'].astype(object).mask(
        df['service_tier'] == 'directly', 'directly')


def tmp_ratios(cu, window, gcols):
    """Compute per-language ticket-count ratios from the old tickets file.

    Temporary fix: reads the hard-coded file
    ``~/my_tmp/cleaned/old_tickets_2020-02-29.par``.

    Args:
        cu: cutoff timestamp; rows with ``ds`` in
            ``[cu - window weeks, cu]`` are kept.
        window: length of the look-back window, in weeks.
        gcols: group-by columns; must include ``'language'`` because the
            grouped frame is merged back on it.

    Returns:
        DataFrame with one row per ``gcols`` group and a ``ratio`` column
        equal to the group's ``ticket_count`` divided by the total
        ``ticket_count`` of its language. Rows whose channel is
        ``'directly'`` are excluded before aggregation.
    """
    # tmp fix
    wf = pd.read_parquet('~/my_tmp/cleaned/old_tickets_2020-02-29.par')
    p_ut.clean_cols(
        wf,
        ["language", "service_tier", "channel", "business_unit"],
        '~/my_repos/capacity_planning/data/config/col_values.json',
        check_new=True,
        do_nan=True,
        rename=True)
    wf.rename(columns={'ds_week_starting': 'ds'}, inplace=True)
    _force_directly_channel(wf)

    i_vals = ['nan', 'NULL', None, 'other', np.nan, 'null', 'N/A']
    # ds is passed to the imputer as a string and converted back afterwards.
    wf['ds'] = wf['ds'].dt.date.astype(str)
    wf = imputer.impute(wf, i_vals=i_vals, ex_cols=['ds'])
    wf['ds'] = pd.to_datetime(wf['ds'])

    window_start = cu - pd.to_timedelta(window, unit='W')
    wf = wf[(wf['ds'] <= cu) & (wf['ds'] >= window_start)].copy()
    # again in case imputation added directly wrongly
    _force_directly_channel(wf)

    a_df = wf[wf['channel'] != 'directly'].copy()
    lct_df = a_df.groupby(gcols).sum(numeric_only=True).reset_index()
    l_df = lct_df.groupby(['language']).sum(numeric_only=True).reset_index()

    # After the merge, *_x holds the gcols-level count and *_y the
    # language-level total.
    lct_ratio = lct_df.merge(l_df, on=['language'], how='left')
    lct_ratio['ratio'] = (
        lct_ratio['ticket_count_x'] / lct_ratio['ticket_count_y'])
    lct_ratio.drop(['ticket_count_x', 'ticket_count_y'], axis=1, inplace=True)
    return lct_ratio
def __init__(self, file_path):
    """Load a forecast file and derive its metadata from the file name.

    The file name (without directory and extension) is expected to end in a
    10-character date that ``pd.to_datetime`` can parse, preceded by ``_``.
    The markers ``raw``, ``_r_`` and ``_xls_`` in the name set the ``raw``,
    ``rolling`` and ``has_actuals`` flags respectively.

    Args:
        file_path: path to the forecast file, using ``/`` as separator.

    Attributes set:
        raw, adj, rolling, has_actuals: flags derived from the file name.
        cutoff_date: timestamp parsed from the trailing date in the name.
        data: frame read by ``p_ut.read_df`` after ``p_ut.clean_cols``;
            a ``ds_week_ending`` column, if present, is replaced by ``ds``
            (week ending minus 6 days).
        forecast: month name of ``cutoff_date`` + 7 days.
        froot: ``file_path`` without extension and without the trailing
            10-character date.
    """
    s_ut.my_print('setting forecast from ' + file_path)

    # Split on dots only inside the base name: a dot in a directory
    # component (e.g. './x' or '/home/john.doe/...') must not truncate the
    # path before the file name is reached.
    base_name = file_path.rpartition('/')[2]
    dir_prefix = file_path[:len(file_path) - len(base_name)]
    t_info = base_name.split('.')[0]

    self.raw = 'raw' in t_info
    self.adj = not self.raw
    self.rolling = '_r_' in t_info
    self.cutoff_date = pd.to_datetime(t_info.split('_')[-1])
    self.has_actuals = '_xls_' in t_info

    self.data = p_ut.read_df(file_path)
    p_ut.clean_cols(
        self.data,
        ["language", "service_tier", "channel", "business_unit"],
        '~/my_repos/capacity_planning/data/config/col_values.json',
        check_new=False,
        do_nan=False,
        rename=True)
    if 'ds_week_ending' in self.data.columns:
        # Convert the week-ending date into the week-starting date.
        week_ending = pd.to_datetime(self.data['ds_week_ending'])
        self.data['ds'] = week_ending - pd.to_timedelta(6, unit='D')
        self.data.drop('ds_week_ending', inplace=True, axis=1)

    self.forecast = (
        self.cutoff_date + pd.to_timedelta(7, unit='D')).month_name()
    # Drop the trailing 10-character date (YYYY-MM-DD) from the stem.
    self.froot = dir_prefix + t_info[:-10]
] dfx.columns = l_cols + dr dfm = pd.melt(dfx, value_vars=dr, id_vars=l_cols, var_name='ds_week_ending', value_name='cx_yhat') start = pd.to_datetime(cutoff_date) + pd.to_timedelta( 7, unit='D') # cutoff is a week_ending date. Go to the next week dfm['ds_week_ending'] = pd.to_datetime(dfm['ds_week_ending'].values) dfm = dfm[(dfm['ds_week_ending'] >= start) & (dfm['ds_week_ending'] <= horizon_date)].copy() _ = p_ut.clean_cols( dfm, ['sector', 'language'], '~/my_repos/capacity_planning/data/config/col_values.json', check_new=False) dfm['cx_yhat'] = dfm['cx_yhat'].apply(lambda x: to_float(x)) p_df = pd.pivot_table(dfm, index=['ds_week_ending', 'language', 'sector'], values='cx_yhat', columns=['type']).reset_index() c_df = p_df[p_df['sector'].isin([ 'Claims', 'Community Education', 'Experiences', 'PST', 'Payments', 'Regulatory Response', 'Resolutions 1', 'Resolutions 2', 'Safety' ])].copy() c_df.fillna(0, inplace=True) # language level: no language level agg for FTE if 'inbound-vol' in ts_name: g_df = c_df.groupby(['ds_week_ending', 'language']).agg({
end = time.time() s_ut.my_print('imputer: reduce secs: ' + str(end - start)) return z_all if __name__ == '__main__': # df = pd.DataFrame({ # 'a': [1,2,np.nan, 4, 5, 6, 7], # 'b': ['x', 'y', None, None, 'z', 'x', 'z'], # 'c': [1.5, 2.3, 5.2, 3, None, np.nan, 5.2], # 'ds': ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'], # 'd': [None, None, 'a', 'bb', 'a', 'a', 'bb']} # ) # ex_cols_ = ['ds'] df = pd.read_parquet('~/my_tmp/phone-aht_2019-11-23.par') from capacity_planning.utilities import pandas_utils as p_ut new_vals = p_ut.clean_cols( df, ["service_region", "language", "sector", "interaction_type"], '~/my_repos/capacity_planning/data/config/col_values.json', check_new=True, do_nan=True) ex_cols_ = ['ds', 'agent_id'] df['tenure_days'] = df['tenure_days'].apply(lambda x: x if x > 0 else np.nan) # df = df[df['ds'] >= '2019-06-01'].copy() print(df.head(10)) zz = impute(df, ex_cols=ex_cols_) print(zz.head(10)) print('DONE')
# avg_fdf['initiative'] = False # s_ut.my_print('saving raw fcast data to ' + fcast_f) # p_ut.save_df(avg_fdf, fcast_f) # ONLY forecasted data # accuracy for the <months> months old raw forecast adf = errs.get_actuals(cutoff_date) # raw actuals up to cutoff_date window = 4 fcast_file = errs.get_fcast_file( cutoff_date, '~/Forecasts/rolling/par/raw_r_fcast_' + str(window) + '_', months=3) # file path from <months> months old forecast from cutoff fdf_obj = ts.TicketForecast(fcast_file) # fcast obj from 3 months ago fdf = fdf_obj.data p_ut.clean_cols(fdf, ["language", "service_tier", "channel", "business_unit"], '~/my_repos/capacity_planning/data/config/col_values.json', check_new=False, do_nan=False, rename=True) fdf.rename(columns={'ticket_count': 'forecasted_count'}, inplace=True) # fdf['ds_week_ending'] = pd.to_datetime(fdf['ds_week_ending']) # fdf['ds'] = fdf['ds_week_ending'] - pd.to_timedelta(6, unit='D') if fdf is None or adf is None: lang_errs, tier_errs = None, None else: s_ut.my_print('Error wrt actuals for an old forecast') lang_errs, tier_errs, off_df = errs.get_errs( cutoff_date, fdf_obj, adf, tcol='ds') # errs on filtered actuals for old raw fcast lang_errs['adj'] = False lang_errs['initiative'] = False tier_errs['adj'] = False