def set_bounds(self):
    if self.mdl == 'ErlangA':
        m_bounds = (max(1, self.a / self.window), self.window * self.a)  # overwrite bounds
    elif self.mdl == 'ErlangC':
        try:
            self.min_mval = int(np.ceil(self.a))
            if self.min_mval == int(self.a):
                self.min_mval += 1
            m_bounds = (self.min_mval, self.window * self.a)  # bounds for servers in the minimization
        except ValueError:
            s_ut.my_print('ERROR_:erlang_tools:ErlangC model invalid::' + self.__str__())
            m_bounds = None
    elif self.mdl == 'ErlangB':
        m_bounds = (max(1, self.a / self.window), self.window * self.a)  # bounds for servers in the minimization
    else:
        s_ut.my_print('@@@@@@@ invalid model @@@@@@@: ' + str(self.mdl))
        m_bounds = None
    try:
        self.m_bounds = (int(np.floor(m_bounds[0])), int(np.ceil(m_bounds[1]))) if m_bounds is not None else None
    except ValueError:
        self.m_bounds = None
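# Example of the bounds arithmetic above (hypothetical values): with offered load
# a = 25 and window = 4, ErlangA and ErlangB get m_bounds = (6.25, 100), stored as
# (6, 100) after the floor/ceil; ErlangC starts at min_mval = 26 (ceil(a), bumped
# by 1 when a is an integer, since the queue needs m > a).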
def sla_func(self, m=None, err=False):
    # actual_sla - target_sla: if positive, we are meeting the sla
    # find the smallest m such that actual_sla >= target_sla
    # err: if True, return the relative error wrt the target prob
    if m is None:
        m = self.get_servers() if self.mval is None else self.mval
        if self.verbose is True:
            s_ut.my_print('WARNING: no value set for servers. Setting to default:: ' + str(m))
    q, t = self.func_args[self.sla_func.__name__]
    if m < self.min_mval:
        s_ut.my_print('WARNING: ' + self.mdl + ' sla_func could be unstable because m is too small: ' + str(m) + ' and min m: ' + str(self.min_mval))
        y = -q  # prob(Wait < t) = 0
    else:
        y = self.queueing_mdl(m).sla_prob(t) - q
    return y if err is False else y / q
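# A minimal, standalone sketch (not this module's API) of the search sla_func
# supports: find the smallest m with P(Wait <= t) >= q under Erlang C. The repo's
# own path is queueing_mdl(m).sla_prob(t); the names below are illustrative.
def _erlang_c_min_servers(lbda, mu, q, t):
    a = lbda / mu                       # offered load
    m = int(np.ceil(a)) + 1             # Erlang C is stable only for m > a
    while True:
        b = 1.0                         # Erlang B via the stable recursion
        for k in range(1, m + 1):
            b = a * b / (k + a * b)
        p_wait = b / (1.0 - (a / m) * (1.0 - b))                # Erlang C delay prob
        if 1.0 - p_wait * np.exp(-(m * mu - lbda) * t) >= q:    # P(Wait <= t)
            return m
        m += 1
# e.g. _erlang_c_min_servers(lbda=100.0, mu=1.0, q=0.8, t=0.333) -> smallest m meeting the SLA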
def get_data_file(f_all, cutoff_date):
    if f_all is None:
        return None
    cutoff_date = pd.to_datetime(cutoff_date)
    vf = f_all.split('/')
    d_dir = os.path.expanduser(os.path.dirname('/'.join(vf)))
    fname = vf[-1]
    dt_max, fn = pd.to_datetime('2016-01-01'), None
    for f in os.listdir(d_dir):
        if fname in f:
            try:
                f_date = pd.to_datetime(f.split('.')[0][-10:])
            except ValueError:
                s_ut.my_print('WARNING invalid date in ' + str(f) + ' cutoff date: ' + str(cutoff_date.date()) + ' d_dir: ' + str(d_dir))
                continue
            if f_date >= cutoff_date:  # file is acceptable
                if fn is None or dt_max < f_date:  # first acceptable file, or a more recent one
                    dt_max, fn = f_date, f
    return None if fn is None else os.path.join(d_dir, fn)  # return the latest acceptable file
def get_all_files(f_root, cutoff_date, post_cutoff):
    # f_root is the file path + file pattern, e.g. ~/my_tmp/fbp/lang_fcast_bookings
    # post_cutoff True: files dated on or after cutoff_date; False: on or before
    if f_root is None:
        return None
    cutoff_date = pd.to_datetime(cutoff_date)
    d_dir = os.path.expanduser(os.path.dirname(f_root))
    f_name = f_root.split('/')[-1]
    lf_out = list()
    for f in os.listdir(d_dir):
        if f_name in f:
            f_base, f_ext = os.path.splitext(f)
            try:
                f_date = pd.to_datetime(f_base.split('_')[-1])
            except ValueError:
                s_ut.my_print('WARNING invalid date in ' + str(f) + ' cutoff date: ' + str(cutoff_date.date()) + ' d_dir: ' + str(d_dir))
                continue
            if post_cutoff is True and f_date >= cutoff_date:  # file is acceptable
                lf_out.append(os.path.join(d_dir, f))
            if post_cutoff is False and f_date <= cutoff_date:  # file is acceptable
                lf_out.append(os.path.join(d_dir, f))
    return lf_out
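# Example (hypothetical filenames): with f_root = '~/my_tmp/fbp/lang_fcast_bookings'
# and a directory holding lang_fcast_bookings_2019-12-07.par and
# lang_fcast_bookings_2020-01-04.par, cutoff_date = '2019-12-31' returns only the
# 2020-01-04 file when post_cutoff is True and only the 2019-12-07 file otherwise.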
def ab_func(adb_estimators, max_depth, r, s, lf, X_train, y_train, X_test, y_test, y_perf, topN_list):
    ab_reg = AdaBoostRegressor(n_estimators=adb_estimators,
                               base_estimator=DecisionTreeRegressor(max_depth=max_depth, min_samples_split=s),
                               loss=lf, learning_rate=r)
    try:
        ab_reg.fit(X_train, y_train)
    except ValueError as e:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: ' + str(e))
        s_ut.save_df(pd.concat([pd.DataFrame(X_train), pd.DataFrame(y_train)], axis=0), '~/my_tmp/ab_func_err')
        return [dict()]
    yhat_test = ab_reg.predict(X_test)
    d_list = list()
    d_cfg = {'adb_estimators': adb_estimators, 'max_depth': max_depth, 'learning_rate': r, 'loss': lf, 'min_samples_split': s}
    # for each AdaBoost cfg and topN get the values of all the loss functions
    for nval in topN_list:
        d_ = copy.deepcopy(d_cfg)
        d_loss = loss_func(y_test, yhat_test, nval, w=y_perf)
        d_.update(d_loss)
        d_list.append(d_)
    return d_list
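# Sketch of sweeping ab_func over a cfg grid (values illustrative; loss_func and
# the topN semantics are this repo's own):
#   results = []
#   for n_est, depth in itertools.product([50, 100], [3, 5]):
#       results += ab_func(n_est, depth, r=0.1, s=2, lf='linear', X_train=X_tr,
#                          y_train=y_tr, X_test=X_te, y_test=y_te, y_perf=w, topN_list=[5, 10])
#   pd.DataFrame(results)  # one row per (cfg, topN) with its loss values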
def get_holidays(self, lang):  # holidays per language
    prefix = 'not-' if 'not-' in lang else ''
    language = prefix + 'Mandarin' if 'Mandarin' in lang else lang
    end_year = self.fcast_date.year
    # returns None if language is not valid (e.g. language = foo); returns all languages if language is None
    holidays_df = hdays.get_hols(language, end_year)
    if holidays_df is None:  # must check before using holidays_df below
        s_ut.my_print('WARNING: no holidays DF for ' + language)
        return None
    # set the right time scale for holidays
    if self.time_scale == 'D':
        pass
    elif self.time_scale == 'W':
        gcols = ['language', pd.Grouper(key='ds', freq=self.time_scale)] if 'language' in holidays_df.columns else pd.Grouper(key='ds', freq=self.time_scale)
        holidays_df = holidays_df.groupby(gcols).apply(self.w_hols).reset_index()
        holidays_df.drop('level_2', axis=1, inplace=True)
    else:
        s_ut.my_print('ERROR: invalid time scale: ' + str(self.time_scale))
        sys.exit()
    holidays_df.drop('language', axis=1, inplace=True)
    return holidays_df[(holidays_df['ds'] <= self.fcast_date) & (holidays_df['ds'] >= self.init_date)]
def __init__(self, name, data, cutoff_date, upr, lwr, mask, loss_type='rel', max_evals=200, verbose=False):
    super().__init__(name, data, cutoff_date, upr, lwr)
    self.max_evals = max_evals
    self.loss_type = loss_type if loss_type in ['abs', 'rel'] else 'rel'
    self.iter, self.string = 0, ''
    if mask is not None and len(mask) > 0:
        self.X_train = self.X_train[:, mask]
        self.X_test = self.X_test[:, mask]
        self.mask_ = mask
    self.space = SPACE_DICT[self.name]
    self.params, self.loss, self.mdl = None, None, None
    try:
        self.rfunc = getattr(sk_ens, name)
    except AttributeError:
        try:
            self.rfunc = getattr(xgb, name)
        except AttributeError:
            s_ut.my_print('ERROR: ' + name + ' not found')
    self.regr_opt()
    self.regr_set()
    if verbose:
        print(self.string)
    s_ut.my_print(self.name + ': ' + str(self.params) + ' loss: ' + str(self.loss) + ' n_features: ' + str(len(self.features_)))
def __init__(self, name, f_data, this_cu, upr, lwr, max_evals=1000, verbose=False):
    super().__init__(name, f_data, this_cu, upr, lwr)
    self.max_evals = max_evals
    self.verbose = verbose
    self.iter, self.valid_iter, self.string, self.min_loss = 0, 0, '', np.inf
    self.params, self.loss, self.mdl, self.df, self.l1_ratio = None, None, None, None, None
    self.do_test = True
    self.space_list = self.get_paths()
    self.regr_opt()  # find opt pars at init time
    if self.params is not None:
        self.alpha = self.params['alpha']
        self.lbda = self.params['l1_ratio']
    self.regr_set()
    if self.params is not None:
        # ridge penalty in sklearn's ElasticNet parametrization: 2 * n * alpha * (1 - l1_ratio)
        self.ridge_par = 2.0 * len(self.y_train) * self.params['alpha'] * (1 - self.params['l1_ratio'])
    else:
        s_ut.my_print('WARNING: default ridge parameter <<<<<<<<<<<<<<<<<<<<<<<< ')
        self.ridge_par = 1.0
    s_ut.my_print('enet: ' + str(self.params) + ' loss: ' + str(self.loss) + ' df: ' + str(self.df) + ' n_features: ' + str(len(self.features_)))
def data_check(df, name):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    u_vals = [c for c in df.columns if df[c].nunique() <= 1]
    if df.isnull().sum().sum() > 0 or len(u_vals) > 0:
        p_ut.save_df(df, '~/my_tmp/f_data')
        s_ut.my_print('ERROR: invalid data for ' + str(name))
        sys.exit()
def has_m(self):
    m = self.get_servers() if self.mval is None else self.mval
    if self.verbose is True:
        s_ut.my_print('WARNING: no value set for servers. Setting to default: ' + str(m))
    return m
def bxform_df(xform_obj, f_list):
    out_list = list()
    for f in f_list:
        y_var = xform_obj.fcast_var(f[['yhat_lower', 'yhat_upper']].copy(), PROPHET_DICT['prophet_dict']['interval_width'])
        for c in ['yhat', 'yhat_upper', 'yhat_lower',
                  'trend', 'trend_upper', 'trend_lower',
                  'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
                  'multiplicative_terms', 'multiplicative_terms_lower', 'multiplicative_terms_upper']:
            f[c] = xform_obj.inverse_transform(f[c].values, y_var, lbl=c)
        for c in ['yhat']:  # , 'yhat_upper', 'yhat_lower']:
            if f[c].isnull().sum() > 0:
                s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: nulls in back-transformed values for ' + str(c) + '. Ignoring this forecast cfg')
                break
        else:  # no nulls found: keep this forecast cfg
            out_list.append(f)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' there are ' + str(len(out_list)) + ' valid fcast cfgs')
    return None if len(out_list) == 0 else pd.concat(out_list, axis=0)
def get_actuals(ts_dict, gcols, use_cache=None):  # actuals with a max ds >= cutoff_date
    cutoff_date = ts_dict['cutoff_date']
    init_date = ts_dict['init_date']
    ts_name = ts_dict['name']
    ycol = ts_dict['ycol']
    s_ut.my_print('getting ' + ts_name + ' actuals from table')
    r_date = hql.get_rmax(ycol, use_cache=USE_CACHE)
    qcols = list(set(['ds', 'language', 'y'] + gcols))
    col_str = ','.join(qcols)
    print('rmax: ' + str(r_date))
    qry = 'select ' + col_str + ' from sup.cx_weekly_actuals where ts_name=\'' + ycol + '\' and run_date=\'' + r_date + '\';'
    try:
        uc = USE_CACHE if use_cache is None else use_cache
        df = hql.from_tble(qry, ['ds'], use_cache=uc, renew=RENEW)
        s_ut.my_print(qry + ' completed. Got ' + str(len(df)) + ' rows')
    except Exception:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: query: ' + qry + ' failed. No data for ts ' + ts_name)
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' got actuals for ' + ts_name + ' from table sup.cx_weekly_actuals')
    df.rename(columns={'y': ycol}, inplace=True)  # unique name needed (may mix with regressors later)
    df = df[df['ds'] >= init_date].copy()
    if df['ds'].max() < cutoff_date:
        s_ut.my_print('ERROR: no actuals up to cutoff date for ' + ts_name)
        return None
    return df.groupby(['ds'] + gcols).sum().reset_index() if len(df) > 0 else None
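# Example of the rendered query (hypothetical ycol and run_date; column order may
# vary since qcols is built from a set):
#   select ds,language,y from sup.cx_weekly_actuals
#   where ts_name='ticket_count' and run_date='2020-01-04';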
def log_trick(arr):
    # 1. if Y = a1 + a2 + ..., the input is an array of the form arr = [[log(|a1|), sign(a1)], [log(|a2|), sign(a2)], ...]
    # 2. returns [log(|Y|), sign(Y)]
    if isinstance(arr, (list, np.ndarray, tuple)):
        if len(arr) > 0:
            try:
                vals = [x[0] for x in arr]
                b = [x[1] for x in arr]
                return list(lsexp(vals, b=b, return_sign=True))
            except TypeError:
                s_ut.my_print('log_trick: invalid input format: ' + str(arr))
                return None
        else:  # empty array
            return [-np.inf, 0]
    else:  # if not list-like, assume float
        try:
            varr = float(arr)
            sgn = np.sign(varr)
            return [-np.inf, 0] if sgn == 0 else [np.log(sgn * varr), sgn]
        except ValueError:
            s_ut.my_print('log_trick::invalid input::' + str(arr))
            return None
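# Worked example of the encoding log_trick consumes, assuming lsexp is
# scipy.special.logsumexp: Y = 3 - 5 = -2 is passed as [[log(3), +1], [log(5), -1]]
# and comes back as [log(2), -1.0]:
#   log_y, sign_y = lsexp([np.log(3.0), np.log(5.0)], b=[1.0, -1.0], return_sign=True)
#   # sign_y * np.exp(log_y) == -2.0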
def get_f_cfg(cfg_dict, cutoff_date, init_date, time_scale):
    # set the training windows in multiples of a year
    if time_scale == 'W':
        periods = (cutoff_date - init_date).days / 7 - 1
        periods = int(np.ceil(periods))
        nperiods = np.floor(periods / 52.25)
        cfg_dict['training'] = [52 * p + 1 for p in range(1, int(nperiods) + 1)]
    elif time_scale == 'D':
        periods = (cutoff_date - init_date).days - 1
        nperiods = np.floor(periods / 365.25)
        cfg_dict['training'] = [365 * p + 1 for p in range(1, int(nperiods) + 1)]
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: unsupported time scale: ' + str(time_scale))
        sys.exit()
    if time_scale == 'W':
        cfg_dict['w_mode'] = [None]
    v_list = list(cfg_dict.values())
    k_list = list(cfg_dict.keys())
    f_list = list(itertools.product(*v_list))
    d_list = [dict(zip(k_list, x)) for x in f_list]
    return d_list
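# Illustration of the grid expansion above (hypothetical cfg keys):
#   cfg = {'training': [53, 105], 'growth': ['linear', 'logistic']}
#   itertools.product over the value lists yields one dict per combination:
#   [{'training': 53, 'growth': 'linear'}, {'training': 53, 'growth': 'logistic'},
#    {'training': 105, 'growth': 'linear'}, {'training': 105, 'growth': 'logistic'}]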
def fcast_adj(k_col, adjust_date):
    # forecast adjuster (engineering, ECs, ...)
    # k_col: key columns to be adjusted
    # ds is the start date of an adj factor for a given k_col and ts_name
    # at the last adj time all adj factors must be 1, otherwise we overwrite
    # all missing adj factors fill to 1
    # all missing k_col values at a given ds get an adj factor of 1
    if adjust_date is None:
        return list()
    data_cfg = os.path.expanduser('~/my_repos/capacity_planning/forecast/config/fcast_adjust_' + str(adjust_date.date()) + '.json')
    if os.path.isfile(data_cfg):
        with open(data_cfg, 'r') as fptr:
            adj_dict = json.load(fptr)
    else:
        s_ut.my_print('>>>>>>>>>>>>>>>> WARNING: could not find adjustments file ' + data_cfg + ' <<<<<<<<<<<<<<<<')
        return list()

    # find dicts in adj_dict that contain k_col
    adj_names = list()
    for k, vlist in adj_dict.items():
        for v in vlist:
            if k_col in v.keys():
                adj_names.append(k)
                break

    # check adj cfg
    adj_df_list = list()
    for adj_name in adj_names:  # list of dicts in adj_dict that contain k_col
        adj_list = adj_dict[adj_name]
        adj_df_ = pd.DataFrame(adj_list)
        adj_df_['ds'] = pd.to_datetime(adj_df_['ds'].values)

        # add missing k_col entries at each ds
        ts_cols = [c for c in adj_df_.columns if c not in ['ds', k_col]]
        f_list = [adj_df_]
        v_list = adj_df_[k_col].unique()
        for ds, f in adj_df_.groupby('ds'):
            m_vals = list(set(v_list) - set(f[k_col].unique()))
            if len(m_vals) > 0:
                lf = pd.DataFrame({'ds': [ds] * len(m_vals), k_col: m_vals})
                f_list.append(lf)
        adj_df = pd.concat(f_list, axis=0, sort=True)
        adj_df[ts_cols] = adj_df[ts_cols].fillna(1)  # default is no adjustment (inplace fillna on a column slice does not stick)

        # check that the latest date has all 1's
        b = adj_df['ds'] == adj_df['ds'].max()
        d_max = adj_df[b].copy()
        if d_max[ts_cols].min().min() != 1 or d_max[ts_cols].max().max() != 1:
            s_ut.my_print('WARNING: last adjust values not 1. Resetting')
            d_max[ts_cols] = 1
            adj_df = pd.concat([adj_df[~b].copy(), d_max], axis=0)
        fa = adj_df.reset_index(drop=True)
        fa.fillna(1.0, inplace=True)
        adj_df_list.append(fa)
    return adj_df_list
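# Hedged sketch of the fcast_adjust_<date>.json layout this function parses,
# inferred from the code above (key and column names are illustrative):
# {
#   "engineering": [
#     {"ds": "2020-01-04", "language": "Mandarin", "ticket_count": 0.9},
#     {"ds": "2020-02-01", "language": "Mandarin", "ticket_count": 1.0}
#   ]
# }
# Missing (ds, language) pairs and missing factors default to 1 (no adjustment).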
def fcast_prep(rf_list, reg_cfg, cutoff_date, fcast_days, init_date):
    # set the arg_list for the regressor forecast
    reg_df_list_ = list()
    for f in rf_list:
        if f is not None:
            if 'language' not in f.columns:
                f['language'] = 'NULL'
            if len(f) > 0:
                reg_df_list_.append(f.reset_index(drop=True))  # all reg_df have a language column
            else:
                s_ut.my_print('WARNING: regressor ' + str(f.columns) + ' has no data')
    reg_df_list = reg_gap_check(reg_df_list_, cutoff_date, init_date, fcast_days)  # check for gaps
    do_fcast = {list(v['r_col'].keys())[0]: v['do_fcast'] for v in reg_cfg.values()}
    arg_list, rcol_list = list(), list()
    for f in reg_df_list:  # list of regs by reg col and language
        f_cols = [c for c in f.columns if c != 'language']
        rcol = [c for c in f_cols if c not in ['ds', 'ceiling', 'floor']]
        rcol_list += rcol
        lang = f.loc[f.index[0], 'language']
        if len(f) > 0 and len(rcol) == 1:
            arg_list.append([lang, f, 'ds', rcol[0], cutoff_date, fcast_days, do_fcast])
        else:
            s_ut.my_print('WARNING::empty regressor or too many regression columns: ' + str(rcol) + ' language: ' + str(lang) + ' len: ' + str(len(f)))
    rcol_list = list(set(rcol_list))
    return arg_list, rcol_list
def perf_smry(perf_df, cutoff_date, time_scale, ts_name, upr, lwr):  # print smry and save
    upr_horizon = cutoff_date + pd.to_timedelta(upr, unit=time_scale)
    lwr_horizon = cutoff_date + pd.to_timedelta(lwr, unit=time_scale)
    if perf_df is not None:
        perf_df.sort_values(by='language', inplace=True)
        perf_df.reset_index(inplace=True, drop=True)
        perf_df['ts_name'] = ts_name
        perf_df['cutoff'] = cutoff_date
        s_ut.my_print('########################### cutoff: ' + str(cutoff_date.date()) + ' ts_name: ' + str(ts_name) +
                      ' performance between ' + str(lwr_horizon.date()) + ' (included) and ' + str(upr_horizon.date()) +
                      ' (included) ##########################################')
        perf_df.sort_values(by=['language', 'err'], inplace=True)
        print(perf_df.head(10))
        p_ut.save_df(perf_df, '~/my_tmp/fbp/lang_perf_' + ts_name + '_' + str(cutoff_date.date()))
    else:
        s_ut.my_print('WARNING: no actuals to compute fcast errors for the period between ' + str(lwr_horizon.date()) +
                      ' (included) and ' + str(upr_horizon.date()) + ' (included) for cutoff: ' + str(cutoff_date.date()) +
                      ' and ts_name: ' + str(ts_name))
def __init__(self, method, nqs, ceiling=None, floor=None, unbias=False):
    self.method = method
    self.ceiling = ceiling
    self.floor = floor
    self.lmbda = None
    self.name = method
    self.xf_done = False
    self.unbias = unbias  # not implemented
    self.lbl = None
    if method == 'yeo-johnson' or method == 'box-cox':
        self.xobj = PowerTransformer(method=method, standardize=False, copy=False)  # MUST have standardize = False
    elif method == 'quantile':
        self.xobj = QuantileTransformer(n_quantiles=int(nqs), output_distribution='normal', copy=False)
    elif method == 'logistic':
        self.xobj = Linearizer(ceiling, floor, self.unbias)
    elif method == 'log':
        self.xobj = LogTransform(self.unbias)
    elif method == 'anscombe':
        self.xobj = Anscombe()
    elif method is None:
        self.method = None
        self.xobj = NoTransform()
    else:
        su.my_print('pid: ' + str(os.getpid()) + ' WARNING: set_xform: invalid method: ' + str(method))
        self.method = None
        self.xobj = NoTransform()
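# Standalone sanity check of the box-cox route configured above, assuming
# sklearn's PowerTransformer API; the data here is synthetic.
def _boxcox_roundtrip_example():
    from sklearn.preprocessing import PowerTransformer
    y = np.random.gamma(2.0, 2.0, size=100).reshape(-1, 1) + 1e-6  # box-cox needs y > 0
    pt = PowerTransformer(method='box-cox', standardize=False)
    yt = pt.fit_transform(y)
    return np.allclose(pt.inverse_transform(yt), y)  # True up to float error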
def get_actuals(cutoff_date_):
    fdir = os.path.expanduser('~/my_tmp/cleaned/')  # '~/my_tmp/in_df_data_'
    adf = None
    for f in os.listdir(fdir):
        # we do not know the rolling window
        if str(cutoff_date_.date()) in f and 'tickets_' in f and 'old' not in f:  # 'in_df_data_' in f:
            s_ut.my_print('getting actuals from ' + fdir + f)
            adf = p_ut.read_df(fdir + f)
            break
    if adf is None:
        s_ut.my_print('no available actuals data for ' + str(cutoff_date_.date()))
        return None
    adf.reset_index(inplace=True, drop=True)
    p_ut.clean_cols(adf, ["language", "service_tier", "channel", "business_unit"],
                    '~/my_repos/capacity_planning/data/config/col_values.json',
                    check_new=False, do_nan=False, rename=True)
    adf.rename(columns={'ticket_count': 'y', 'ds_week_starting': 'ds'}, inplace=True)
    i_vals = ['nan', 'NULL', None, 'other', np.nan, 'null', 'N/A']
    imp_data = imputer.impute(adf, i_vals=i_vals, ex_cols=['ds'])
    imp_data['y'] = np.round(imp_data['y'].values, 0)
    return imp_data
def interpolate_(self, y, yt, nan_pct=0.2):
    # y: inverse-transformed values (values in natural scale)
    # yt: pre-inverse transform (values in transformed scale)
    if y is None:
        return None
    yx = np.reshape(y, (1, -1))[0] if self.method is not None else y
    nulls = pd.Series(yx).isnull().sum()
    pct = 100.0 * np.round(nulls / len(yx), 2)
    if nulls > nan_pct * np.ceil(len(yx)):  # too many NaNs to interpolate
        su.my_print('WARNING: Too many NaN to interpolate for label ' + str(self.lbl) + ': ' + str(nulls) + ' out of ' + str(len(yx)) +
                    ' (' + str(pct) + '%) data points and lambda ' + str(self.lmbda))
        f = pd.DataFrame({'yt': list(yt), 'yx': list(yx)})
        f['lmbda'] = self.lmbda
        p_ut.save_df(f, '~/my_tmp/interpolDF')
        return None
    elif 0 < nulls <= nan_pct * np.ceil(len(yx)):  # interpolate yhat if some NaNs
        su.my_print('WARNING: interpolating for label ' + str(self.lbl) + ': ' + str(nulls) + ' NaNs out of ' + str(len(yx)) +
                    ' data points (' + str(pct) + '%)')
        st = pd.Series(yx)
        sint = st.interpolate(limit_direction='both')
        yhat = sint.values
        ys = np.reshape(yhat, (1, -1))
        return ys[0]
    else:  # all OK
        return y
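# The limit_direction='both' call above also fills leading/trailing NaNs, e.g.:
#   pd.Series([np.nan, 1.0, np.nan, 3.0, np.nan]).interpolate(limit_direction='both')
#   -> [1.0, 1.0, 2.0, 3.0, 3.0]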
def to_table(to_db, table, cutoff_date, ts_name, if_exists, df_out):
    cu_dt = str(cutoff_date.date())
    df_out['cutoff'] = cutoff_date
    df_out['ts_name'] = ts_name
    file_out = p_ut.save_df(df_out, '~/my_tmp/fcast_df_' + cu_dt + '_' + ts_name)
    if to_db is True:
        partition = {'cutoff': cu_dt, 'ts_name': ts_name}
        df_out['ds'] = df_out['ds'].dt.date.astype(str)
        df_out.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
        s_ut.my_print('Loading data to ' + table + ' for partition: ' + str(partition))
        try:  # presto does not work with a partition argument
            ap.hive.push(df_out, table=table, if_exists=if_exists, partition=partition,
                         table_props={'abb_retention_days': '-1', 'abb_retention_days_reason': 'fact table. No pii'})
        except Exception:
            s_ut.my_print('ERROR: push to ' + table + ' failed for partition: ' + str(partition))
            sys.exit()
    return file_out
def __init__(self, lbda, mu, m, verbose=False):
    super().__init__(lbda, mu, m, 0.0, verbose)
    if self.m < 1 or self.lbda <= 0 or self.mu <= 0 or self.m <= self.a:
        s_ut.my_print('ErlangC: WARNING: invalid parameters: ' + self.__str__())
        self.pars_ok = False
    self.m_arr = np.array(self.m) if isinstance(self.m, (list, np.ndarray)) else self.m
    self.a_arr = np.array(self.a) if isinstance(self.a, (list, np.ndarray)) else self.a
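# Example: ErlangC(lbda=10.0, mu=2.0, m=4) has offered load a = 10 / 2 = 5 >= m,
# so the check above flags pars_ok = False (an Erlang C queue is stable only for m > a).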
def reg_gap_check(reg_df_list_, cutoff_date, init_date, fcast_days):  # check for gaps
    reg_df_list = list()
    for r in reg_df_list_:
        for l in r.language.unique():
            rl = r[r['language'] == l].copy()
            pdict = {'language': l}
            for c in ['ceiling', 'floor']:
                if c in rl.columns:
                    pdict[c] = rl.loc[rl.index[0], c]
                    rl.drop(c, axis=1, inplace=True)
            c = [c_ for c_ in rl.columns if c_ != 'ds' and c_ != 'language'][0]
            rl = d_proc.data_check(rl[['ds', c]].copy(), c, 'ds', cutoff_date, init_date, max_int=5, name=l)
            if rl is not None:  # add back language, ceiling and floor
                for k, v in pdict.items():
                    rl[k] = v
                reg_df_list.append(rl)
            else:
                s_ut.my_print('WARNING: regressor ' + str(c) + ' language: ' + l + ' failed data check')
    return reg_df_list
def _get_dcmdl(in_mdl, size, em_phases=1, max_splits=2):
    # fit arr_v to em_phases exponentials (partial models)
    arr_v = in_mdl.rvs(size=size)
    dc_em = sut.HyperExp(arr_v, em_phases=em_phases, max_splits=max_splits, floc=0.0)
    dc_em.fit()
    if dc_em.fit_res is None:
        return None
    v_out = np.array([(d['prob'], 1.0 / d['params'][-1]) for d in dc_em.fit_res])  # (prob, rate)
    if dc_em.m_err > 0.25 and dc_em.s_err > 0.5:  # poor fit: fall back to a plain exponential
        v_out = np.array([(1.0, 1.0 / in_mdl.avg)])
        s_ut.my_print('pid: ' + str(os.getpid()) + ' Poor fit for input model: ' + in_mdl.__str__() +
                      ' replacing by exponential: ' + str(v_out))
    return v_out, dc_em.em_mean, dc_em.em_std
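# Note: v_out above encodes a hyperexponential as rows of (prob, rate): a sample is
# exponential with rate_i with probability p_i; the single-row (1.0, 1/mean) fallback
# is a plain exponential matching the input model's mean.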
def __init__(self, run_id, d_cfg):
    self.run_id = run_id
    self.d_cfg = d_cfg
    self.worker_hosts = {}
    self.repo_path = d_cfg.get('repo_path', None)
    # run with airpy from the laptop and on the cli from redspot
    self.is_ap = platform.system() == 'Darwin'
    if self.repo_path is None:
        s_ut.my_print('ERROR_: repo path is missing')
def check_output(self, ya, y):
    if len(np.unique(ya)) == 1:  # xform failed: over/underflow?
        su.my_print(' WARNING: transform ' + self.method + ' failed with lambda ' + str(self.lmbda) +
                    ' and label: ' + str(self.lbl) + ' Trying Quantile')
        return self.reset_xform(y)
    else:
        return ya
def fit_transform(self, y):
    if self.xf_done is True:
        su.my_print('pid: ' + str(os.getpid()) + ' ' + self.method + ': already fit. Create a new Transform instance')
        return None
    else:
        r = self.fit(y)
        return None if r is None else self.transform(y)
def solver(d_cfg):
    solver_obj = SimSolver(d_cfg)
    if solver_obj.is_valid:
        return solver_obj.get_dcmdl()
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: invalid models. Cannot solve')
        return None
def inverse_transform(self, y, y_var, lbl=None):
    self.lbl = lbl
    if y is None:
        return None
    if isinstance(y, (pd.core.series.Series, pd.core.frame.DataFrame)):
        y = y.values
    if y_var is not None and isinstance(y_var, (pd.core.series.Series, pd.core.frame.DataFrame)):
        y_var = y_var.values
    if isinstance(y, np.ndarray) is False:
        su.my_print('pid: ' + str(os.getpid()) + ' WARNING: invalid type: ' + str(type(y)))
        return None
    if self.xf_done is False:
        su.my_print('pid: ' + str(os.getpid()) + ' WARNING: cannot inverse_transform before fit is done')
        return None
    yc = copy.deepcopy(y)
    if self.method in ('logistic', 'log'):
        yt = self.xobj.inverse_transform(y, y_var, lbl=lbl)
        yt = self.interpolate_(yt, yc, nan_pct=0.2)
        if yt is None:
            su.my_print('pid: ' + str(os.getpid()) + ' WARNING: inverse transform failed for label: ' + str(self.lbl) +
                        ' (method: ' + str(self.method) + ')')
            return None
        return yt
    elif self.method is None:
        return y
    else:  # box-cox, yeo-johnson
        yt = self._inverse_transform(y, yc, y_var)
        if yt is None:
            su.my_print('pid: ' + str(os.getpid()) + ' WARNING: inverse transform failed for label: ' + str(self.lbl) +
                        ' (method: ' + str(self.method) + ' and lambda: ' + str(self.lmbda) + ')')
            return None
        return np.reshape(yt, (1, -1))[0]
def get_emdl(self):  # set basic analytical server counts and avg in system
    self.ss_obj = self._set_emdl(self.lbda, self.mu, self.theta, self.sla_dict, self.verbose)
    self.min_servers = self.ss_obj.min_mval
    self.m_mdl = self.ss_obj.get_servers()
    if self.m_mdl is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: analytical model failed')
    return