def mlmc_gen(N0, eps, fun_gen, num_levs):
    """Multilevel Monte Carlo (MLMC) estimator over generated grama models.

    Builds a hierarchy of models via ``md_gen`` and runs the standard MLMC
    loop: draw samples per level, estimate per-level mean/variance/cost, and
    re-allocate samples to minimize variance for a fixed cost until no more
    samples are required for the target accuracy ``eps``.

    Args:
        N0 (int): Initial number of samples to draw on every level; must be > 0.
        eps (float): Target RMS accuracy of the estimator; must be > 0.
        fun_gen: Level-function generator passed through to ``md_gen``.
        num_levs (int): Number of levels in the hierarchy.

    Returns:
        tuple: (P, Nlev, Vlev, its) where
            P (float): MLMC estimate (sum of per-level mean corrections),
            Nlev (ndarray): samples taken per level,
            Vlev (ndarray): estimated variance per level,
            its (int): number of allocation iterations performed.
    """
    import numpy as np
    import grama as gr

    # Generate the level models and their per-sample costs.
    # NOTE(review): assumes md_gen is defined elsewhere in this module and
    # returns (list_of_models, list_of_costs) — confirm against its definition.
    [models, costs] = md_gen(fun_gen, num_levs)

    its = 0                            # iteration counter
    Nlev = np.zeros(num_levs)          # samples taken per level
    dNlev = N0 * np.ones(num_levs)     # samples left to take per level
    sumlev = np.zeros((2, num_levs))   # running [sum(Y), sum(Y^2)] per level
    costlev = np.zeros(num_levs)       # total cost accrued per level

    while np.sum(dNlev) > 0:  # keep going while any level needs samples
        for lev in range(num_levs):
            if dNlev[lev] > 0:  # samples still owed on level 'lev'
                df_mc_lev = models[lev] >> gr.ev_monte_carlo(dNlev[lev])
                if lev > 0:
                    # Re-evaluate the SAME sample points on the coarser model
                    # so Y is the level-correction P_lev - P_{lev-1}.
                    df_prev = df_mc_lev >> gr.tf_select(gr.starts_with("x"))
                    df_mc_lev_prev = models[lev - 1] >> gr.ev_df(df_prev)
                    Y = df_mc_lev.P - df_mc_lev_prev.P
                    cost = (costs[lev] + costs[lev - 1]) * dNlev[lev]
                else:
                    Y = df_mc_lev.P
                    cost = costs[lev] * dNlev[lev]
                sums = [Y.sum(), (Y**2).sum()]
                Nlev[lev] = Nlev[lev] + dNlev[lev]      # samples taken
                sumlev[0, lev] = sumlev[0, lev] + sums[0]  # accumulate sum(Y)
                sumlev[1, lev] = sumlev[1, lev] + sums[1]  # accumulate sum(Y^2)
                costlev[lev] = costlev[lev] + cost      # accumulate cost

        mlev = np.abs(sumlev[0, :] / Nlev)  # expected value per level
        Vlev = np.maximum(
            0, (sumlev[1, :] / Nlev - mlev**2))  # variance per level
        Clev = costlev / Nlev  # cost per result per level
        # Lagrange multiplier minimizing variance for fixed cost.
        mu = eps**(-2) * sum(np.sqrt(Vlev * Clev))
        # Optimal number of samples per level.
        Ns = np.ceil(mu * np.sqrt(Vlev / Clev))
        dNlev = np.maximum(0, Ns - Nlev)  # samples still left to take
        its += 1

    P = np.sum(sumlev[0, :] / Nlev)  # evaluate the multilevel estimator
    return P, Nlev, Vlev, its
def tran_kfolds(
    df,
    k=None,
    ft=None,
    out=None,
    var_fold=None,
    suffix="_mean",
    summaries=None,
    tf=tf_summarize,
    shuffle=True,
    seed=None,
):
    r"""Perform k-fold CV

    Perform k-fold cross-validation (CV) using a given fitting procedure
    (ft). Optionally provide a fold identifier column, or (randomly) assign
    folds.

    Args:
        df (DataFrame): Data to pass to given fitting procedure
        ft (gr.ft_): Partially-evaluated grama fit function; defines model
            fitting procedure and outputs to aggregate
        tf (gr.tf_): Partially-evaluated grama transform function; evaluation
            of fitted model will be passed to tf and provided with keyword
            arguments from summaries
        out (list or None): Outputs for which to compute `summaries`; None
            uses ft.out
        var_fold (str or None): Column to treat as fold identifier; overrides
            `k`
        suffix (str): Suffix for predicted value; used to distinguish between
            predicted and actual
        summaries (dict of functions): Summary functions to pass to tf; will
            be evaluated for outputs of ft. Each summary must have signature
            summary(f_pred, f_meas). Grama includes builtin options: gr.mse,
            gr.rmse, gr.rel_mse, gr.rsq, gr.ndme
        k (int): Number of folds; k=5 to k=10 recommended [1]
        shuffle (bool): Shuffle the data before CV? True recommended [1]
        seed (int or None): Random seed for the shuffle; only used when
            shuffle is True

    Notes:
        - Many grama functions support *partial evaluation*; this allows one
          to specify things like hyperparameters in fitting functions without
          providing data and executing the fit. You can take advantage of
          this functionality to easly do hyperparameter studies.

    Returns:
        DataFrame: Aggregated results within each of k-folds using given
        model and summary transform

    References:
        [1] James, Witten, Hastie, and Tibshirani, "An introduction to
        statistical learning" (2017), Chapter 5. Resampling Methods

    Examples:
        >>> import grama as gr
        >>> from grama.data import df_stang
        >>> from grama.fit import ft_rf
        >>> df_kfolds = (
        >>>     df_stang
        >>>     >> gr.tf_kfolds(
        >>>         k=5,
        >>>         ft=ft_rf(out=["thick"], var=["E", "mu"]),
        >>>     )
        >>> )

    """
    ## Check invariants
    if ft is None:
        raise ValueError("Must provide ft keyword argument")
    if (k is None) and (var_fold is None):
        # No fold spec at all; fall back to the recommended default
        print("... tran_kfolds is using default k=5")
        k = 5
    if summaries is None:
        print("... tran_kfolds is using default summaries mse and rsq")
        summaries = dict(mse=mse, rsq=rsq)

    n = df.shape[0]
    ## Handle custom folds
    if not (var_fold is None):
        ## Check for a valid var_fold
        if not (var_fold in df.columns):
            raise ValueError("var_fold must be in df.columns or None")
        ## Build folds: one fold per unique level of var_fold
        levels = unique(df[var_fold])
        k = len(levels)
        print("... tran_kfolds found {} levels via var_folds".format(k))
        Is = []
        for l in levels:
            # Row indices belonging to this fold level
            Is.append(list(arange(n)[df[var_fold] == l]))
    else:
        ## Shuffle data indices
        if shuffle:
            if seed:
                set_seed(seed)
            I = permutation(n)
        else:
            I = arange(n)
        ## Build folds: contiguous chunks of the (shuffled) index array
        di = int(ceil(n / k))
        Is = [I[i * di:min((i + 1) * di, n)] for i in range(k)]

    ## Iterate over folds
    df_res = DataFrame()
    for i in range(k):
        ## Train by out-of-fold data
        md_fit = df >> tf_filter(~var_in(X.index, Is[i])) >> ft
        ## Determine predicted and actual output names
        # NOTE(review): str_replace presumably maps over the list of names,
        # stripping `suffix` to recover the measured-column names — confirm.
        if out is None:
            out = str_replace(md_fit.out, suffix, "")
        else:
            out = str_replace(out, suffix, "")
        ## Test by in-fold data
        df_pred = md_fit >> ev_df(
            df=df >> tf_filter(var_in(X.index, Is[i])), append=False)
        ## Specialize summaries for output names:
        ## one keyword per (summary, output) pair, e.g. mse_thick
        summaries_all = ChainMap(*[{
            key + "_" + o: fun(X[o + suffix], X[o])
            for key, fun in summaries.items()
        } for o in out])
        ## Aggregate: attach measured columns, then apply summary transform
        df_summary_tmp = (
            df_pred
            >> tf_bind_cols(df[out] >> tf_filter(var_in(X.index, Is[i])))
            >> tf(**summaries_all)
            # >> tf_mutate(_kfold=i)
        )
        # Tag each fold's results with its identifier
        if var_fold is None:
            df_summary_tmp = df_summary_tmp >> tf_mutate(_kfold=i)
        else:
            df_summary_tmp[var_fold] = levels[i]
        df_res = concat(
            (df_res, df_summary_tmp), axis=0).reset_index(drop=True)

    return df_res
def tlmc_1f1m(md, N0, eps):
    """Two-level Monte Carlo (TLMC) estimator, one function / one model form.

    Validates the input model, then runs the two-level MLMC loop against the
    built-in ``make_tlmc_model_1f1m`` model: sample each level, accumulate
    per-level sums and costs, and re-allocate samples to minimize variance
    for a fixed cost until the target accuracy ``eps`` is met.

    Args:
        md (gr.Model): Model with exactly one function taking (level, x) and
            returning two outputs (result, cost).
        N0 (int): Initial number of samples per level; must be > 0.
        eps (float): Target RMS accuracy; must be > 0.

    Returns:
        tuple: (P, Nlev, Vlev, its) where
            P (float): two-level estimate,
            Nlev (ndarray): samples taken per level (shape (1, 2)),
            Vlev (ndarray): estimated variance per level,
            its (int): number of allocation iterations performed.

    Raises:
        ValueError: If md does not match the required 1-function,
            2-input, 2-output form, or if N0/eps are not positive.
    """
    import numpy as np
    import grama as gr
    X = gr.Intention()
    md1f1m = gr.make_tlmc_model_1f1m()

    # Check that md has exactly 1 function BEFORE indexing functions[0]
    # (original code indexed first, which could raise a raw IndexError).
    if len(md.functions) != 1:
        raise ValueError('Input model must have 1 function.')
    # Check inputs: the function must accept (level, point). The original
    # only printed on failure and then crashed with a raw TypeError; raise
    # a descriptive error instead.
    try:
        r = md.functions[0].func(0, 0)
    except TypeError as err:
        raise ValueError(
            'Input model must have 2 inputs: level and point at which to evaluate.'
        ) from err
    # Check outputs at both levels: (result, cost) pairs expected.
    if len(r) != 2:
        raise ValueError(
            'Level 0 function must have 2 outputs: result and cost.')
    r = md.functions[0].func(1, 0)
    if len(r) != 2:
        raise ValueError(
            'Level 1 function must have 2 outputs: result and cost.')
    # Make sure N0 and eps are greater than 0
    if (N0 <= 0) | (eps <= 0):
        raise ValueError('N0 and eps must be > 0.')

    its = 0                          # iteration counter
    Nlev = np.zeros((1, 2))          # samples taken per level
    dNlev = np.array([[N0, N0]])     # samples left to take per level
    Vlev = np.zeros((1, 2))          # variance per level
    sumlev = np.zeros((2, 2))        # running [sum(Y), sum(Y^2)] per level
    costlev = np.zeros((1, 2))       # total cost per level

    while np.sum(dNlev) > 0:  # keep going while any level needs samples
        for lev in range(2):
            if dNlev[0, lev] > 0:  # samples still owed on level 'lev'
                df_mc_lev = md1f1m >> gr.ev_monte_carlo(
                    n=dNlev[0, lev], df_det=gr.df_make(level=lev))
                if lev > 0:
                    # Re-evaluate the SAME points on the coarser level so Y
                    # is the level correction P_1 - P_0.
                    df_prev = (
                        df_mc_lev
                        >> gr.tf_select(gr.columns_between("x", "level"))
                        >> gr.tf_mutate(level=X.level - 1)
                    )
                    df_mc_lev_prev = md1f1m >> gr.ev_df(df_prev)
                    Y = df_mc_lev.P - df_mc_lev_prev.P
                    C = sum(df_mc_lev.cost) + sum(df_mc_lev_prev.cost)
                else:
                    Y = df_mc_lev.P
                    C = sum(df_mc_lev.cost)
                cost = C
                sums = [sum(Y), sum(Y**2)]
                Nlev[0, lev] = Nlev[0, lev] + dNlev[0, lev]
                sumlev[0, lev] = sumlev[0, lev] + sums[0]
                sumlev[1, lev] = sumlev[1, lev] + sums[1]
                costlev[0, lev] = costlev[0, lev] + cost

        mlev = np.abs(sumlev[0, :] / Nlev)  # expected value per level
        Vlev = np.maximum(
            0, (sumlev[1, :] / Nlev - mlev**2))  # variance per level
        Clev = costlev / Nlev  # cost per result per level
        # Lagrange multiplier minimizing variance for fixed cost.
        # BUGFIX: builtin sum() over the (1, 2) array returned the row
        # instead of the scalar sum over levels; use np.sum for the
        # scalar required by the MLMC allocation formula.
        mu = eps**(-2) * np.sum(np.sqrt(Vlev * Clev))
        # Optimal number of samples per level.
        Ns = np.ceil(mu * np.sqrt(Vlev / Clev))
        dNlev = np.maximum(0, Ns - Nlev)  # samples still left to take
        its += 1

    P = np.sum(sumlev[0, :] / Nlev)  # evaluate two-level estimator
    return P, Nlev, Vlev, its