def smooth(D, y, sigma):
    r"""Given a matrix ``D`` defining the squared distance between prediction
    and training points, and a matrix or vector ``y`` defining one or more
    sets of labels at the training points, return predictions using an RBF
    kernel.

    Parameters
    ----------
    D : array-like
        Squared distance matrix. $D_{i,j}$ defines the squared distance
        between prediction point ``i`` and training point ``j``.
    y : array-like
        Labels for training points. May be 1d for a single prediction task
        or 2d for >1 prediction tasks.
    sigma : numeric
        The length scale for RBF smoothing, i.e. the weight of a training
        point is proportional to $\exp(-D_{i,j} / (2 \sigma^2))$.

    Returns
    -------
    smoothed_predictions : array-like
        Predicted labels.
    """
    y = solve_functions.y_to_matrix(y)

    # get RBF smoothing matrix
    S = da.exp((-0.5 / sigma ** 2) * D)

    # if sigma is small enough, all weights in a row could underflow to 0; in
    # that case, fall back to uniform weights (i.e. predict the training mean)
    S = da.where(S.sum(axis=1).reshape(-1, 1) > 0, S, 1)

    smoothed_predictions = S.dot(y) / S.sum(axis=1).reshape(-1, 1)
    return smoothed_predictions
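
# The sketch below is illustrative only (not part of the original API): it
# shows how ``smooth`` is called on hypothetical toy inputs, reusing this
# module's ``da``/``np`` imports. With sigma=1, each prediction is simply an
# exp(-D/2)-weighted average of the training labels.
def _demo_smooth():
    """Minimal usage sketch; the distance and label values are made up."""
    # squared distances between 2 prediction points (rows) and
    # 3 training points (columns)
    D = da.from_array(np.array([[0.0, 1.0, 4.0], [4.0, 1.0, 0.0]]))
    y = np.array([1.0, 2.0, 3.0])  # one label per training point

    # returns a (2, 1) array of weighted averages, each within [1.0, 3.0]
    return smooth(D, y, sigma=1.0).compute()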

def rbf_interpolate_solve(
    latlons_train,
    latlons_val,
    y_train,
    y_val,
    return_preds=True,
    return_model=True,
    interpolate_train=False,
    clip_bounds=None,
    sigmas=[1],
):
    """Uses latlons to do non-parametric estimation of prediction points
    using an RBF kernel. Note that the latlons take the place of the features.

    latlons_train: training instances
    latlons_val: validation instances
    y_train: training labels
    y_val: validation labels
    return_preds (optional): bool, whether to return predictions
    return_model (optional): bool, whether to return the model
    interpolate_train (optional): bool, whether to also interpolate the
        training instances (e.g. for computing training set error)
    clip_bounds (None or n_outcomes X 2 2darray): If not None, clip the
        predictions to these bounds. If any of the elements of the array are
        None, ignore that bound (e.g. if a row of the array is [None, 10],
        apply an upper bound of 10 but no lower bound).
    sigmas: RBF kernel parameters to sweep over in the solve (hyperparameters,
        analogous to lambda for ridge regression)
    """
    # if you've got a one-dimensional response variable (y is just one
    # column), make sure that it is properly formatted
    y_train, y_val = (
        solve_functions.y_to_matrix(y_train),
        solve_functions.y_to_matrix(y_val),
    )

    # get dimensions needed to shape arrays
    n_ftrs, n_outcomes, n_obs_train, n_obs_test = solve_functions.get_dim_lengths(
        latlons_train, y_train, y_val
    )
    n_sigmas = len(sigmas)

    # set up the data structures for reporting results
    results_dict = solve_functions._initialize_results_arrays(
        (n_outcomes, n_sigmas), return_preds, return_model
    )

    # to take advantage of GPU/CPU cores, convert to dask arrays, making sure
    # we're setting threadpool limits to avoid oversubscribing threads
    with threadpool_limits(non_dask_thread_limit):
        latlons_train, latlons_val, y_train_da = [
            da.from_array(xp.asarray(i), chunks=(DASK_CHUNKSIZE, None))
            for i in [latlons_train, latlons_val, y_train]
        ]

        # calculate distances (ignore dask warnings about chunk size
        # increase -- this is intentional)
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=da.PerformanceWarning)
            D_valtrain = da.blockwise(
                dist_sq_matrix,
                "ij",
                latlons_val,
                "ik",
                latlons_train,
                "jk",
                dtype=xp.float64,
                concatenate=True,
            )
            if GPU:
                mempool.free_all_blocks()
            if interpolate_train:
                D_traintrain = da.blockwise(
                    dist_sq_matrix,
                    "ij",
                    latlons_train,
                    "ik",
                    latlons_train,
                    "jk",
                    dtype=xp.float64,
                    concatenate=True,
                )

        # loop over all length scales that we are testing
        for g, sigma in enumerate(sigmas):
            smoothed_predictions_val = smooth(D_valtrain, y_train_da, sigma)
            if interpolate_train:
                smoothed_predictions_train = smooth(D_traintrain, y_train_da, sigma)
            else:
                smoothed_predictions_train = y_train_da

            # transfer from GPU if needed and turn into in-memory numpy arrays
            smoothed_predictions_train, smoothed_predictions_val = [
                io.gpu_return_and_clear(i).compute()
                for i in [smoothed_predictions_train, smoothed_predictions_val]
            ]

            # clip if needed (this is more easily done in numpy b/c dask does
            # not support assignment by slices)
            if clip_bounds is not None:
                for ix, i in enumerate(clip_bounds):
                    # only apply if both bounds aren't None for this outcome
                    if not (i == None).all():
                        smoothed_predictions_train[:, ix] = smoothed_predictions_train[
                            :, ix
                        ].clip(*i)
                        smoothed_predictions_val[:, ix] = smoothed_predictions_val[
                            :, ix
                        ].clip(*i)

            # assign "model" as the sigma param
            model = sigma

            # create tuple of lambda index to match the argument structure
            # of the _fill_results_arrays function
            hp_tuple = (g,)

            # populate results dict with results from this sigma
            results_dict = solve_functions._fill_results_arrays(
                y_train,
                y_val,
                smoothed_predictions_train,
                smoothed_predictions_val,
                model,
                hp_tuple,
                results_dict,
            )

    # if the training set was not smoothed, the training "predictions" are
    # just the labels, so don't report a spurious r2 of 1 -- return NaN for
    # these metrics instead
    if not interpolate_train:
        for i in results_dict["metrics_train"][0]:
            for j in i.keys():
                i[j] = np.nan

    return results_dict
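
# A hedged usage sketch for ``rbf_interpolate_solve`` on synthetic data (the
# data and sigma values below are assumptions for illustration). It sweeps two
# candidate length scales, mirroring how lambdas are swept in ridge solves;
# the hyperparameter axis of the returned results has length len(sigmas).
def _demo_rbf_interpolate_solve():
    rng = np.random.default_rng(0)
    latlons_train = rng.uniform(-1, 1, size=(100, 2))
    latlons_val = rng.uniform(-1, 1, size=(20, 2))

    # toy labels: a smooth function of location, so RBF smoothing should do well
    y_train = latlons_train.sum(axis=1)
    y_val = latlons_val.sum(axis=1)

    return rbf_interpolate_solve(
        latlons_train,
        latlons_val,
        y_train,
        y_val,
        sigmas=[0.1, 1.0],  # candidate RBF length scales to sweep
    )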

def performance_density(
    kfold_results,
    model_info,
    val,
    lims={},
    save_dir=None,
    app_name=None,
    suffix=None,
    kind="kde",
    bw="scott",
    cut=3,
    size=10,
    alpha=0.25,
):
    """Plots a KDE plot of OOS preds across all folds vs obs.

    Args:
        kfold_results (dict of ndarray) : As returned using kfold_solve()
        model_info (str) : To append to the title of the plot, e.g. could
            pass in information about which solve it was, etc.
        val (str or list of str) : An ordered list of names of the outcomes
            in this model. If there are not multiple outcomes, this can be a
            string; otherwise it must be a list of strings of length
            n_outcomes.
        lims (dict of 2-tuple) : Apply lower and upper bounds to the KDE plot
            for a particular val. The format of this dict is
            val : (lower_bound, upper_bound). If no lim is set for a
            particular val, the default is the lower and upper bound of the
            observed and predicted outcomes combined.
        save_dir (str) : Path to directory in which to save output files.
            If None, no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only
            needed if saving.
        suffix (str) : The suffix containing the grid, sample, and
            featurization parameters, which will be appended to the filename
            when saving in order to keep track of various sampling and
            gridding schemes. Only needed if saving.
        kind (str) : Type of plot to draw. Default is "kde". Options:
            {"scatter" | "reg" | "resid" | "kde" | "hex"}
        bw ('scott' | 'silverman' | scalar | pair of scalars, optional) :
            Bandwidth to use for the kernel in KDE plots. Default is 'scott'.
            Only implemented for kind='kde'.
        cut (numeric) : Kernel is set to go to 0 at min/max data -/+ cut*bw.
            Only implemented for kind='kde'.
    """
    val = _adjust_val_names_str(val)

    # get metrics and preds for best HPs
    best_lambda_idx, best_metrics, best_preds = interpret_kfold_results(
        kfold_results, crits="r2_score"
    )

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # loop over all outcome dimensions
    n_outcomes = preds.shape[1]
    for i in range(n_outcomes):
        this_truth = truth[:, i]
        this_preds = preds[:, i]
        this_val = val[i]

        # calc r2 before clipping
        r2 = metrics.r2_score(this_truth, this_preds)

        # set axis limits for the kde plot
        if this_val in lims.keys():
            this_lims = lims[this_val]
        else:
            # select the min and max of the input data, expanded by a tiny bit
            offset = (
                max(
                    [
                        this_truth.max() - this_truth.min(),
                        this_preds.max() - this_preds.min(),
                    ]
                )
                / 1000
            )
            this_min = min([this_preds.min(), this_truth.min()]) - offset
            this_max = max([this_preds.max(), this_truth.max()]) + offset
            this_lims = (this_min, this_max)

        print("Plotting {}...".format(this_val))

        # note that the code below clips to the axis limits before running the
        # kernel, so any data clipped off will be ignored in the plotting
        # (but not in the r2)
        marginal_kws = {}
        if kind == "kde":
            marginal_kws["bw"] = bw
            marginal_kws["clip"] = this_lims
            marginal_kws["cut"] = cut

        # extend the drawing of the joint distribution to the extremes of
        # the data
        joint_kws = marginal_kws.copy()
        if kind == "kde":
            joint_kws["extend"] = "both"

        with sns.axes_style("white"):
            jg = sns.jointplot(
                this_preds,
                this_truth,
                kind=kind,
                height=10,
                xlim=this_lims,
                ylim=this_lims,
                joint_kws=joint_kws,
                marginal_kws=marginal_kws,
                size=size,
                alpha=alpha,
            )

            # add 1:1 line
            jg.ax_joint.plot(this_lims, this_lims, "k-", alpha=0.75)
            jg.ax_joint.set_xlabel("Predicted")
            jg.ax_joint.set_ylabel("Observed")
            jg.ax_joint.text(
                0.05,
                0.95,
                "r2_score: {:.2f}".format(r2),
                transform=jg.ax_joint.transAxes,
            )

        plt.suptitle(
            "{} Model OOS Performance w/ k-fold CV ({})".format(
                this_val.title(), model_info.title()
            )
        )

        if save_dir:
            fig = plt.gcf()
            _savefig(
                fig,
                save_dir,
                app_name,
                this_val,
                "predVobs_kde",
                suffix,
                tight_layout=True,
            )

            kde_data = {"truth": this_truth, "preds": this_preds}
            _save_fig_data(
                kde_data, save_dir, app_name, this_val, "predVobs_kde", suffix
            )
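
# A hedged sketch of a typical ``performance_density`` call. The outcome
# names and limits are hypothetical; ``kfold_results`` is assumed to come
# from kfold_solve() (or kfold_solve_no_overlap below) run on two outcomes.
def _demo_performance_density(kfold_results):
    performance_density(
        kfold_results,
        model_info="RBF interpolation",  # appended to the figure title
        val=["population", "income"],    # hypothetical outcome names
        lims={"population": (0, 10)},    # optional per-outcome axis bounds
        kind="kde",
        save_dir=None,                   # set a path to also save figures
    )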

def spatial_scatter_obs_v_pred(
    kfold_results,
    latlons,
    model_info,
    val,
    s=4,
    save_dir=None,
    app_name=None,
    suffix=None,
    figsize=(14, 5),
    crit="r2_score",
    **kwargs
):
    """Plots side-by-side spatial scatters of observed and predicted values.

    Args:
        kfold_results (dict of ndarray) : As returned using kfold_solve()
        latlons (nx2 2darray) : lats (first col), lons (second col)
        model_info (str) : To append to the title of the scatter plot, e.g.
            could pass in information about which solve it was, etc.
        val (str or list of str) : An ordered list of names of the outcomes
            in this model. If there are not multiple outcomes, this can be a
            string; otherwise it must be a list of strings of length
            n_outcomes.
        s (numeric) : Marker size passed to the scatter plots.
        save_dir (str) : Path to directory in which to save output files.
            If None, no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only
            needed if saving.
        suffix (str) : The suffix containing the grid, sample, and
            featurization parameters, which will be appended to the filename
            when saving in order to keep track of various sampling and
            gridding schemes. Only needed if saving.
        figsize (2-tuple) : Size of the matplotlib figure.
        crit (str) : Metric used to select the best hyperparameters.
    """
    val = _adjust_val_names_str(val)

    # get metrics and preds for best HPs
    best_lambda_idx, best_metrics, best_preds = interpret_kfold_results(
        kfold_results, crits=crit
    )

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # get latlons in the same shuffled, cross-validated order
    ll = latlons[
        np.hstack([test for train, test in kfold_results["cv"].split(latlons)])
    ]

    # default color bounds: 10th/90th percentiles of the observed values
    vmin = kwargs.pop("vmin", np.percentile(truth, 10, axis=0))
    vmax = kwargs.pop("vmax", np.percentile(truth, 90, axis=0))

    # plot obs and preds
    for vx, v in enumerate(val):
        fig, ax = plt.subplots(1, 2, figsize=figsize)
        sc0 = ax[0].scatter(
            ll[:, 1],
            ll[:, 0],
            c=truth[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        sc1 = ax[1].scatter(
            ll[:, 1],
            ll[:, 0],
            c=preds[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        fig.colorbar(sc0, ax=ax[0])
        fig.colorbar(sc1, ax=ax[1])
        fig.suptitle(v.title())
        ax[0].set_title("Observed")
        ax[1].set_title("Predicted")

        if save_dir:
            data = {
                "lon": ll[:, 1],
                "lat": ll[:, 0],
                "truth": truth[:, vx],
                "preds": preds[:, vx],
            }
            _savefig(fig, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix)
            _save_fig_data(
                data, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix
            )
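
# A hedged sketch of a typical ``spatial_scatter_obs_v_pred`` call. The
# outcome name and color bounds are hypothetical; ``latlons`` must be the
# same n x 2 (lat, lon) array whose splits are recorded in
# kfold_results["cv"], so points can be re-matched to the fold predictions.
def _demo_spatial_scatter(kfold_results, latlons):
    spatial_scatter_obs_v_pred(
        kfold_results,
        latlons,
        model_info="RBF interpolation",  # appended to figure titles
        val=["population"],              # hypothetical single outcome
        s=2,                             # smaller markers for dense samples
        vmin=[0.0],                      # optional per-outcome color bounds
        vmax=[10.0],
    )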

def kfold_solve_no_overlap(
    X,
    y,
    solve_function=solver.ridge_regression,
    num_folds=5,
    return_preds=True,
    return_model=False,
    **kwargs_solve
):
    """Like a standard k-fold solve, but with non-overlapping training sets:
    the first fold is held out as a fixed validation set, and each remaining
    fold is used once, on its own, as a disjoint training set.

    Args mirror kfold_solve(): ``solve_function`` is called once per training
    split, and ``return_preds`` / ``return_model`` control what is stored
    from each solve.
    """
    assert num_folds > 1

    y = solver.y_to_matrix(y)
    n_outcomes = y.shape[1]

    # keep track of all runs over several iterations
    kfold_metrics_test = []
    kfold_metrics_train = []
    kfold_preds_test = []
    kfold_preds_train = []
    kfold_y_train = []
    kfold_y_test = []
    kfold_models = []
    hp_warnings = []

    # the first fold is the fixed validation set; the remaining folds each
    # serve once as a training set
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)
    kf_split_idxs = [test_idxs for _, test_idxs in kf.split(X)]
    val_idxs = kf_split_idxs[0]
    train_splits = kf_split_idxs[1:]

    print("on train split (of {0}): ".format(len(train_splits)), end="")
    for i, train_idxs in enumerate(train_splits, start=1):
        print("{0} ".format(i), end="")

        X_train, X_val = X[train_idxs], X[val_idxs]
        y_train, y_val = y[train_idxs], y[val_idxs]

        # record train/test obs for this split
        this_y_train = np.empty(n_outcomes, dtype=np.ndarray)
        this_y_val = np.empty(n_outcomes, dtype=np.ndarray)
        for o in range(n_outcomes):
            this_y_train[o] = y_train[:, o]
            this_y_val[o] = y_val[:, o]
        kfold_y_train.append(this_y_train)
        kfold_y_test.append(this_y_val)

        # call solve func
        solve_results = solve_function(
            X_train,
            X_val,
            y_train,
            y_val,
            return_preds=return_preds,
            return_model=return_model,
            **kwargs_solve
        )

        # record performance metrics
        kfold_metrics_test.append(solve_results["metrics_test"])
        kfold_metrics_train.append(solve_results["metrics_train"])

        # record optional preds and model parameters
        if return_preds:
            kfold_preds_test.append(solve_results["y_pred_test"])
            kfold_preds_train.append(solve_results["y_pred_train"])
        if return_model:
            kfold_models.append(solve_results["models"])

        # record hp warnings
        hp_warnings.append(solve_results["hp_warning"])

    # return results
    rets = {
        "metrics_test": np.array(kfold_metrics_test),
        "metrics_train": np.array(kfold_metrics_train),
        "y_true_test": np.array(kfold_y_test),
        "y_true_train": np.array(kfold_y_train),
        "hp_warning": np.array(hp_warnings),
        "cv": kf,
    }
    if return_preds:
        rets["y_pred_test"] = np.array(kfold_preds_test)
        rets["y_pred_train"] = np.array(kfold_preds_train)
    if return_model:
        rets["models"] = np.array(kfold_models)

    return rets
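
# A hedged sketch of the non-overlapping k-fold solve on synthetic data. The
# feature/label construction is an assumption for illustration; it relies on
# ``solver.ridge_regression`` having the standard solve-function signature
# used elsewhere in this module.
def _demo_kfold_solve_no_overlap():
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 8))  # 200 observations, 8 features
    y = X @ rng.normal(size=8)     # toy linear response

    # fold 0 serves as the fixed validation set; folds 1-4 each serve once as
    # a disjoint training set, so training sets never overlap across splits
    return kfold_solve_no_overlap(
        X, y, num_folds=5, return_preds=True, return_model=False
    )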