def rets_to_weights(rets_this):
    best_idxs, metrics_best_idx, y_pred_best_idx = interpret_kfold_results(
        rets_this, "r2_score")
    weights = rets_this["models"]
    best_idx = best_idxs[0][0]
    weights_this = weights[:, 0, best_idx]
    return weights_this, metrics_best_idx
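# Example (illustrative sketch, not part of the original module). `rets_to_weights`
# expects a k-fold results dict whose "models" entry is populated; the flag used
# below to populate it (return_model=True) is an assumption about this codebase's
# solve functions, alongside the return_preds flag used elsewhere in this file:
#
#     >>> kfold_results = solve.kfold_solve(
#     ...     X, y, num_folds=5, solve_function=solve.ridge_regression,
#     ...     return_preds=True, return_model=True)
#     >>> fold_weights, fold_metrics = rets_to_weights(kfold_results)
#     >>> # fold_weights holds, for each fold, the first outcome's weight vector at
#     >>> # the hyperparameter index selected by interpret_kfold_results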
def performance_by_num_features(X, y, num_features, num_folds=5,
                                solve_function=solve.ridge_regression,
                                crit="r2_score",
                                **solve_kwargs):
    """
    Slices the features into smaller subsets of the featurization (by index) and
    reports performance of num_folds folds on different feature dimensions
    d_i < d = X.shape[1]. If you want the subsets chosen randomly, shuffle the
    columns of X before passing it to this function.

    args:
        X: n x d array-like, feature representation
        y: n x 1 array-like, labels
        num_features: list of ints, feature counts over which to collect performance results
        num_folds: int, default 5, number of cross validation folds
        solve_function: which function to use for the solve, default ridge regression
        crit (str): criteria for which to optimize hps
        **solve_kwargs (**dict): dictionary of params for solve fxn
    returns:
        kfold_l_idxs_by_num_feats: list of ints, the best-agreed (across k folds)
            lambda index swept over, by feature size d_i
        kfold_test_metrics_by_num_feats: 2d array of dicts, axis 0 corresponds to
            number of features, axis 1 to fold
        kfold_test_predictions_by_num_feats: list of lists of arrays, test set
            predictions from each of the k k-fold models, where lambda is set
            according to kfold_l_idxs_by_num_feats for each feature size,
            uniformly across folds
    """
    solve_kwargs["return_preds"] = True
    assert np.max(num_features) <= X.shape[1], "not enough features to satisfy"

    results_by_num_feat = []
    kfold_test_metrics_by_num_feats = []
    kfold_l_idxs_by_num_feats = []
    kfold_test_predictions_by_num_feats = []
    for i, num_feats in enumerate(num_features):
        res = solve.kfold_solve(X[:, :num_feats], y, num_folds=num_folds,
                                solve_function=solve_function, **solve_kwargs)
        results_by_num_feat.append(res)
        (
            best_idxs,
            metrics_best_idx,
            y_pred_best_idx,
        ) = interpret_results.interpret_kfold_results(res, crit)
        kfold_test_metrics_by_num_feats.append(metrics_best_idx)
        kfold_l_idxs_by_num_feats.append(best_idxs)
        kfold_test_predictions_by_num_feats.append(y_pred_best_idx)

    return (
        np.array(kfold_l_idxs_by_num_feats),
        np.array(kfold_test_metrics_by_num_feats),
        np.array(kfold_test_predictions_by_num_feats),
    )
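# Example (illustrative sketch; the feature counts and the column shuffle are
# assumptions, chosen only to show the calling convention):
#
#     >>> shuffle_idx = np.random.RandomState(0).permutation(X.shape[1])
#     >>> X_shuf = X[:, shuffle_idx]  # randomize which features each subset gets
#     >>> l_idxs, fold_metrics, fold_preds = performance_by_num_features(
#     ...     X_shuf, y, num_features=[128, 512, 1024], num_folds=5,
#     ...     solve_function=solve.ridge_regression)
#     >>> # fold_metrics[i] holds the per-fold test metrics for num_features[i]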
def spatial_scatter_obs_v_pred(
    kfold_results,
    latlons,
    model_info,
    val,
    s=4,
    save_dir=None,
    app_name=None,
    suffix=None,
    figsize=(14, 5),
    crit="r2_score",
    **kwargs
):
    """Plots side-by-side spatial scatters of observed and predicted values.

    Args:
        kfold_results (dict of ndarray) : As returned by kfold_solve()
        latlons (nx2 2darray) : lats (first col), lons (second col)
        model_info (str) : To append to the title of the scatter plot, e.g. could
            pass in information about which solve it was, etc.
        val (str or list of str) : An ordered list of names of the outcomes in this
            model. If there are not multiple outcomes, this can be a string.
            Otherwise it must be a list of strings of length n_outcomes
        s (numeric) : Marker size passed to the scatter plots
        save_dir (str) : Path to directory in which to save output files. If None,
            no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only needed
            if saving
        suffix (str) : The suffix containing the grid, sample, and featurization
            parameters which will be appended to the filename when saving, in order
            to keep track of various sampling and gridding schemes. Only needed
            if saving
        crit (str) : Criterion used to select the best hyperparameter
    """
    val = _adjust_val_names_str(val)

    # get metrics and preds for the best HPs
    best_lambda_idx, best_metrics, best_preds = interpret_kfold_results(
        kfold_results, crits=crit
    )

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # get latlons in the same shuffled, cross-validated order
    ll = latlons[
        np.hstack([test for train, test in kfold_results["cv"].split(latlons)])
    ]

    vmin = kwargs.pop("vmin", np.percentile(truth, 10, axis=0))
    vmax = kwargs.pop("vmax", np.percentile(truth, 90, axis=0))

    # plot obs and preds
    for vx, v in enumerate(val):
        fig, ax = plt.subplots(1, 2, figsize=figsize)
        sc0 = ax[0].scatter(
            ll[:, 1],
            ll[:, 0],
            c=truth[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        sc1 = ax[1].scatter(
            ll[:, 1],
            ll[:, 0],
            c=preds[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        fig.colorbar(sc0, ax=ax[0])
        fig.colorbar(sc1, ax=ax[1])
        fig.suptitle(v.title())
        ax[0].set_title("Observed")
        ax[1].set_title("Predicted")

        if save_dir:
            data = {
                "lon": ll[:, 1],
                "lat": ll[:, 0],
                "truth": truth[:, vx],
                "preds": preds[:, vx],
            }
            _savefig(fig, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix)
            _save_fig_data(
                data, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix
            )
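# Example (illustrative sketch; `kfold_results`, `latlons`, and the label strings
# are placeholders). Note that kfold_results must carry the "cv" splitter and
# "y_true_test" entries used above, i.e. it should come straight from kfold_solve:
#
#     >>> spatial_scatter_obs_v_pred(
#     ...     kfold_results, latlons, "ridge regression", "housing",
#     ...     s=2, save_dir=None)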
def performance_density(
    kfold_results,
    model_info,
    val,
    lims={},
    save_dir=None,
    app_name=None,
    suffix=None,
    kind="kde",
    bw="scott",
    cut=3,
    size=10,
    alpha=0.25,
):
    """Plots a KDE plot of OOS preds across all folds vs obs.

    Args:
        kfold_results (dict of ndarray) : As returned by kfold_solve()
        model_info (str) : To append to the title of the plot, e.g. could pass in
            information about which solve it was, etc.
        val (str or list of str) : An ordered list of names of the outcomes in this
            model. If there are not multiple outcomes, this can be a string.
            Otherwise it must be a list of strings of length n_outcomes
        lims (dict of 2-tuple) : Apply lower and upper bounds to the KDE plot for a
            particular val. The format of this dict is val : (lower_bound, upper_bound).
            If no lim is set for a particular val, the default is the lower and upper
            bound of the observed and predicted outcomes combined.
        save_dir (str) : Path to directory in which to save output files. If None,
            no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only needed
            if saving
        suffix (str) : The suffix containing the grid, sample, and featurization
            parameters which will be appended to the filename when saving, in order
            to keep track of various sampling and gridding schemes. Only needed
            if saving
        kind (str) : Type of plot to draw. Default is 'kde'.
            Options: {"scatter" | "reg" | "resid" | "kde" | "hex"}
        bw ("scott" | "silverman" | scalar | pair of scalars, optional) : Bandwidth
            to use for the kernel in KDE plots. Default is 'scott'. Only implemented
            for kind='kde'
        cut (numeric) : Kernel is set to go to 0 at min/max data -/+ cut*bw. Only
            implemented for kind='kde'
    """
    val = _adjust_val_names_str(val)

    # get metrics and preds for the best HPs
    best_lambda_idx, best_metrics, best_preds = interpret_kfold_results(
        kfold_results, crits="r2_score"
    )

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # loop over all outcome dimensions
    n_outcomes = preds.shape[1]
    for i in range(n_outcomes):
        this_truth = truth[:, i]
        this_preds = preds[:, i]
        this_val = val[i]

        # calc r2 before clipping
        r2 = metrics.r2_score(this_truth, this_preds)

        # set axis limits for kde plot
        if this_val in lims.keys():
            this_lims = lims[this_val]
        else:
            # select the min and max of input data, expanded by a tiny bit
            offset = (
                max(
                    [
                        this_truth.max() - this_truth.min(),
                        this_preds.max() - this_preds.min(),
                    ]
                )
                / 1000
            )
            this_min = min([this_preds.min(), this_truth.min()]) - offset
            this_max = max([this_preds.max(), this_truth.max()]) + offset
            this_lims = (this_min, this_max)

        print("Plotting {}...".format(this_val))

        # note that the code below clips to the axis limits before running the
        # kernel, so if the limits clip away a large amount of data, that data will
        # be ignored in the plotting (but not in the r2)
        marginal_kws = {}
        if kind == "kde":
            marginal_kws["bw"] = bw
            marginal_kws["clip"] = this_lims
            marginal_kws["cut"] = cut

        # extend the drawing of the joint distribution to the extremes of the data
        joint_kws = marginal_kws.copy()
        if kind == "kde":
            joint_kws["extend"] = "both"

        with sns.axes_style("white"):
            jg = sns.jointplot(
                this_preds,
                this_truth,
                kind=kind,
                height=10,
                xlim=this_lims,
                ylim=this_lims,
                joint_kws=joint_kws,
                marginal_kws=marginal_kws,
                size=size,
                alpha=alpha,
            )

        ## add 1:1 line
        jg.ax_joint.plot(this_lims, this_lims, "k-", alpha=0.75)
        jg.ax_joint.set_xlabel("Predicted")
        jg.ax_joint.set_ylabel("Observed")
        jg.ax_joint.text(
            0.05, 0.95, "r2_score: {:.2f}".format(r2), transform=jg.ax_joint.transAxes
        )

        ## add title
        plt.suptitle(
            "{} Model OOS Performance w/ k-fold CV ({})".format(
                this_val.title(), model_info.title()
            )
        )

        if save_dir:
            fig = plt.gcf()
            _savefig(
                fig,
                save_dir,
                app_name,
                this_val,
                "predVobs_kde",
                suffix,
                tight_layout=True,
            )

            kde_data = {"truth": this_truth, "preds": this_preds}
            _save_fig_data(
                kde_data, save_dir, app_name, this_val, "predVobs_kde", suffix
            )
def performance_by_num_train_samples(X, y, num_samples, num_folds=5,
                                     solve_function=solve.ridge_regression,
                                     crit="r2_score",
                                     **solve_kwargs):
    """
    Slices the data into smaller training sets (randomization taken care of by
    KFold) and reports performance of num_folds folds on different train set sizes
    s_i < s = X.shape[0]*(num_folds-1)/num_folds. If you want rows pulled randomly,
    shuffle the rows of X before passing it to this function.

    args:
        X: n x d array-like, feature representation
        y: n x 1 array-like, labels
        num_samples: list of ints, train set sizes over which to collect performance results
        num_folds: int, default 5, number of cross validation folds
        solve_function: which function to use for the solve, default ridge regression
        crit (str): criteria for which to optimize hps
        **solve_kwargs (**dict): dictionary of params for solve fxn
    returns:
        l_idxs_by_num_samples: list of ints, the best-agreed (across k folds) lambda
            index swept over, by train set size
        test_metrics_by_num_samples: list of dicts, results of each of the k k-fold
            models, where lambda is set according to l_idxs_by_num_samples for each
            train set size, uniformly across folds. Organized in the order of num_samples
        test_predictions_by_num_samples: list of arrays, test set predictions from
            each of the k k-fold models, where lambda is set according to
            l_idxs_by_num_samples for each train set size, uniformly across folds
        num_samples_taken: the number of samples actually taken for each model
    """
    solve_kwargs["return_preds"] = True
    if np.max(num_samples) > int(X.shape[0] * (num_folds - 1) / num_folds):
        warnings.warn(
            "not enough training points to satisfy {0} samples; ".format(
                np.max(num_samples))
            + "we will use the maximum number available for the largest sizes, which is {0}"
            .format(int(X.shape[0] * (num_folds - 1) / num_folds)))

    test_metrics_by_num_samples = []
    l_idxs_by_num_samples = []
    test_predictions_by_num_samples = []

    print(" on run (of {0}):".format(len(num_samples)), end=" ")

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)

    num_samples_taken = []
    for i, num_samp in enumerate(num_samples):
        print(i + 1, end=" ")
        results = []
        # take out the val set before sub-indexing. Because we fixed the random
        # state of KFold, we will get the same val_idxs for each fold every time.
        for train_idxs, val_idxs in kf.split(X):
            X_train = X[train_idxs]
            y_train = y[train_idxs]
            X_val = X[val_idxs]
            y_val = y[val_idxs]

            # now solve using only the first num_samp training samples
            results_by_fold = solve.single_solve(X_train[:num_samp, :], X_val,
                                                 y_train[:num_samp], y_val,
                                                 solve_function=solve_function,
                                                 **solve_kwargs)
            results.append(results_by_fold)

        # record the number of samples actually taken (only recorded for the last
        # fold; it should not differ between folds by more than one)
        num_samples_taken.append(X_train[:num_samp, :].shape[0])

        # compile results across folds
        results_compiled = {}
        for key in results[0].keys():
            # index everything by zero to avoid having an extra index when we send
            # to interpret_results
            results_compiled[key] = np.array(
                [results[f][key][0] for f in range(num_folds)])

        # results should be packed as if they were all just in a single fold
        (
            best_idxs,
            metrics_best_idx,
            y_pred_best_idx,
        ) = interpret_results.interpret_kfold_results(results_compiled, crit)

        test_metrics_by_num_samples.append(metrics_best_idx)
        l_idxs_by_num_samples.append(best_idxs)
        test_predictions_by_num_samples.append(y_pred_best_idx)

    return (
        np.array(l_idxs_by_num_samples),
        test_metrics_by_num_samples,
        test_predictions_by_num_samples,
        num_samples_taken,
    )
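# Example (illustrative sketch; the train-set sizes are placeholder assumptions):
#
#     >>> l_idxs, fold_metrics, fold_preds, n_taken = performance_by_num_train_samples(
#     ...     X, y, num_samples=[500, 1000, 2000], num_folds=5,
#     ...     solve_function=solve.ridge_regression)
#     >>> # n_taken reports how many training points were actually available for
#     >>> # each requested size (capped at X.shape[0] * (num_folds - 1) / num_folds)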
def checkered_predictions_by_radius(
    X,
    y,
    latlons,
    radii,
    extent,
    num_jitter_positions_sqrt=1,
    min_points=0,
    return_hp_idxs=False,
    return_models=False,
    crit="r2_score",
    solve_function=solve.ridge_regression,
    **solve_kwargs,
):
    """
    Consider each grid cell as its own test set, while training on all other cells.

    args:
        X: n x d array of floats, feature matrix
        y: n x 1 array of floats, labels
        latlons: n x 2 array of floats, locations
        radii: list of floats, radii defining the grid at successive trials
        extent: 4x1 list/array of floats, total extent on which to define the grid,
            e.g. the U.S. is captured by extent = [25, 48, -126, -65]
        num_jitter_positions_sqrt: int, how many jitter positions to use in each dimension
        min_points: int, the minimum number of points at which to define a set
        return_hp_idxs: boolean, whether to return the optimal hyperparameter indices
        return_models: boolean, whether to return the models
        crit: which criterion to optimize for (if not r2_score, you'll also have to
            set a flag in interpret_results for minimization)
        solve_function: fxn, which solve function to use
        **solve_kwargs: dict of keyword arguments that you want to pass to the
            solve function
    returns:
        rets_by_radius: a list of dictionaries of results, where each dictionary in
            the list corresponds to the results for one radius value. The structure
            of each dictionary depends on the optional return arguments given.
    """
    # The object to return
    rets_by_radius = []

    # For each radius
    for i, radius in enumerate(radii):
        print(f"Radius: {i + 1}/{len(radii)}: Offset: ", end="")

        # If we're not jittering, just do it once:
        if num_jitter_positions_sqrt == 1:
            print("1/1...")
            idxs_a, idxs_b = put_in_checkers(latlons, extent, radius * 2, min_points=0)
            rets_0 = checkered_predictions(
                X,
                y,
                latlons,
                idxs_a,
                idxs_b,
                radius,
                return_hp_idxs=return_hp_idxs,
                return_models=return_models,
                crit=crit,
                solve_function=solve_function,
                **solve_kwargs,
            )
        # If we are jittering, do the same thing as above but
        # num_jitter_positions_sqrt^2 times. Use
        # checkered_predictions_just_return_results to get the solve results, and
        # aggregate them with the other data to be returned, per jitter.
        else:
            center_offsets = np.linspace(0, radius, num_jitter_positions_sqrt)

            rets_0 = {}

            # mimic the returns of kfold_solve
            jitter_metrics_test = []
            jitter_metrics_train = []
            jitter_preds_test = []
            jitter_preds_train = []
            jitter_models = []
            jitter_y_true_train = []
            jitter_y_true_test = []
            jitter_hp_warning = []

            for dx1, delta_1 in enumerate(center_offsets):
                for dx2, delta_2 in enumerate(center_offsets):
                    n_sample = num_jitter_positions_sqrt * dx1 + dx2 + 1
                    print(f"{n_sample}/{num_jitter_positions_sqrt**2}", end="...")

                    idxs_a, idxs_b = put_in_checkers(
                        latlons,
                        extent,
                        radius * 2,
                        offset_x1=delta_1,
                        offset_x2=delta_2,
                        min_points=min_points,
                    )

                    rets_offset = checkered_predictions_just_return_results(
                        X,
                        y,
                        latlons,
                        idxs_a,
                        idxs_b,
                        radius,
                        return_hp_idxs=return_hp_idxs,
                        return_models=return_models,
                        crit=crit,
                        solve_function=solve_function,
                        **solve_kwargs,
                    )

                    ## cast this into a kfold return - index everything by zero to
                    ## avoid having an extra index

                    # record performance metrics
                    jitter_metrics_test.append(rets_offset["metrics_test"][0])
                    jitter_metrics_train.append(rets_offset["metrics_train"][0])

                    # record true y
                    jitter_y_true_train.append(rets_offset["y_true_train"][0])
                    jitter_y_true_test.append(rets_offset["y_true_test"][0])

                    # record optional preds and model parameters
                    jitter_preds_test.append(rets_offset["y_pred_test"][0])
                    jitter_preds_train.append(rets_offset["y_pred_train"][0])

                    # record the hp_warnings so that they can be passed to interpret_results
                    jitter_hp_warning.append(rets_offset["hp_warning"][0])

                    # record the model as well if desired
                    if return_models:
                        jitter_models.append(rets_offset["models"][0])

            # collect the jittered results
            jittered_results_this_delta = {
                "metrics_test": np.array(jitter_metrics_test),
                "metrics_train": np.array(jitter_metrics_train),
                "y_true_test": np.array(jitter_y_true_test),
                "y_true_train": np.array(jitter_y_true_train),
                "y_pred_test": np.array(jitter_preds_test),
                "y_pred_train": np.array(jitter_preds_train),
                "deltas": (delta_1, delta_2),
                "hp_warning": np.array(jitter_hp_warning),
            }

            # note: each jitter is treated like a fold of k-fold cross validation
            (
                best_hp_idxs,
                metrics_best_idx,
                y_pred_best_idx,
            ) = interpret_results.interpret_kfold_results(
                jittered_results_this_delta, crit)

            # the default return is a list but there's only one variable, so index
            # everything at 0
            rets_0["hp_idxs_chosen"] = best_hp_idxs[0]
            rets_0["metrics_test"] = metrics_best_idx[:, 0]
            rets_0["preds_test"] = y_pred_best_idx[:, 0]

        rets_by_radius.append(rets_0)
        print("")

    return rets_by_radius
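# Example (illustrative sketch; the radii are placeholder assumptions in the same
# units as `latlons`, and the extent is the U.S. bounding box from the docstring):
#
#     >>> rets_by_radius = checkered_predictions_by_radius(
#     ...     X, y, latlons, radii=[0.5, 1.0, 2.0], extent=[25, 48, -126, -65],
#     ...     num_jitter_positions_sqrt=2, solve_function=solve.ridge_regression)
#     >>> # rets_by_radius[i] holds the checkerboard results for radii[i]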