Example #1
def rets_to_weights(rets_this):
    """Pull the per-fold model weights at the best hyperparameter setting."""
    best_idxs, metrics_best_idx, y_pred_best_idx = interpret_kfold_results(
        rets_this, "r2_score")
    # keep all folds, the first (only) outcome, and the chosen lambda index
    weights = rets_this["models"]
    best_idx = best_idxs[0][0]
    weights_this = weights[:, 0, best_idx]
    return weights_this, metrics_best_idx
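A minimal usage sketch (not part of the source): it assumes solve.kfold_solve accepts return_model=True and stores the fitted weights under the "models" key, indexed as fold x outcome x lambda.

# Hypothetical usage of rets_to_weights on a toy problem.
import numpy as np

X = np.random.randn(200, 16)                       # toy features
y = X @ np.random.randn(16) + 0.1 * np.random.randn(200)

# assumption: kfold_solve exposes return_model/return_preds flags
rets = solve.kfold_solve(X, y, num_folds=5,
                         return_model=True, return_preds=True)
weights, fold_metrics = rets_to_weights(rets)
print(len(weights))  # one weight vector per fold, at the chosen lambda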
Example #2
def performance_by_num_features(X,
                                y,
                                num_features,
                                num_folds=5,
                                solve_function=solve.ridge_regression,
                                crit="r2_score",
                                **solve_kwargs):
    """
    Slices features into smaller subsets of featurization (by index), and reports
    performance of 5 folds on different feature dimensions d_i < d = X.shape[1]. If you want it
    done randomly, shuffle columns of X before inputing to the function.
    args:
        X: n x d array-like, feature representation
        y: n x 1 array-like, labels
        num_features: list of ints, num_features over which to collect performance results
        num_folds: int, default 5, number of cross validation folds
        solve_function: which function to use for the solve, default ridge regression.
        crit (str): citeria for which to optimize hps
        **solve_kwargs (**dict): dictionary of params for solve fxn
    returns:
        kfold_l_idxs_by_num_feats:list of ints, the best-agreed (across k-folds) lambda index swept over, by feature size d_i
        kfold_test_metrics_by_num_feats: 2d array of dicts, axis=0 corresponds to number of features, axis 1 to fold.
        \fold_test_predictions_by_num_feats: list of list of arrays, test set predictions results from
        each of k k-fold models, where lambda is  set according to l_idxs_by_num_feat for each train set size,
        uniformly across folds.
    """
    solve_kwargs["return_preds"] = True

    assert np.max(num_features) <= X.shape[1], "not enough features in X to satisfy num_features"
    results_by_num_feat = []

    kfold_test_metrics_by_num_feats = []
    kfold_l_idxs_by_num_feats = []
    kfold_test_predictions_by_num_feats = []

    for i, num_feats in enumerate(num_features):
        res = solve.kfold_solve(X[:, :num_feats],
                                y,
                                num_folds=num_folds,
                                solve_function=solve_function,
                                **solve_kwargs)
        results_by_num_feat.append(res)

        (
            best_idxs,
            metrics_best_idx,
            y_pred_best_idx,
        ) = interpret_results.interpret_kfold_results(res, crit)
        kfold_test_metrics_by_num_feats.append(metrics_best_idx)
        kfold_l_idxs_by_num_feats.append(best_idxs)
        kfold_test_predictions_by_num_feats.append(y_pred_best_idx)

    return (
        np.array(kfold_l_idxs_by_num_feats),
        np.array(kfold_test_metrics_by_num_feats),
        np.array(kfold_test_predictions_by_num_feats),
    )
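A sketch of tracing a feature-dimension learning curve with this function (synthetic data; reading "r2_score" out of the returned metric dicts is an assumption about their keys):

# Hypothetical sweep over feature counts on toy data.
import numpy as np
import matplotlib.pyplot as plt

X = np.random.randn(500, 256)
y = X[:, :32] @ np.random.randn(32) + 0.1 * np.random.randn(500)

num_features = [8, 32, 128, 256]
l_idxs, metrics, preds = performance_by_num_features(X, y, num_features)

# mean OOS r2 across folds at each feature dimension
mean_r2 = [np.mean([m["r2_score"] for m in np.atleast_1d(row).ravel()])
           for row in metrics]
plt.plot(num_features, mean_r2, marker="o")
plt.xlabel("number of features")
plt.ylabel("mean r2_score across folds")
plt.show()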
Example #3
def spatial_scatter_obs_v_pred(
    kfold_results,
    latlons,
    model_info,
    val,
    s=4,
    save_dir=None,
    app_name=None,
    suffix=None,
    figsize=(14, 5),
    crit="r2_score",
    **kwargs
):
    """Plots side-by-side spatial scatters of observed and predicted values.

    Args:
        kfold_results (dict of ndarray) :
            As returned using kfold_solve()
        latlons (nx2 2darray) : lats (first col), lons (second col)
        model_info (str) :
            To append to the title of the scatter plot,
            e.g. could pass information about which solve function was used, etc.
        val (str or list of str):
            An ordered list of names of the outcomes in this model. If there
            are not multiple outcomes, this can be a string. Otherwise it must
            be a list of strings of length n_outcomes
        s (numeric) : Marker size for the scatter plots
        save_dir (str) : Path to directory in which to save output files. If None, no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only needed if saving
        suffix (str) : The suffix containing the grid, sample, and featurization parameters
            which will be appended to the filename when saving, in order to keep track of
            various sampling and gridding schemes. Only needed if saving
        figsize (tuple) : Size of the output figure
        crit (str) : Criterion used to choose the best hyperparameters
    """

    val = _adjust_val_names_str(val)

    # get metrics and preds for best HP's
    best_lambda_idx, best_metrics, best_preds = interpret_kfold_results(
        kfold_results, crits=crit
    )

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # get latlons in same shuffled, cross-validated order
    ll = latlons[
        np.hstack([test for train, test in kfold_results["cv"].split(latlons)])
    ]

    vmin = kwargs.pop("vmin", np.percentile(truth, 10, axis=0))
    vmax = kwargs.pop("vmax", np.percentile(truth, 90, axis=0))

    # plot obs and preds
    for vx, v in enumerate(val):
        fig, ax = plt.subplots(1, 2, figsize=figsize)
        sc0 = ax[0].scatter(
            ll[:, 1],
            ll[:, 0],
            c=truth[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        sc1 = ax[1].scatter(
            ll[:, 1],
            ll[:, 0],
            c=preds[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        fig.colorbar(sc0, ax=ax[0])
        fig.colorbar(sc1, ax=ax[1])
        fig.suptitle(v.title())
        ax[0].set_title("Observed")
        ax[1].set_title("Predicted")
        if save_dir:
            data = {
                "lon": ll[:, 1],
                "lat": ll[:, 0],
                "truth": truth[:, vx],
                "preds": preds[:, vx],
            }
            _savefig(fig, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix)
            _save_fig_data(
                data, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix
            )
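An illustrative call (not from the source). It assumes kfold_solve was run with return_preds=True and that its return dict carries the fitted "cv" splitter, which this function needs to reorder latlons:

# Hypothetical spatial scatter on synthetic points over the U.S. extent.
import numpy as np

n = 500
latlons = np.column_stack([np.random.uniform(25, 48, n),      # lats
                           np.random.uniform(-126, -65, n)])  # lons
X = np.random.randn(n, 16)
y = X @ np.random.randn(16) + 0.1 * np.random.randn(n)

kfold_results = solve.kfold_solve(X, y, num_folds=5, return_preds=True)
spatial_scatter_obs_v_pred(
    kfold_results,
    latlons,
    model_info="ridge on random features",
    val="population",        # single outcome, so a plain string is fine
    s=2,
    save_dir=None,           # set a path to also write figure + data files
)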
Example #4
def performance_density(
    kfold_results,
    model_info,
    val,
    lims={},
    save_dir=None,
    app_name=None,
    suffix=None,
    kind="kde",
    bw="scott",
    cut=3,
    size=10,
    alpha=0.25,
):
    """Plots a KDE plot of OOS preds across all folds vs obs.

    Args:
        kfold_results (dict of ndarray) :
            As returned using kfold_solve()
        model_info (str) :
            To append to the title of the plot,
            e.g. could pass information about which solve function was used, etc.
        val (str or list of str):
            An ordered list of names of the outcomes in this model. If there
            are not multiple outcomes, this can be a string. Otherwise it must
            be a list of strings of length n_outcomes
        lims (dict of 2-tuple) : Apply lower and upper bounds to the KDE plot for a particular val.
            The format of this dict is val : (lower_bound, upper_bound). If no lim is set
            for a particular val, the default is the lower and upper bound of the observed
            and predicted outcomes combined.
        save_dir (str) : Path to directory in which to save output files. If None, no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only needed if saving
        suffix (str) : The suffix containing the grid, sample, and featurization parameters
            which will be appended to the filename when saving, in order to keep track of
            various sampling and gridding schemes. Only needed if saving
        kind (str) : Type of plot to draw. Default is KDE. Options:
            { "scatter" | "reg" | "resid" | "kde" | "hex" }
        bw ('scott' | 'silverman' | scalar | pair of scalars, optional) : Bandwidth to use for the kernel in kde
            plots. Default is 'scott'. Only implemented for kind='kde'
        cut (numeric) : Kernel is set to go to 0 at min/max data -/+ cut*bw. Only implemented for kind='kde'
    """

    val = _adjust_val_names_str(val)

    # get metrics and preds for best HP's
    best_lambda_idx, best_metrics, best_preds = interpret_kfold_results(
        kfold_results, crits="r2_score"
    )

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # loop over all outcome dimensions
    n_outcomes = preds.shape[1]
    for i in range(n_outcomes):

        this_truth = truth[:, i]
        this_preds = preds[:, i]
        this_val = val[i]

        # calc r2 before clipping
        r2 = metrics.r2_score(this_truth, this_preds)

        # set axis limits for kde plot
        if this_val in lims.keys():
            this_lims = lims[this_val]
        else:
            # select the min and max of input data, expanded by a tiny bit
            offset = (
                max(
                    [
                        this_truth.max() - this_truth.min(),
                        this_preds.max() - this_preds.min(),
                    ]
                )
                / 1000
            )
            this_min = min([this_preds.min(), this_truth.min()]) - offset
            this_max = max([this_preds.max(), this_truth.max()]) + offset
            this_lims = (this_min, this_max)

        print("Plotting {}...".format(this_val))

        # note that below code clips to axes limits before running kernel
        # so if you clip below a large amount of data, that data will be
        # ignored in the plotting (but not in the r2)
        marginal_kws = {}
        if kind == "kde":
            marginal_kws["bw"] = bw
            marginal_kws["clip"] = this_lims
            marginal_kws["cut"] = cut

        # extend the drawing of the joint distribution to the extremes of the
        # data
        joint_kws = marginal_kws.copy()
        if kind == "kde":
            joint_kws["extend"] = "both"

        with sns.axes_style("white"):
            jg = sns.jointplot(
                this_preds,
                this_truth,
                kind=kind,
                height=10,
                xlim=this_lims,
                ylim=this_lims,
                joint_kws=joint_kws,
                marginal_kws=marginal_kws,
                size=size,
                alpha=alpha,
            )

        ## add 1:1 line
        jg.ax_joint.plot(this_lims, this_lims, "k-", alpha=0.75)
        jg.ax_joint.set_xlabel("Predicted")
        jg.ax_joint.set_ylabel("Observed")
        jg.ax_joint.text(
            0.05, 0.95, "r2_score: {:.2f}".format(r2), transform=jg.ax_joint.transAxes
        )

        ## calc metrics
        plt.suptitle(
            "{} Model OOS Performance w/ k-fold CV ({})".format(
                this_val.title(), model_info.title()
            )
        )
        if save_dir:
            fig = plt.gcf()
            _savefig(
                fig,
                save_dir,
                app_name,
                this_val,
                "predVobs_kde",
                suffix,
                tight_layout=True,
            )

            kde_data = {"truth": this_truth, "preds": this_preds}
            _save_fig_data(
                kde_data, save_dir, app_name, this_val, "predVobs_kde", suffix
            )
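A sketch of a typical call (names are hypothetical). The lims entry clips only the plotted axes for that outcome; as the comments above note, the r2 is computed before clipping:

# Hypothetical KDE of observed vs. predicted for one outcome.
performance_density(
    kfold_results,                    # as returned by solve.kfold_solve
    model_info="ridge, 5-fold CV",
    val=["population"],
    lims={"population": (0, 1e4)},    # hypothetical bounds for this outcome
    kind="kde",
    bw="scott",
)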
Example #5
def performance_by_num_train_samples(X,
                                     y,
                                     num_samples,
                                     num_folds=5,
                                     solve_function=solve.ridge_regression,
                                     crit="r2_score",
                                     **solve_kwargs):
    """
    Slices features into smaller subsets of training set (randomization taken care of by Kfold), and reports
    performance of 5 folds on different train set sizes s_i < s = X.shape[0]*(num_folds-1)/num_folds.
    If you rows pulled randomly, shuffle rows of X before inputing to the function.
    args:
        X: n x d array-like, feature representation
        y: n x 1 array-like, labels
        num_samples: list of ints, train set sizes over which to collect performance results
        num_folds: int, default 5, number of cross validation folds
        solve_function: which function to use for the solve, default ridge regression.
        crit (str): citeria for which to optimize hps
        **solve_kwargs (**dict): dictionary of params for solve fxn
    returns:
        l_idxs_by_num_sample: list of ints, the best-agreed (across k-folds) lambda index swept over,
            by train set size
        fold_test_metrics_by_num_samples: list of dicts, results of each of k k-fold models, where lambda is
            set according to l_idxs_by_num_feat for each train set size, uniformly across folds.
            organized in order num_sample
        fold_test_predictions_by_num_samples: list of arrays, test set predictions results from
        each of k k-fold models, where lambda is  set according to l_idxs_by_num_feat for each train set size,
        uniformly across folds.
        num_samples_taken: the number of samples actually taken for each model.
    """

    solve_kwargs["return_preds"] = True

    if np.max(num_samples) > int(X.shape[0] * (num_folds - 1) / num_folds):
        warnings.warn(
            "not enough training points to satisfy {0} samples; ".format(
                np.max(num_samples)) +
            "we will use the maximum number available for the largest sizes, which is {0}"
            .format(int(X.shape[0] * (num_folds - 1) / num_folds)))

    test_metrics_by_num_samples = []
    l_idxs_by_num_samples = []
    test_predictions_by_num_samples = []
    print(" on run (of {0}):".format(len(num_samples)), end=" ")

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)

    num_samples_taken = []
    for i, num_samp in enumerate(num_samples):
        print(i + 1, end=" ")
        results = []
        # take out the val set before sub-indexing. Because we fixed the random state of KFold, we will
        # get the same val_idxs for each fold every time.
        for train_idxs, val_idxs in kf.split(X):
            X_train = X[train_idxs]
            y_train = y[train_idxs]
            X_val = X[val_idxs]
            y_val = y[val_idxs]
            # now do results by number of samples.
            results_by_fold = solve.single_solve(X_train[:num_samp, :],
                                                 X_val,
                                                 y_train[:num_samp],
                                                 y_val,
                                                 solve_function=solve_function,
                                                 **solve_kwargs)
            results.append(results_by_fold)

        # record the number of samples actually taken (recorded only for the last
        # fold; it should not differ between folds by more than one).
        num_samples_taken.append(X_train[:num_samp, :].shape[0])

        # compile the per-fold results so they look like a single kfold_solve return
        results_compiled = {}
        for key in results[0].keys():
            # index everything by zero to avoid having an extra index when we send to interpret_results
            results_compiled[key] = np.array(
                [results[f][key][0] for f in range(num_folds)])

        # results should be packed as if they were all just in a single fold
        (
            best_idxs,
            metrics_best_idx,
            y_pred_best_idx,
        ) = interpret_results.interpret_kfold_results(results_compiled, crit)
        test_metrics_by_num_samples.append(metrics_best_idx)
        l_idxs_by_num_samples.append(best_idxs)
        test_predictions_by_num_samples.append(y_pred_best_idx)

    return (
        np.array(l_idxs_by_num_samples),
        test_metrics_by_num_samples,
        test_predictions_by_num_samples,
        num_samples_taken,
    )
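A sketch of a learning-curve sweep over training-set size (synthetic data; reading "r2_score" from the metric dicts is an assumption). Requesting 500 samples here exceeds the 400 available per train split, so the warning above fires and num_samples_taken caps at 400:

# Hypothetical sweep over train set sizes on toy data.
import numpy as np

X = np.random.randn(500, 16)
y = X @ np.random.randn(16) + 0.1 * np.random.randn(500)

num_samples = [50, 100, 200, 500]
l_idxs, metrics_list, preds_list, n_taken = performance_by_num_train_samples(
    X, y, num_samples, num_folds=5)

for n, fold_metrics in zip(n_taken, metrics_list):
    r2s = [m["r2_score"] for m in np.asarray(fold_metrics).ravel()]
    print(n, np.mean(r2s))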
Example #6
def checkered_predictions_by_radius(
    X,
    y,
    latlons,
    radii,
    extent,
    num_jitter_positions_sqrt=1,
    min_points=0,
    return_hp_idxs=False,
    return_models=False,
    crit="r2_score",
    solve_function=solve.ridge_regression,
    **solve_kwargs,
):
    """
    Consider each grid cell as its own test set, while training on all other cells.
    args:
        X: n x d array of floats, feature matrix
        y: n x 1 array of floats, labels
        latlons: n x 2 array of floats, locations
        radii: list of floats, radii defining the grid at successive trials
        extent: 4x1 list/array of floats, total extent on which to define the grid, e.g. the U.S. is captured
            by extent = [25,48,-126,-65]
        num_jitter_positions_sqrt: int, how many jitter positions to use in each dimension
        min_points: int, the minimum number of points at which to define a set.
        return_hp_idxs: boolean, whether to return the optimal hyperparameter indices
        return_models: boolean, whether to return the models
        crit: which criterion to optimize for (if not r2_score,
                you'll also have to set a flag in interpret_results for minimization)
        solve_function: fxn, which solve function to use
        **solve_kwargs: dict of keyword arguments that you want to pass to the solve function

    returns:
        rets_by_radius: a list of dictionaries of results, where each dictionary in the list
            corresponds to results for one radius value. The structure of each dictionary depends
            on the optional return arguments given.
    """

    # The object to return
    rets_by_radius = []
    # For each radius
    for i, radius in enumerate(radii):
        print(f"Radius: {i + 1}/{len(radii)}: Offset: ", end="")

        # If we're not jittering, just do it once:
        if num_jitter_positions_sqrt == 1:
            print("1/1...")
            idxs_a, idxs_b = put_in_checkers(latlons,
                                             extent,
                                             radius * 2,
                                             min_points=min_points)
            rets_0 = checkered_predictions(
                X,
                y,
                latlons,
                idxs_a,
                idxs_b,
                radius,
                return_hp_idxs=return_hp_idxs,
                return_models=return_models,
                crit=crit,
                solve_function=solve_function,
                **solve_kwargs,
            )

        # If we are jittering, do the same thing as above but num_jitter_positions_sqrt^2 times.
        # use checkered_predictions_just_return_results to get the solve results, and aggregate
        # them with the other data to be returned, per jitter.
        else:
            center_offsets = np.linspace(0, radius, num_jitter_positions_sqrt)
            # print(center_offsets)
            rets_0 = {}

            # mimic the returns of kfold_solve
            jitter_metrics_test = []
            jitter_metrics_train = []
            jitter_preds_test = []
            jitter_preds_train = []
            jitter_models = []
            jitter_y_true_train = []
            jitter_y_true_test = []
            jitter_hp_warning = []

            for dx1, delta_1 in enumerate(center_offsets):
                for dx2, delta_2 in enumerate(center_offsets):
                    n_sample = num_jitter_positions_sqrt * dx1 + dx2 + 1
                    print(f"{n_sample}/{num_jitter_positions_sqrt**2}",
                          end="...")
                    idxs_a, idxs_b = put_in_checkers(
                        latlons,
                        extent,
                        radius * 2,
                        offset_x1=delta_1,
                        offset_x2=delta_2,
                        min_points=min_points,
                    )

                    rets_offset = checkered_predictions_just_return_results(
                        X,
                        y,
                        latlons,
                        idxs_a,
                        idxs_b,
                        radius,
                        return_hp_idxs=return_hp_idxs,
                        return_models=return_models,
                        crit=crit,
                        solve_function=solve_function,
                        **solve_kwargs,
                    )

                    ## cast this into a kfold return - index everything by zero to avoid having an extra index
                    # record performance metrics
                    jitter_metrics_test.append(rets_offset["metrics_test"][0])
                    jitter_metrics_train.append(
                        rets_offset["metrics_train"][0])
                    # record true y
                    jitter_y_true_train.append(rets_offset["y_true_train"][0])
                    jitter_y_true_test.append(rets_offset["y_true_test"][0])
                    # record optional preds and model parameters
                    jitter_preds_test.append(rets_offset["y_pred_test"][0])
                    jitter_preds_train.append(rets_offset["y_pred_train"][0])
                    # record the hp_warnings so that they can be passed to interpret_results
                    jitter_hp_warning.append(rets_offset["hp_warning"][0])
                    # record the model as well if desired
                    if return_models:
                        jitter_models.append(rets_offset["models"][0])

            # Return results
            jittered_results_this_delta = {
                "metrics_test": np.array(jitter_metrics_test),
                "metrics_train": np.array(jitter_metrics_train),
                "y_true_test": np.array(jitter_y_true_test),
                "y_true_train": np.array(jitter_y_true_train),
                "y_pred_test": np.array(jitter_preds_test),
                "y_pred_train": np.array(jitter_preds_train),
                "deltas": (delta_1, delta_2),
                "hp_warning": np.array(jitter_hp_warning),
            }

            # note: each jitter is treated like a fold of kfold cross validation
            (
                best_hp_idxs,
                metrics_best_idx,
                y_pred_best_idx,
            ) = interpret_results.interpret_kfold_results(
                jittered_results_this_delta, crit)

            # default return is a list but there's only one variable so index all at 0
            rets_0["hp_idxs_chosen"] = best_hp_idxs[0]
            rets_0["metrics_test"] = metrics_best_idx[:, 0]
            rets_0["preds_test"] = y_pred_best_idx[:, 0]

        rets_by_radius.append(rets_0)
        print("")

    return rets_by_radius
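A sketch of a checkerboard sweep (illustrative radii; the extent is the U.S. bounding box from the docstring). With num_jitter_positions_sqrt=2, each radius is evaluated on four jittered grids, which are then treated like folds of cross validation:

# Hypothetical spatial-holdout sweep on synthetic points.
import numpy as np

n = 1000
latlons = np.column_stack([np.random.uniform(25, 48, n),
                           np.random.uniform(-126, -65, n)])
X = np.random.randn(n, 16)
y = X @ np.random.randn(16) + 0.1 * np.random.randn(n)

radii = [1.0, 2.0, 4.0]              # degrees; cells are 2 * radius wide
extent = [25, 48, -126, -65]         # [lat_min, lat_max, lon_min, lon_max]

rets_by_radius = checkered_predictions_by_radius(
    X, y, latlons, radii, extent,
    num_jitter_positions_sqrt=2,     # 4 jittered grids per radius
    min_points=10,
)
for radius, rets in zip(radii, rets_by_radius):
    print(radius, rets["hp_idxs_chosen"])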