def smooth(D, y, sigma):
    """Predict labels as an RBF-kernel-weighted average of training labels.

    Parameters
    ----------
    D : array-like
        Squared distance matrix. The weights derived from ``D`` are normalized
        along ``axis=1`` and multiplied into ``y`` via ``S.dot(y)``, so
        $D_{i,j}$ is the squared distance between prediction point ``i`` (rows)
        and training point ``j`` (columns).
    y : array-like
        Labels for training points. May be 1d if a single prediction task or 2d
        if >1 prediction tasks; it is passed through
        ``solve_functions.y_to_matrix`` before use.
    sigma : numeric
        The length scale for RBF smoothing, i.e. the weight of a training point
        is proportional to $exp((-.5/sigma**2)*D_{i,j})$.

    Returns
    -------
    smoothed_predictions : array-like
        Predicted labels, one row per row of ``D``.
    """
    labels = solve_functions.y_to_matrix(y)

    # RBF weight of every training point for every prediction point
    weights = da.exp((-0.5 / sigma**2) * D)

    # With a small enough sigma every weight in a row can underflow to 0. Give
    # such rows uniform weights so the prediction falls back to the plain
    # average of the training labels instead of dividing 0 by 0.
    row_totals = weights.sum(axis=1).reshape(-1, 1)
    weights = da.where(row_totals > 0, weights, 1)

    # normalize using the totals of the (possibly reset) weights
    normalizer = weights.sum(axis=1).reshape(-1, 1)
    return weights.dot(labels) / normalizer
def rbf_interpolate_solve(
    latlons_train,
    latlons_val,
    y_train,
    y_val,
    return_preds=True,
    return_model=True,
    interpolate_train=False,
    clip_bounds=None,
    sigmas=(1,),
):
    """
    Uses latlons to do non-parametric estimation of prediction points using RBF kernel
    Note that the latlons have taken the place of the features.

    latlons_train: training instances
    latlons_val: validation instances
    y_train: training labels
    y_val: validation labels
    return_preds (optional): bool, whether to return predictions
    return_model (optional): bool, whether to return the model (here the "model"
        recorded for each hyperparameter is simply the sigma value)
    interpolate_train (optional): bool, whether to also interpolate training instance
        (e.g. for computing training set error). When False, the training labels
        themselves are used as the "training predictions" and the reported training
        metrics are replaced with NaN.
    clip_bounds (None or n_outcomes X 2 2darray): If not None, clip the predictions to these bounds.
            If any of the elements of the array are None, ignore that bound (e.g. if a row of the array
            is [None, 10], apply an upper bound of 10 but no lower bound).
    sigmas: rbf kernel params to sweep over in the solve (as hyperparameters like lambda for ridge regression)

    Returns the results dict created by ``solve_functions._initialize_results_arrays``
    and populated, once per sigma, by ``solve_functions._fill_results_arrays``.
    """
    # if you've got a one dimensional response variable (y is just one column),
    # make sure that it is properly formatted
    y_train = solve_functions.y_to_matrix(y_train)
    y_val = solve_functions.y_to_matrix(y_val)

    # get dimensions needed to shape arrays (only n_outcomes is used here)
    _, n_outcomes, _, _ = solve_functions.get_dim_lengths(
        latlons_train, y_train, y_val)
    n_sigmas = len(sigmas)

    # set up the data structures for reporting results
    results_dict = solve_functions._initialize_results_arrays(
        (n_outcomes, n_sigmas), return_preds, return_model)

    # to take advantage of GPU/CPU cores, convert to dask arrays
    # making sure we're setting threadpool limits to avoid oversubscribing threads
    with threadpool_limits(non_dask_thread_limit):
        latlons_train, latlons_val, y_train_da = [
            da.from_array(xp.asarray(i), chunks=(DASK_CHUNKSIZE, None))
            for i in [latlons_train, latlons_val, y_train]
        ]

        # calculate distances (ignore dask warnings about chunk size increase) -
        # this is intentional
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=da.PerformanceWarning)
            # rows index validation points, columns index training points
            D_valtrain = da.blockwise(
                dist_sq_matrix,
                "ij",
                latlons_val,
                "ik",
                latlons_train,
                "jk",
                dtype=xp.float64,
                concatenate=True,
            )
            if GPU:
                mempool.free_all_blocks()
            if interpolate_train:
                D_traintrain = da.blockwise(
                    dist_sq_matrix,
                    "ij",
                    latlons_train,
                    "ik",
                    latlons_train,
                    "jk",
                    dtype=xp.float64,
                    concatenate=True,
                )

        # loop over all length scales that we are testing
        for g, sigma in enumerate(sigmas):
            smoothed_predictions_val = smooth(D_valtrain, y_train_da, sigma)
            if interpolate_train:
                smoothed_predictions_train = smooth(D_traintrain, y_train_da,
                                                    sigma)
            else:
                # no smoothing requested: echo the training labels back
                smoothed_predictions_train = y_train_da

            # transfer from gpu if needed and turn into in-mem numpy arrays
            smoothed_predictions_train, smoothed_predictions_val = [
                io.gpu_return_and_clear(i).compute() for i in
                [smoothed_predictions_train, smoothed_predictions_val]
            ]

            # clip if needed (this is more easily done in numpy b/c dask does
            # not support assignment by slices)
            if clip_bounds is not None:
                for ix, bounds in enumerate(clip_bounds):
                    # nothing to clip for this outcome when both bounds are
                    # None; checking element-wise with `is None` also works
                    # when a row is a plain list/tuple rather than an ndarray
                    if all(bound is None for bound in bounds):
                        continue
                    smoothed_predictions_train[:, ix] = (
                        smoothed_predictions_train[:, ix].clip(*bounds))
                    smoothed_predictions_val[:, ix] = (
                        smoothed_predictions_val[:, ix].clip(*bounds))

            # assign "model" as sigma param
            model = sigma

            # create tuple of hyperparameter index to match argument structure
            # of _fill_results_arrays function
            hp_tuple = (g, )

            # populate results dict with results from this sigma
            results_dict = solve_functions._fill_results_arrays(
                y_train,
                y_val,
                smoothed_predictions_train,
                smoothed_predictions_val,
                model,
                hp_tuple,
                results_dict,
            )

    # should not actually return r2 of 1 if didn't smooth training
    # instead return NaN. Only do this when training points were NOT
    # interpolated; otherwise the genuine training metrics would be discarded.
    if not interpolate_train:
        # NOTE(review): only index 0 of "metrics_train" is overwritten, as in
        # the original code. If that axis is the outcome axis, other outcomes
        # keep their trivially perfect training metrics -- confirm against
        # solve_functions._initialize_results_arrays.
        for metrics in results_dict["metrics_train"][0]:
            for key in metrics.keys():
                metrics[key] = np.nan

    return results_dict
def performance_density(
    kfold_results,
    model_info,
    val,
    lims=None,
    save_dir=None,
    app_name=None,
    suffix=None,
    kind="kde",
    bw="scott",
    cut=3,
    size=10,
    alpha=0.25,
):
    """Plots a KDE plot of OOS preds across all folds vs obs.

    Args:
        kfold_results (dict of ndarray) :
            As returned using kfold_solve()
        model_info (str) :
            To append to title of the scatter plot,
            e.g. could pass in formation about which solve...etc it was.
        val (str or list of str):
            An ordered list of names of the outcomes in this model. If not
            multiple outcomes, this can be string. Otherwise must be a list of strings
            of length n_outcomes
        lims (dict of 2-tuple or None) : Apply lower and upper bounds to KDE plot for a
            particular val. The format of this dict is val : (lower_bound,upper_bound).
            If no lim is set for a particular val (or ``lims`` is None), the default is
            the lower and upper bound of the observed and predicted outcomes combined,
            widened by 1/1000 of the larger of the two data ranges.
        save_dir (str) : Path to directory in which to save output files. If None, no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only needed if saving
        suffix (str) : The suffix containing the grid, sample, and featurization parameters
            which will be appended to the filename when saving, in order to keep track of
            various sampling and gridding schemes. Only needed if saving
        kind (str) : Type of plot to draw. Default is KDE. Options:
            { "scatter" | "reg" | "resid" | "kde" | "hex" }
        bw ('scott' | 'silverman' | scalar | pair of scalars, optional) : Bandwidth to use for kernel in kde
            plots. Default is 'scott'. Only implemented for kind='kde'
        cut (numeric) : Kernel is set to go to 0 at min/max data -/+ cut*bw. Only implemented for kind='kde'
        size (numeric) : Forwarded to ``sns.jointplot`` as the ``size`` keyword argument.
        alpha (numeric) : Forwarded to ``sns.jointplot`` as the ``alpha`` keyword argument.
    """
    # avoid a shared mutable default; None means "no user-specified limits"
    if lims is None:
        lims = {}

    val = _adjust_val_names_str(val)

    # get preds for best HP's (the best index and metrics are not needed here)
    _, _, best_preds = interpret_kfold_results(kfold_results, crits="r2_score")

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # loop over all outcome dimensions
    n_outcomes = preds.shape[1]
    for i in range(n_outcomes):

        this_truth = truth[:, i]
        this_preds = preds[:, i]
        this_val = val[i]

        # calc r2 before clipping
        r2 = metrics.r2_score(this_truth, this_preds)

        # set axis limits for kde plot
        if this_val in lims:
            this_lims = lims[this_val]
        else:
            # select the min and max of input data, expanded by a tiny bit
            offset = (
                max(
                    [
                        this_truth.max() - this_truth.min(),
                        this_preds.max() - this_preds.min(),
                    ]
                )
                / 1000
            )
            this_min = min([this_preds.min(), this_truth.min()]) - offset
            this_max = max([this_preds.max(), this_truth.max()]) + offset
            this_lims = (this_min, this_max)

        print("Plotting {}...".format(this_val))

        # note that below code clips to axes limits before running kernel
        # so if you clip below a large amount of data, that data will be
        # ignored in the plotting (but not in the r2)
        marginal_kws = {}
        if kind == "kde":
            marginal_kws["bw"] = bw
            marginal_kws["clip"] = this_lims
            marginal_kws["cut"] = cut

        # extend the drawing of the joint distribution to the extremes of the
        # data
        joint_kws = marginal_kws.copy()
        if kind == "kde":
            joint_kws["extend"] = "both"

        # NOTE(review): both `height=10` and `size=size` are passed below; how
        # `size` is interpreted depends on the installed seaborn version --
        # confirm before changing either.
        with sns.axes_style("white"):
            jg = sns.jointplot(
                this_preds,
                this_truth,
                kind=kind,
                height=10,
                xlim=this_lims,
                ylim=this_lims,
                joint_kws=joint_kws,
                marginal_kws=marginal_kws,
                size=size,
                alpha=alpha,
            )

        ## add 1:1 line
        jg.ax_joint.plot(this_lims, this_lims, "k-", alpha=0.75)
        jg.ax_joint.set_xlabel("Predicted")
        jg.ax_joint.set_ylabel("Observed")
        jg.ax_joint.text(
            0.05, 0.95, "r2_score: {:.2f}".format(r2), transform=jg.ax_joint.transAxes
        )

        ## title the figure
        plt.suptitle(
            "{} Model OOS Performance w/ k-fold CV ({})".format(
                this_val.title(), model_info.title()
            )
        )
        if save_dir:
            fig = plt.gcf()
            _savefig(
                fig,
                save_dir,
                app_name,
                this_val,
                "predVobs_kde",
                suffix,
                tight_layout=True,
            )

            # also persist the raw data behind the figure
            kde_data = {"truth": this_truth, "preds": this_preds}
            _save_fig_data(
                kde_data, save_dir, app_name, this_val, "predVobs_kde", suffix
            )
def spatial_scatter_obs_v_pred(
    kfold_results,
    latlons,
    model_info,
    val,
    s=4,
    save_dir=None,
    app_name=None,
    suffix=None,
    figsize=(14, 5),
    crit="r2_score",
    **kwargs
):
    """Plots side-by-side spatial scatters of observed and predicted values.

    Args:
        kfold_results (dict of ndarray) :
            As returned using kfold_solve()
        latlons (nx2 2darray) : lats (first col), lons (second col)
        model_info (str) :
            Descriptive information about the model. Currently not used by this
            function.
        val (str or list of str):
            An ordered list of names of the outcomes in this model. If not
            multiple outcomes, this can be string. Otherwise must be a list of strings
            of length n_outcomes
        s (numeric) : Marker size passed to ``scatter``.
        save_dir (str) : Path to directory in which to save output files. If None, no figures will be saved.
        app_name (str) : The name of the application (e.g. 'housing'). Only needed if saving
        suffix (str) : The suffix containing the grid, sample, and featurization parameters
            which will be appended to the filename when saving, in order to keep track of
            various sampling and gridding schemes. Only needed if saving
        figsize (2-tuple) : Figure size passed to ``plt.subplots``.
        crit (str) : Criterion passed to ``interpret_kfold_results`` to select the best
            hyperparameters.
        **kwargs : Extra keyword arguments forwarded to both ``scatter`` calls. ``vmin``
            and ``vmax`` are handled specially: each may be a scalar (applied to every
            outcome) or a sequence indexed by outcome. They default to the 10th and
            90th percentile of the observed values for each outcome, and the same
            color limits are used for the observed and predicted panels.
    """

    val = _adjust_val_names_str(val)

    # get metrics and preds for best HP's
    _, _, best_preds = interpret_kfold_results(kfold_results, crits=crit)

    # flatten over fold predictions
    preds = np.vstack([solve.y_to_matrix(i) for i in best_preds.squeeze()])
    truth = np.vstack(
        [solve.y_to_matrix(i) for i in kfold_results["y_true_test"].squeeze()]
    )

    # get latlons in same shuffled, cross-validated order
    ll = latlons[
        np.hstack([test for train, test in kfold_results["cv"].split(latlons)])
    ]

    # Color limits. `vmax` must be popped under its own key: popping "vmin"
    # twice would ignore a caller-supplied vmax and leave it in kwargs, where
    # it collides with the explicit vmax= passed to scatter below.
    vmin = kwargs.pop("vmin", None)
    vmax = kwargs.pop("vmax", None)
    if vmin is None:
        vmin = np.percentile(truth, 10, axis=0)
    if vmax is None:
        vmax = np.percentile(truth, 90, axis=0)

    # allow a single scalar limit to apply to every outcome
    n_outcomes = truth.shape[1]
    if np.ndim(vmin) == 0:
        vmin = np.full(n_outcomes, vmin)
    if np.ndim(vmax) == 0:
        vmax = np.full(n_outcomes, vmax)

    # plot obs and preds
    for vx, v in enumerate(val):
        fig, ax = plt.subplots(1, 2, figsize=figsize)
        # lon on the x-axis, lat on the y-axis
        sc0 = ax[0].scatter(
            ll[:, 1],
            ll[:, 0],
            c=truth[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        sc1 = ax[1].scatter(
            ll[:, 1],
            ll[:, 0],
            c=preds[:, vx],
            cmap="viridis",
            alpha=1,
            s=s,
            vmin=vmin[vx],
            vmax=vmax[vx],
            **kwargs
        )
        fig.colorbar(sc0, ax=ax[0])
        fig.colorbar(sc1, ax=ax[1])
        fig.suptitle(v.title())
        ax[0].set_title("Observed")
        ax[1].set_title("Predicted")
        if save_dir:
            # persist both the figure and the data it was drawn from
            data = {
                "lon": ll[:, 1],
                "lat": ll[:, 0],
                "truth": truth[:, vx],
                "preds": preds[:, vx],
            }
            _savefig(fig, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix)
            _save_fig_data(
                data, save_dir, app_name, v, "outcomes_scatter_obsAndPred", suffix
            )
Exemple #5
0
def kfold_solve_no_overlap(X,
                           y,
                           solve_function=solver.ridge_regression,
                           num_folds=5,
                           return_preds=True,
                           return_model=False,
                           **kwargs_solve):
    """Run repeated solves on disjoint training folds against one fixed validation fold.

    ``X`` is partitioned into ``num_folds`` shuffled folds with
    ``KFold(shuffle=True, random_state=0)``. The held-out indices of the first
    fold serve as the validation set for every run; the held-out indices of
    each remaining fold are used, one at a time, as the training set. This
    gives ``num_folds - 1`` solves whose training sets do not overlap.

    Args:
        X : Feature array, indexed by observation along its first axis.
        y : Labels; converted with ``solver.y_to_matrix`` before use.
        solve_function : Callable invoked as
            ``solve_function(X_train, X_val, y_train, y_val, return_preds=...,
            return_model=..., **kwargs_solve)``. Its result must provide the keys
            "metrics_test", "metrics_train" and "hp_warning", plus
            "y_pred_test"/"y_pred_train" when ``return_preds`` is set and "models"
            when ``return_model`` is set.
        num_folds (int) : Number of folds to split into; must be > 1.
        return_preds (bool) : Whether to collect predictions from each run.
        return_model (bool) : Whether to collect models from each run.
        **kwargs_solve : Extra keyword arguments forwarded to ``solve_function``.

    Returns:
        dict with keys "metrics_test", "metrics_train", "y_true_test",
        "y_true_train", "hp_warning" (each an ndarray with one entry per run),
        "cv" (the ``KFold`` object), and optionally "y_pred_test",
        "y_pred_train" and "models".
    """
    assert num_folds > 1

    y = solver.y_to_matrix(y)
    n_outcomes = y.shape[1]

    # per-run accumulators
    metrics_test_runs, metrics_train_runs = [], []
    preds_test_runs, preds_train_runs = [], []
    y_train_runs, y_test_runs = [], []
    model_runs = []
    warning_runs = []

    print("on fold (of {0}): ".format(num_folds), end="")

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)

    # keep only the held-out indices of each fold; these are mutually disjoint
    held_out_idxs = [held_out for _, held_out in kf.split(X)]
    val_idxs = held_out_idxs[0]
    train_splits = held_out_idxs[1:]

    for run_number, train_idxs in enumerate(train_splits, start=1):
        print("{0} ".format(run_number), end="")

        X_train, y_train = X[train_idxs], y[train_idxs]
        X_val, y_val = X[val_idxs], y[val_idxs]

        # record train/test obs for this split, one column per outcome, stored
        # in 1d object arrays
        obs_train = np.empty(n_outcomes, dtype=np.ndarray)
        obs_val = np.empty(n_outcomes, dtype=np.ndarray)
        for outcome in range(n_outcomes):
            obs_train[outcome] = y_train[:, outcome]
            obs_val[outcome] = y_val[:, outcome]
        y_train_runs.append(obs_train)
        y_test_runs.append(obs_val)

        # call solve func
        solve_results = solve_function(
            X_train,
            X_val,
            y_train,
            y_val,
            return_preds=return_preds,
            return_model=return_model,
            **kwargs_solve
        )

        # record performance metrics
        metrics_test_runs.append(solve_results["metrics_test"])
        metrics_train_runs.append(solve_results["metrics_train"])

        # record optional preds and model parameters
        if return_preds:
            preds_test_runs.append(solve_results["y_pred_test"])
            preds_train_runs.append(solve_results["y_pred_train"])
        if return_model:
            model_runs.append(solve_results["models"])

        # record np warnings
        warning_runs.append(solve_results["hp_warning"])

    # assemble results
    rets = {
        "metrics_test": np.array(metrics_test_runs),
        "metrics_train": np.array(metrics_train_runs),
        "y_true_test": np.array(y_test_runs),
        "y_true_train": np.array(y_train_runs),
        "hp_warning": np.array(warning_runs),
        "cv": kf,
    }

    if return_preds:
        rets["y_pred_test"] = np.array(preds_test_runs)
        rets["y_pred_train"] = np.array(preds_train_runs)

    if return_model:
        rets["models"] = np.array(model_runs)

    return rets