def get_coverage_width(PIs: pd.DataFrame, y: NDArray) -> Tuple[float, float]:
    """
    Compute the mean coverage and width of the prediction intervals of a
    DataFrame returned by the ``compute_PIs`` function.

    Parameters
    ----------
    PIs : pd.DataFrame
        DataFrame returned by ``compute_PIs``, with the lower and upper bounds
        of the PIs.

    y : NDArray
        Targets supposedly covered by the PIs.

    Returns
    -------
    (coverage, width) : Tuple[float, float]
        The mean coverage and width of the PIs.
    """
    coverage = regression_coverage_score(y_true=y,
                                         y_pred_low=PIs["lower"],
                                         y_pred_up=PIs["upper"])
    width = regression_mean_width_score(y_pred_low=PIs["lower"],
                                        y_pred_up=PIs["upper"])
    return (coverage, width)
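
A minimal usage sketch (the ``PIs_demo``/``y_demo`` names and toy values are
illustrative only, and numpy/pandas are assumed to be imported as ``np``/``pd``);
it only relies on the "lower"/"upper" column layout described above:

PIs_demo = pd.DataFrame({"lower": [0.5, 1.4, 2.2], "upper": [1.5, 2.6, 3.8]})
y_demo = np.array([1.0, 2.0, 3.0])
coverage_demo, width_demo = get_coverage_width(PIs_demo, y_demo)
# coverage_demo == 1.0 (every target lies inside its interval);
# width_demo is the mean interval width, here (1.0 + 1.2 + 1.6) / 3.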
Example #2
def test_results_prefit_naive() -> None:
    """
    Test that prefit, fit and predict on the same dataset
    is equivalent to the "naive" method.
    """
    estimator = LinearRegression().fit(X, y)
    mapie_reg = MapieRegressor(estimator=estimator, cv="prefit")
    mapie_reg.fit(X, y)
    _, y_pis = mapie_reg.predict(X, alpha=0.05)
    width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean()
    coverage = regression_coverage_score(y, y_pis[:, 0, 0], y_pis[:, 1, 0])
    np.testing.assert_allclose(width_mean, WIDTHS["naive"], rtol=1e-2)
    np.testing.assert_allclose(coverage, COVERAGES["naive"], rtol=1e-2)
Example #3
def test_linear_regression_results(strategy: str) -> None:
    """
    Test expected prediction intervals for
    a multivariate linear regression problem
    with fixed random state.
    """
    mapie = MapieRegressor(**STRATEGIES[strategy])
    mapie.fit(X, y)
    _, y_pis = mapie.predict(X, alpha=0.05)
    y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0]
    width_mean = (y_pred_up - y_pred_low).mean()
    coverage = regression_coverage_score(y, y_pred_low, y_pred_up)
    np.testing.assert_allclose(width_mean, WIDTHS[strategy], rtol=1e-2)
    np.testing.assert_allclose(coverage, COVERAGES[strategy], rtol=1e-2)
Example #4
def test_results_prefit() -> None:
    """Test prefit results on a standard train/validation/test split."""
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=1 / 10, random_state=1
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=1 / 9, random_state=1
    )
    estimator = LinearRegression().fit(X_train, y_train)
    mapie_reg = MapieRegressor(estimator=estimator, cv="prefit")
    mapie_reg.fit(X_val, y_val)
    _, y_pis = mapie_reg.predict(X_test, alpha=0.05)
    width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean()
    coverage = regression_coverage_score(
        y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]
    )
    np.testing.assert_allclose(width_mean, WIDTHS["prefit"], rtol=1e-2)
    np.testing.assert_allclose(coverage, COVERAGES["prefit"], rtol=1e-2)
Example #5
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score

regressor = LinearRegression()
X, y = make_regression(n_samples=500, n_features=1, noise=20, random_state=59)

alpha = [0.05, 0.32]
mapie = MapieRegressor(regressor, method="plus")
mapie.fit(X, y)
y_pred, y_pis = mapie.predict(X, alpha=alpha)
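# y_pis has shape (n_samples, 2, len(alpha)): the second axis holds the
# lower/upper bounds and the last axis indexes the values in `alpha`.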

coverage_scores = [
    regression_coverage_score(y, y_pis[:, 0, i], y_pis[:, 1, i])
    for i, _ in enumerate(alpha)
]

plt.xlabel("x")
plt.ylabel("y")
plt.scatter(X, y, alpha=0.3)
plt.plot(X, y_pred, color="C1")
order = np.argsort(X[:, 0])
plt.plot(X[order], y_pis[order][:, 0, 1], color="C1", ls="--")
plt.plot(X[order], y_pis[order][:, 1, 1], color="C1", ls="--")
plt.fill_between(
    X[order].ravel(),
    y_pis[order][:, 0, 0].ravel(),
    y_pis[order][:, 1, 0].ravel(),
    alpha=0.2,
)
Example #6
                                                  y_train_val,
                                                  test_size=1 / 9)

# Train model on training set
model = MLPRegressor(activation="relu", random_state=1)
model.fit(X_train.reshape(-1, 1), y_train)

# Calibrate uncertainties on validation set
mapie = MapieRegressor(model, cv="prefit")
mapie.fit(X_val.reshape(-1, 1), y_val)

# Evaluate prediction and coverage level on testing set
alpha = 0.1
y_pred, y_pis = mapie.predict(X_test.reshape(-1, 1), alpha=alpha)
y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0]
coverage = regression_coverage_score(y_test, y_pred_low, y_pred_up)

# Plot obtained prediction intervals on testing set
theoretical_semi_width = scipy.stats.norm.ppf(1 - alpha) * sigma
y_test_theoretical = f(X_test)
order = np.argsort(X_test)

plt.scatter(X_test, y_test, color="red", alpha=0.3, label="testing", s=2)
plt.plot(
    X_test[order],
    y_test_theoretical[order],
    color="gray",
    label="True confidence intervals",
)
plt.plot(
    X_test[order],
Example #7
    random_state=random_state,
    n_jobs=-1,
)
cv_obj.fit(X_train, y_train)
best_est = cv_obj.best_estimator_
mapie_non_nested = MapieRegressor(best_est,
                                  method="plus",
                                  cv=cv,
                                  agg_function="median",
                                  n_jobs=-1)
mapie_non_nested.fit(X_train, y_train)
y_pred_non_nested, y_pis_non_nested = mapie_non_nested.predict(
    X_test, alpha=alpha
)
widths_non_nested = y_pis_non_nested[:, 1, 0] - y_pis_non_nested[:, 0, 0]
coverage_non_nested = regression_coverage_score(
    y_test, y_pis_non_nested[:, 0, 0], y_pis_non_nested[:, 1, 0]
)
score_non_nested = mean_squared_error(y_test, y_pred_non_nested, squared=False)

# Nested approach with the CV+ strategy using the Random Forest model.
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
    verbose=0,
    random_state=random_state,
    n_jobs=-1,
)
Example #8
def PIs_vs_dimensions(
    strategies: Dict[str, Any],
    alpha: float,
    n_trial: int,
    dimensions: NDArray,
) -> Dict[str, Dict[int, Dict[str, NDArray]]]:
    """
    Compute the prediction intervals for a linear regression problem.
    Function adapted from Foygel-Barber et al. (2020).

    It repeatedly generates linear data with random noise whose
    signal-to-noise ratio is equal to 10, for each of the input dimensions
    given by the ``dimensions`` list.

    Here we use MAPIE, with a LinearRegression base model, to estimate
    the mean widths and the coverage levels of the prediction intervals
    produced by all the available strategies as a function of
    the dataset dimension.

    This simulation is carried out to emphasize the instability
    of the prediction intervals estimated by the Jackknife strategy
    when the dataset dimension is equal to the number
    of training samples (here 100).

    Parameters
    ----------
    strategies : Dict[str, Dict[str, Any]]
        Dictionary of strategies for estimating prediction intervals,
        with corresponding parameters.
    alpha : float
        1 - (target coverage level).
    n_trial : int
        Number of trials for each dimension for estimating
        prediction intervals.
        For each trial, a new random noise is generated.
    dimensions : NDArray
        Dimension values of the input data.

    Returns
    -------
    Dict[str, Dict[int, Dict[str, NDArray]]]
        Prediction interval widths and coverages for each strategy, trial,
        and dimension value.
    """
    n_train = 100
    n_test = 100
    SNR = 10
    results: Dict[str, Dict[int, Dict[str, NDArray]]] = {
        strategy: {
            dimension: {
                "coverage": np.empty(n_trial),
                "width_mean": np.empty(n_trial),
            }
            for dimension in dimensions
        }
        for strategy in strategies
    }
    for dimension in dimensions:
        for trial in range(n_trial):
            beta = np.random.normal(size=dimension)
            beta_norm = np.sqrt(np.square(beta).sum())
            beta = beta / beta_norm * np.sqrt(SNR)
            X_train = np.random.normal(size=(n_train, dimension))
            noise_train = np.random.normal(size=n_train)
            noise_test = np.random.normal(size=n_test)
            y_train = X_train.dot(beta) + noise_train
            X_test = np.random.normal(size=(n_test, dimension))
            y_test = X_test.dot(beta) + noise_test

            for strategy, params in strategies.items():
                mapie = MapieRegressor(LinearRegression(),
                                       agg_function="median",
                                       n_jobs=-1,
                                       **params)
                mapie.fit(X_train, y_train)
                _, y_pis = mapie.predict(X_test, alpha=alpha)
                coverage = regression_coverage_score(y_test, y_pis[:, 0, 0],
                                                     y_pis[:, 1, 0])
                results[strategy][dimension]["coverage"][trial] = coverage
                width_mean = regression_mean_width_score(
                    y_pis[:, 0, 0], y_pis[:, 1, 0])
                results[strategy][dimension]["width_mean"][trial] = width_mean
    return results
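
A hedged invocation sketch: the ``strategies_demo`` mapping and the small trial
and dimension counts below are illustrative, not the configuration used in the
original script, but they show the expected shape of the arguments and of the
returned nested results.

strategies_demo = {
    "naive": {"method": "naive"},
    "jackknife_plus": {"method": "plus", "cv": -1},
}
results_demo = PIs_vs_dimensions(
    strategies_demo, alpha=0.05, n_trial=2, dimensions=np.array([10, 50])
)
# Mean empirical coverage of the jackknife+ strategy on 50-dimensional data:
print(results_demo["jackknife_plus"][50]["coverage"].mean())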
Example #9
def test_regression_ypredup_type_coverage_score() -> None:
    "Test that list(y_pred_up) gives right coverage."
    scr = regression_coverage_score(y_toy, y_preds[:, 1], list(y_preds[:, 2]))
    assert scr == 0.8
Example #10
def test_regression_ytrue_type_coverage_score() -> None:
    "Test that list(y_true) gives right coverage."
    scr = regression_coverage_score(list(y_toy), y_preds[:, 1], y_preds[:, 2])
    assert scr == 0.8
Example #11
def test_regression_toydata_coverage_score() -> None:
    "Test coverage_score for toy data."
    scr = regression_coverage_score(y_toy, y_preds[:, 1], y_preds[:, 2])
    assert scr == 0.8
Example #12
def test_regression_same_length() -> None:
    "Test when y_true and y_preds have different lengths."
    with pytest.raises(ValueError, match=r".*could not be broadcast*"):
        regression_coverage_score(y_toy, y_preds[:-1, 1], y_preds[:-1, 2])
    with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
        regression_mean_width_score(y_preds[:, :2], y_preds[:, 2])
Example #13
def test_regression_ypredup_shape() -> None:
    "Test shape of y_pred_up."
    with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
        regression_coverage_score(y_toy, y_preds[:, 1], y_preds[:, 1:])
    with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
        regression_mean_width_score(y_preds[:, :2], y_preds[:, 2])