Beispiel #1
0
def test_no_agg_fx_specified_with_subsample() -> None:
    """
    Test that a ``ValueError`` is raised when ``cv`` is a ``Subsample``
    and ``agg_function`` is ``None``.
    """
    with pytest.raises(ValueError,
                       match=r"You need to specify an aggregation*"):
        mapie_reg = MapieRegressor(cv=Subsample(n_resamplings=1),
                                   agg_function=None)
        mapie_reg.fit(X, y)
Beispiel #2
0
def test_prediction_between_low_up(strategy: str) -> None:
    """Check that point predictions fall inside their prediction intervals."""
    regressor = MapieRegressor(**STRATEGIES[strategy])
    regressor.fit(X, y)
    point_pred, intervals = regressor.predict(X, alpha=0.1)
    # Second axis: 0 is the lower bound, 1 the upper bound.
    lower_bound = intervals[:, 0, 0]
    upper_bound = intervals[:, 1, 0]
    assert (point_pred >= lower_bound).all()
    assert (point_pred <= upper_bound).all()
Beispiel #3
0
def test_too_large_cv(cv: Any) -> None:
    """Check that fitting with an oversized ``cv`` raises a ``ValueError``."""
    expected = rf".*Cannot have number of splits n_splits={cv} greater.*"
    regressor = MapieRegressor(cv=cv)
    with pytest.raises(ValueError, match=expected):
        regressor.fit(X_toy, y_toy)
Beispiel #4
0
def test_valid_estimator(strategy: str) -> None:
    """Check that a valid base estimator keeps its type after fitting."""
    regressor = MapieRegressor(
        estimator=DummyRegressor(),
        **STRATEGIES[strategy],
    )
    regressor.fit(X_toy, y_toy)
    assert isinstance(regressor.single_estimator_, DummyRegressor)
    # Every estimator stored in ``estimators_`` must also be a DummyRegressor.
    assert all(
        isinstance(fold_estimator, DummyRegressor)
        for fold_estimator in regressor.estimators_
    )
Beispiel #5
0
def test_results_for_ordered_alpha(strategy: str) -> None:
    """
    Check that interval bounds are consistent for ordered alphas:
    the smaller alpha must give a lower (or equal) lower bound
    and a higher (or equal) upper bound.
    """
    regressor = MapieRegressor(**STRATEGIES[strategy])
    regressor.fit(X, y)
    _, intervals = regressor.predict(X, alpha=[0.05, 0.1])
    # Last axis indexes the alphas in the order they were given.
    lower_bounds = intervals[:, 0, :]
    upper_bounds = intervals[:, 1, :]
    assert (lower_bounds[:, 0] <= lower_bounds[:, 1]).all()
    assert (upper_bounds[:, 0] >= upper_bounds[:, 1]).all()
Beispiel #6
0
def test_linear_data_confidence_interval(strategy: str) -> None:
    """
    Check that, on the toy dataset, the prediction intervals collapse
    onto the point prediction (lower bound == upper bound == prediction).
    """
    regressor = MapieRegressor(**STRATEGIES[strategy])
    regressor.fit(X_toy, y_toy)
    point_pred, intervals = regressor.predict(X_toy, alpha=0.2)
    lower_bound = intervals[:, 0, 0]
    upper_bound = intervals[:, 1, 0]
    np.testing.assert_allclose(lower_bound, upper_bound)
    np.testing.assert_allclose(point_pred, lower_bound)
Beispiel #7
0
def test_results_for_same_alpha(strategy: str) -> None:
    """
    Check that passing the same alpha twice yields
    matching intervals for both entries.
    """
    regressor = MapieRegressor(**STRATEGIES[strategy])
    regressor.fit(X, y)
    _, intervals = regressor.predict(X, alpha=[0.1, 0.1])
    # Compare lower bounds (index 0) first, then upper bounds (index 1).
    for bound in (0, 1):
        np.testing.assert_allclose(
            intervals[:, bound, 0],
            intervals[:, bound, 1],
        )
Beispiel #8
0
def test_predict_output_shape(strategy: str, alpha: Any,
                              dataset: Tuple[NDArray, NDArray]) -> None:
    """Check the shapes of the arrays returned by ``predict``."""
    features, target = dataset
    regressor = MapieRegressor(**STRATEGIES[strategy])
    regressor.fit(features, target)
    point_pred, intervals = regressor.predict(features, alpha=alpha)
    # A scalar alpha counts as a single alpha value.
    if hasattr(alpha, "__len__"):
        n_alpha = len(alpha)
    else:
        n_alpha = 1
    n_samples = features.shape[0]
    assert point_pred.shape == (n_samples, )
    assert intervals.shape == (n_samples, 2, n_alpha)
Beispiel #9
0
def test_invalid_agg_function(agg_function: Any) -> None:
    """
    Test that invalid agg_functions raise errors.

    Two cases are checked: an invalid ``agg_function`` must make ``fit``
    raise a ``ValueError``, and ``agg_function=None`` must make
    ``predict`` raise a ``ValueError`` when ``ensemble=True``.
    """
    mapie_reg = MapieRegressor(agg_function=agg_function)
    with pytest.raises(ValueError, match=r".*Invalid aggregation function.*"):
        mapie_reg.fit(X_toy, y_toy)

    mapie_reg = MapieRegressor(agg_function=None)
    # Fit outside the ``raises`` block: only ``predict`` may satisfy the
    # expectation, so an error raised by ``fit`` cannot mask a missing check.
    mapie_reg.fit(X_toy, y_toy)
    with pytest.raises(ValueError, match=r".*If ensemble is True*"):
        mapie_reg.predict(X_toy, ensemble=True)
Beispiel #10
0
def test_pred_loof_isnan() -> None:
    """Check that an empty validation fold produces an empty prediction."""
    oof_kwargs = dict(
        estimator=LinearRegression(),
        X=X_toy,
        y=y_toy,
        train_index=[0, 1, 2, 3, 4],
        val_index=[],
    )
    regressor = MapieRegressor()
    # The helper returns three values; only the middle one is checked here.
    _, val_pred, _ = regressor._fit_and_predict_oof_model(**oof_kwargs)
    assert len(val_pred) == 0
def compute_PIs(
    estimator: BaseEstimator,
    X_train: NDArray,
    y_train: NDArray,
    X_test: NDArray,
    method: str,
    cv: Any,
    alpha: float,
    agg_function: Optional[str] = None,
) -> pd.DataFrame:
    """
    Fit a MapieRegressor on the training data and collect the bounds
    of the prediction intervals it produces on the test data.

    Parameters
    ----------
    estimator : BaseEstimator
        Base model wrapped by MapieRegressor.
    X_train : NDArray
        Training features.
    y_train : NDArray
        Training targets.
    X_test : NDArray
        Features on which the intervals are predicted.
    method : str
        Forwarded as the ``method`` argument of MapieRegressor.
    cv : Any
        Forwarded as the ``cv`` argument of MapieRegressor.
    alpha : float
        1 - (target coverage level), forwarded to ``predict``.
    agg_function : Optional[str], default=None
        Forwarded as the ``agg_function`` argument of MapieRegressor
        (e.g. 'mean' or 'median').

    Returns
    -------
    pd.DataFrame
        One row per test sample, with columns ``"lower"`` and ``"upper"``.
    """
    regressor = MapieRegressor(
        estimator=estimator,
        method=method,
        cv=cv,
        n_jobs=-1,
        agg_function=agg_function,
    )
    fitted = regressor.fit(X=X_train, y=y_train)
    _, intervals = fitted.predict(X=X_test, alpha=alpha)

    # Only one alpha is passed, so the bounds live at index 0 of the last axis.
    lower_bound = intervals[:, 0, 0]
    upper_bound = intervals[:, 1, 0]
    bounds = np.c_[lower_bound, upper_bound]
    return pd.DataFrame(bounds, columns=["lower", "upper"])
Beispiel #12
0
def test_results_single_and_multi_jobs(strategy: str) -> None:
    """
    Check that predictions and intervals are the same
    with ``n_jobs=1`` and ``n_jobs=-1``.
    """
    regressors = [
        MapieRegressor(n_jobs=n_jobs, **STRATEGIES[strategy])
        for n_jobs in (1, -1)
    ]
    for regressor in regressors:
        regressor.fit(X_toy, y_toy)
    (pred_single, pis_single), (pred_multi, pis_multi) = [
        regressor.predict(X_toy, alpha=0.2) for regressor in regressors
    ]
    np.testing.assert_allclose(pred_single, pred_multi)
    np.testing.assert_allclose(pis_single, pis_multi)
Beispiel #13
0
def test_results_prefit_ignore_method() -> None:
    """
    Check that every method in ``METHODS`` gives the same intervals
    when ``cv="prefit"``.
    """
    prefit_model = LinearRegression().fit(X, y)

    def intervals_for(method):
        # Calibrate a prefit MapieRegressor and return its intervals.
        regressor = MapieRegressor(
            estimator=prefit_model,
            cv="prefit",
            method=method,
        )
        regressor.fit(X, y)
        _, intervals = regressor.predict(X, alpha=0.1)
        return intervals

    intervals_per_method = [intervals_for(method) for method in METHODS]
    for first, second in combinations(intervals_per_method, 2):
        np.testing.assert_allclose(first, second)
Beispiel #14
0
def test_results_prefit_naive() -> None:
    """
    Check that pre-fitting, calibrating and predicting on the same
    dataset reproduces the reference values stored under ``"naive"``.
    """
    prefit_model = LinearRegression().fit(X, y)
    regressor = MapieRegressor(estimator=prefit_model, cv="prefit")
    regressor.fit(X, y)
    _, intervals = regressor.predict(X, alpha=0.05)
    lower_bound, upper_bound = intervals[:, 0, 0], intervals[:, 1, 0]
    mean_width = (upper_bound - lower_bound).mean()
    observed_coverage = regression_coverage_score(y, lower_bound, upper_bound)
    np.testing.assert_allclose(mean_width, WIDTHS["naive"], rtol=1e-2)
    np.testing.assert_allclose(
        observed_coverage, COVERAGES["naive"], rtol=1e-2
    )
Beispiel #15
0
def test_linear_regression_results(strategy: str) -> None:
    """
    Check the mean interval width and the coverage obtained on ``(X, y)``
    against the reference values stored for the given strategy.
    """
    regressor = MapieRegressor(**STRATEGIES[strategy])
    regressor.fit(X, y)
    _, intervals = regressor.predict(X, alpha=0.05)
    lower_bound = intervals[:, 0, 0]
    upper_bound = intervals[:, 1, 0]
    mean_width = (upper_bound - lower_bound).mean()
    observed_coverage = regression_coverage_score(y, lower_bound, upper_bound)
    # Both references are matched within a 1% relative tolerance.
    np.testing.assert_allclose(mean_width, WIDTHS[strategy], rtol=1e-2)
    np.testing.assert_allclose(
        observed_coverage, COVERAGES[strategy], rtol=1e-2
    )
Beispiel #16
0
def test_results_prefit() -> None:
    """Check prefit results on a train/validation/test split."""
    # First hold out the test set, then split the rest into train/validation.
    X_rest, X_holdout, y_rest, y_holdout = train_test_split(
        X, y, test_size=1 / 10, random_state=1
    )
    X_fit, X_calib, y_fit, y_calib = train_test_split(
        X_rest, y_rest, test_size=1 / 9, random_state=1
    )
    prefit_model = LinearRegression().fit(X_fit, y_fit)
    regressor = MapieRegressor(estimator=prefit_model, cv="prefit")
    regressor.fit(X_calib, y_calib)
    _, intervals = regressor.predict(X_holdout, alpha=0.05)
    lower_bound, upper_bound = intervals[:, 0, 0], intervals[:, 1, 0]
    mean_width = (upper_bound - lower_bound).mean()
    observed_coverage = regression_coverage_score(
        y_holdout, lower_bound, upper_bound
    )
    np.testing.assert_allclose(mean_width, WIDTHS["prefit"], rtol=1e-2)
    np.testing.assert_allclose(
        observed_coverage, COVERAGES["prefit"], rtol=1e-2
    )
Beispiel #17
0
def test_aggregate_with_mask_with_prefit() -> None:
    """
    Check the errors raised by ``aggregate_with_mask`` when ``cv`` is
    ``"prefit"`` and when ``agg_function`` is set to ``"nonsense"``.
    """
    # Each case pairs constructor arguments with the expected error pattern.
    failing_cases = (
        (
            {"cv": "prefit"},
            r".*There should not be aggregation of predictions if cv is*",
        ),
        (
            {"agg_function": "nonsense"},
            r".*The value of self.agg_function is not correct*",
        ),
    )
    for init_kwargs, expected_message in failing_cases:
        regressor = MapieRegressor(**init_kwargs)
        with pytest.raises(ValueError, match=expected_message):
            regressor.aggregate_with_mask(k, k)
Beispiel #18
0
def test_results_for_alpha_as_float_and_arraylike(strategy: str,
                                                  alpha: Any) -> None:
    """
    Check that predicting with each element of ``alpha`` separately
    gives the same outputs as predicting with ``alpha`` as a whole.
    """
    regressor = MapieRegressor(**STRATEGIES[strategy])
    regressor.fit(X, y)
    scalar_outputs = [
        regressor.predict(X, alpha=alpha[idx]) for idx in (0, 1)
    ]
    pred_array, pis_array = regressor.predict(X, alpha=alpha)
    for pred_scalar, _ in scalar_outputs:
        np.testing.assert_allclose(pred_scalar, pred_array)
    # The idx-th scalar call must match the idx-th slice of the array call.
    for idx, (_, pis_scalar) in enumerate(scalar_outputs):
        np.testing.assert_allclose(
            pis_scalar[:, :, 0],
            pis_array[:, :, idx],
        )
Beispiel #19
0
def test_prediction_agg_function(method: str, agg_function: str) -> None:
    """
    Check that ``ensemble=True`` and ``ensemble=False`` give the same
    prediction intervals but different point predictions.
    """
    regressor = MapieRegressor(
        method=method,
        cv=2,
        agg_function=agg_function,
    )
    regressor.fit(X, y)
    pred_ens, pis_ens = regressor.predict(X, ensemble=True, alpha=0.1)
    pred_single, pis_single = regressor.predict(X, ensemble=False, alpha=0.1)
    for bound in (0, 1):
        np.testing.assert_allclose(
            pis_ens[:, bound, 0],
            pis_single[:, bound, 0],
        )
    # The point predictions are expected to differ, so the comparison
    # itself must fail.
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(pred_ens, pred_single)
Beispiel #20
0
def test_pipeline_compatibility() -> None:
    """
    Check that MapieRegressor can fit and predict when wrapping
    a pipeline that is fed a pandas DataFrame.
    """
    features = pd.DataFrame({
        "x_cat": ["A", "A", "B", "A", "A", "B"],
        "x_num": [0, 1, 1, 4, np.nan, 5],
        "y": [5, 7, 3, 9, 10, 8],
    })
    target = pd.Series([5, 7, 3, 9, 10, 8])

    impute_numbers = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
    ])
    encode_categories = Pipeline(
        steps=[("encoding", OneHotEncoder(handle_unknown="ignore"))]
    )
    # Only "x_cat" and "x_num" are routed to a transformer.
    column_preprocessing = ColumnTransformer([
        ("cat", encode_categories, ["x_cat"]),
        ("num", impute_numbers, ["x_num"]),
    ])
    full_pipeline = make_pipeline(column_preprocessing, LinearRegression())

    regressor = MapieRegressor(full_pipeline)
    regressor.fit(features, target)
    regressor.predict(features)
Beispiel #21
0
# Typed dictionary holding the two keyword arguments that define a strategy.
Params = TypedDict("Params", {"method": str, "cv": int})
# Strategies to compare, keyed by display name. Each value is unpacked
# as keyword arguments of MapieRegressor in the loop below.
STRATEGIES = {
    "jackknife": Params(method="base", cv=-1),
    "jackknife_plus": Params(method="plus", cv=-1),
    "jackknife_minmax": Params(method="minmax", cv=-1),
    "cv": Params(method="base", cv=10),
    "cv_plus": Params(method="plus", cv=10),
    "cv_minmax": Params(method="minmax", cv=10),
}
# 2 x 3 grid of axes: one subplot per strategy.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2,
                                                       3,
                                                       figsize=(3 * 6, 12))
# Flat list of the axes, in row-major order.
axs = [ax1, ax2, ax3, ax4, ax5, ax6]
for i, (strategy, params) in enumerate(STRATEGIES.items()):
    mapie = MapieRegressor(polyn_model,
                           agg_function="median",
                           n_jobs=-1,
                           **params)
    mapie.fit(X_train.reshape(-1, 1), y_train)
    y_pred, y_pis = mapie.predict(
        X_test.reshape(-1, 1),
        alpha=0.05,
    )
    plot_1d_data(
        X_train,
        y_train,
        X_test,
        y_test,
        y_test_sigma,
        y_pred,
        y_pis[:, 0, 0],
        y_pis[:, 1, 0],
Beispiel #22
0
# Randomized hyper-parameter search over ``rf_params`` for ``rf_model``,
# scored with the negated root mean squared error.
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
    verbose=0,
    random_state=random_state,
    n_jobs=-1,
)
cv_obj.fit(X_train, y_train)
best_est = cv_obj.best_estimator_
# Non-nested approach: the best estimator found above is wrapped in
# MapieRegressor, which is fitted on the same training data with the same
# ``cv``.
mapie_non_nested = MapieRegressor(best_est,
                                  method="plus",
                                  cv=cv,
                                  agg_function="median",
                                  n_jobs=-1)
mapie_non_nested.fit(X_train, y_train)
y_pred_non_nested, y_pis_non_nested = mapie_non_nested.predict(X_test,
                                                               alpha=alpha)
# Interval width per test sample (upper bound minus lower bound).
widths_non_nested = y_pis_non_nested[:, 1, 0] - y_pis_non_nested[:, 0, 0]
# Coverage score of the intervals against the test targets.
coverage_non_nested = regression_coverage_score(y_test, y_pis_non_nested[:, 0,
                                                                         0],
                                                y_pis_non_nested[:, 1, 0])
# NOTE(review): ``squared=False`` presumably yields the RMSE; this keyword is
# deprecated in recent scikit-learn releases -- confirm against the pinned
# version.
score_non_nested = mean_squared_error(y_test, y_pred_non_nested, squared=False)

# Nested approach with the CV+ strategy using the Random Forest model.
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
Beispiel #23
0
def test_valid_agg_function(agg_function: str) -> None:
    """Check that fitting with a valid ``agg_function`` does not fail."""
    MapieRegressor(agg_function=agg_function).fit(X_toy, y_toy)
Beispiel #24
0
# Base model: polynomial feature expansion followed by a linear regression.
polyn_model = Pipeline([
    ("poly", PolynomialFeatures(degree=degree_polyn)),
    ("linear", LinearRegression()),
])

# Estimating prediction intervals
# Typed dictionary holding the two keyword arguments that define a strategy.
Params = TypedDict("Params", {"method": str, "cv": int})
# Strategies to compare, keyed by name; each value is unpacked as keyword
# arguments of MapieRegressor.
STRATEGIES = {
    "jackknife_plus": Params(method="plus", cv=-1),
    "jackknife_minmax": Params(method="minmax", cv=-1),
    "cv_plus": Params(method="plus", cv=10),
    "cv_minmax": Params(method="minmax", cv=10),
}
# Point predictions and prediction intervals, keyed by strategy name.
y_pred, y_pis = {}, {}
for strategy, params in STRATEGIES.items():
    mapie = MapieRegressor(polyn_model, **params)
    mapie.fit(X_train, y_train)
    y_pred[strategy], y_pis[strategy] = mapie.predict(X_test, alpha=0.05)


# Visualization
def plot_1d_data(
    X_train: NDArray,
    y_train: NDArray,
    X_test: NDArray,
    y_test: NDArray,
    y_sigma: float,
    y_pred: NDArray,
    y_pred_low: NDArray,
    y_pred_up: NDArray,
    ax: plt.Axes,
Beispiel #25
0
def test_valid_method(method: str) -> None:
    """
    Check that fitting with a valid ``method`` does not fail
    and sets every attribute listed in ``fit_attributes``.
    """
    regressor = MapieRegressor(method=method)
    regressor.fit(X_toy, y_toy)
    expected_attributes = regressor.fit_attributes
    check_is_fitted(regressor, expected_attributes)
Beispiel #26
0
# Targets: f(X) plus Gaussian noise with standard deviation ``sigma``.
y = f(X) + np.random.normal(0, sigma, n_samples)

# Train/validation/test split
# 1/10 of the data is held out for testing, then 1/9 of the remainder
# for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                            y,
                                                            test_size=1 / 10)
X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                  y_train_val,
                                                  test_size=1 / 9)

# Train model on training set
# Features are reshaped to a single column before being passed to the model.
model = MLPRegressor(activation="relu", random_state=1)
model.fit(X_train.reshape(-1, 1), y_train)

# Calibrate uncertainties on validation set
# ``cv="prefit"`` is used because ``model`` has already been fitted above.
mapie = MapieRegressor(model, cv="prefit")
mapie.fit(X_val.reshape(-1, 1), y_val)

# Evaluate prediction and coverage level on testing set
alpha = 0.1
y_pred, y_pis = mapie.predict(X_test.reshape(-1, 1), alpha=alpha)
# Lower and upper interval bounds for the single alpha value.
y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0]
coverage = regression_coverage_score(y_test, y_pred_low, y_pred_up)

# Plot obtained prediction intervals on testing set
# The noise is Gaussian with standard deviation ``sigma``, so a symmetric
# two-sided interval with coverage 1 - alpha has a half-width equal to the
# (1 - alpha / 2) quantile of the standard normal times ``sigma``.
# Using ppf(1 - alpha) would give the one-sided quantile and an interval
# that only covers 1 - 2 * alpha.
theoretical_semi_width = scipy.stats.norm.ppf(1 - alpha / 2) * sigma
y_test_theoretical = f(X_test)
# Indices that sort the test points by feature value.
order = np.argsort(X_test)

plt.scatter(X_test, y_test, color="red", alpha=0.3, label="testing", s=2)
plt.plot(
Beispiel #27
0
An example plot of :class:`mapie.regression.MapieRegressor` used
in the Quickstart.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score

# Base regressor and a synthetic one-feature regression dataset.
regressor = LinearRegression()
X, y = make_regression(n_samples=500, n_features=1, noise=20, random_state=59)

# Two alpha values: intervals are computed for both in a single call.
alpha = [0.05, 0.32]
mapie = MapieRegressor(regressor, method="plus")
mapie.fit(X, y)
y_pred, y_pis = mapie.predict(X, alpha=alpha)

# One coverage score per alpha, computed on the data used for fitting.
coverage_scores = [
    regression_coverage_score(y, y_pis[:, 0, i], y_pis[:, 1, i])
    for i, _ in enumerate(alpha)
]

plt.xlabel("x")
plt.ylabel("y")
plt.scatter(X, y, alpha=0.3)
plt.plot(X, y_pred, color="C1")
# Sort by the single feature so the interval bounds are drawn left to right.
order = np.argsort(X[:, 0])
# Dashed lines: lower and upper bounds for the second alpha (index 1, 0.32).
plt.plot(X[order], y_pis[order][:, 0, 1], color="C1", ls="--")
plt.plot(X[order], y_pis[order][:, 1, 1], color="C1", ls="--")
Beispiel #28
0
def test_not_enough_resamplings() -> None:
    """
    Check that a ``UserWarning`` is emitted when fitting
    with a ``Subsample`` that uses a single resampling.
    """
    expected_warning = r"WARNING: at least one point of*"
    with pytest.warns(UserWarning, match=expected_warning):
        regressor = MapieRegressor(
            cv=Subsample(n_resamplings=1),
            agg_function="mean",
        )
        regressor.fit(X, y)
Beispiel #29
0
def test_valid_cv(cv: Any) -> None:
    """Check that fitting with a valid ``cv`` does not fail."""
    MapieRegressor(cv=cv).fit(X_toy, y_toy)
Beispiel #30
0
def test_results_with_constant_sample_weights(strategy: str) -> None:
    """
    Check that predictions and intervals are the same whether sample
    weights are ``None``, all ones, or all fives.
    """
    n_samples = len(X)
    weight_options = (
        None,
        np.ones(shape=n_samples),
        np.ones(shape=n_samples) * 5,
    )
    regressors = [MapieRegressor(**STRATEGIES[strategy]) for _ in range(3)]
    for regressor, weights in zip(regressors, weight_options):
        regressor.fit(X, y, sample_weight=weights)
    (pred_none, pis_none), (pred_ones, pis_ones), (pred_fives, pis_fives) = [
        regressor.predict(X, alpha=0.05) for regressor in regressors
    ]
    np.testing.assert_allclose(pred_none, pred_ones)
    np.testing.assert_allclose(pred_ones, pred_fives)
    np.testing.assert_allclose(pis_none, pis_ones)
    np.testing.assert_allclose(pis_ones, pis_fives)