def test_invalid_method_in_predict(monkeypatch: Any, method: str) -> None:
    """
    Check that ``predict`` raises a ValueError mentioning "Invalid method".

    ``MapieRegressor._check_parameters`` is replaced by a no-op and
    ``MapieRegressor._select_cv`` by a stub returning ``LeaveOneOut()``
    before the estimator is built and fitted -- presumably so that the
    invalid ``method`` is not rejected before ``predict`` is reached.
    """
    stubs = {
        "_check_parameters": lambda _: None,
        "_select_cv": lambda _: LeaveOneOut(),
    }
    for attribute, stub in stubs.items():
        monkeypatch.setattr(MapieRegressor, attribute, stub)

    regressor = MapieRegressor(DummyRegressor(), method=method)
    regressor.fit(X_boston, y_boston)

    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        regressor.predict(X_boston)
def test_prediction_between_low_up(return_pred: str) -> None:
    """
    Test that the point prediction lies inside its prediction interval.

    The output of ``predict`` is read column-wise: column 0 is the point
    prediction, column 1 the lower bound and column 2 the upper bound.
    """
    mapie = MapieRegressor(LinearRegression(), return_pred=return_pred)
    mapie.fit(X_boston, y_boston)
    y_preds = mapie.predict(X_boston)
    y_pred, y_low, y_up = y_preds[:, 0], y_preds[:, 1], y_preds[:, 2]
    # Two separate asserts rather than a bitwise `&` of two booleans: the
    # pass/fail outcome is the same, but a failure now points at the bound
    # that was actually violated.
    assert (y_pred >= y_low).all()
    assert (y_pred <= y_up).all()
def test_results(method: str) -> None:
    """
    Check that the prediction interval has zero width on the toy data.

    A MapieRegressor wrapping a linear regression is fitted on the toy
    dataset (a linear curve) with 3 splits; the upper and lower bounds
    returned by ``predict`` must then agree to 10 decimals.
    """
    regressor = MapieRegressor(LinearRegression(), method=method, n_splits=3)
    regressor.fit(X_toy, y_toy)
    intervals = regressor.predict(X_toy)
    lower_bound = intervals[:, 1]
    upper_bound = intervals[:, 2]
    assert_almost_equal(upper_bound, lower_bound, 10)
def test_linreg_results(method: str) -> None:
    """
    Compare interval widths and coverage with stored reference values.

    The regressor is fitted on the multivariate regression dataset with
    ``alpha=0.05`` and a fixed random seed. The mean interval width and the
    coverage score must match ``expected_widths[method]`` and
    ``expected_coverages[method]`` to 2 decimals.
    """
    regressor = MapieRegressor(
        LinearRegression(), method=method, alpha=0.05, random_state=SEED
    )
    regressor.fit(X_reg, y_reg)
    intervals = regressor.predict(X_reg)
    lower, upper = intervals[:, 1], intervals[:, 2]

    mean_width = (upper - lower).mean()
    assert_almost_equal(mean_width, expected_widths[method], 2)

    coverage = coverage_score(y_reg, lower, upper)
    assert_almost_equal(coverage, expected_coverages[method], 2)
def PIs_vs_dimensions(
    methods: List[str],
    alpha: float,
    n_trial: int,
    dimensions: List[int]
) -> Dict[str, Dict[int, Dict[str, np.ndarray]]]:
    """
    Simulate prediction intervals on noisy linear data of varying dimension.

    Adapted from Foygel-Barber et al. (2020). For every dimension in
    ``dimensions`` and every trial, a fresh linear dataset is drawn
    (100 training and 100 test samples, Gaussian features, unit Gaussian
    noise, coefficients rescaled so that their squared norm equals the
    signal-to-noise ratio of 10). Each method in ``methods`` is then run
    through a MapieRegressor wrapping a LinearRegression, and the coverage
    and the mean width of the resulting test intervals are stored.

    The simulation is meant to highlight the instability of the intervals
    estimated by the Jackknife method when the dimension equals the number
    of training samples (here 100).

    Parameters
    ----------
    methods : List[str]
        Methods used to estimate the prediction intervals.
    alpha : float
        1 - (target coverage level).
    n_trial : int
        Number of trials per dimension. New coefficients, features and
        noise are drawn for each trial.
    dimensions : List[int]
        Input dimensions to simulate.

    Returns
    -------
    Dict[str, Dict[int, Dict[str, np.ndarray]]]
        ``results[method][dimension]`` holds two arrays of length
        ``n_trial``, under the keys ``"coverage"`` and ``"width_mean"``.
    """
    n_train, n_test = 100, 100
    SNR = 10

    # Pre-allocate one pair of per-trial arrays for each (method, dimension).
    results: Dict[str, Dict[int, Dict[str, np.ndarray]]] = {}
    for method in methods:
        results[method] = {}
        for dimension in dimensions:
            results[method][dimension] = {
                "coverage": np.empty(n_trial),
                "width_mean": np.empty(n_trial),
            }

    for dimension in dimensions:
        for trial in range(n_trial):
            # The draws below rely on the global NumPy random state; their
            # order is kept fixed so seeded runs stay reproducible.
            beta = np.random.normal(size=dimension)
            beta = beta / np.sqrt((beta**2).sum()) * np.sqrt(SNR)
            X_train = np.random.normal(size=(n_train, dimension))
            noise_train = np.random.normal(size=n_train)
            noise_test = np.random.normal(size=n_test)
            y_train = X_train.dot(beta) + noise_train
            X_test = np.random.normal(size=(n_test, dimension))
            y_test = X_test.dot(beta) + noise_test

            # All methods are evaluated on the same dataset for this trial.
            for method in methods:
                mapie = MapieRegressor(
                    LinearRegression(),
                    alpha=alpha,
                    method=method,
                    n_splits=5,
                    shuffle=False,
                    return_pred="ensemble"
                )
                mapie.fit(X_train, y_train)
                y_preds = mapie.predict(X_test)
                y_low, y_up = y_preds[:, 1], y_preds[:, 2]
                scores = results[method][dimension]
                scores["coverage"][trial] = coverage_score(
                    y_test, y_low, y_up
                )
                scores["width_mean"][trial] = (y_up - y_low).mean()
    return results
def test_not_fitted() -> None:
    """Check that ``predict`` on an unfitted estimator raises NotFittedError."""
    unfitted = MapieRegressor(DummyRegressor())
    expected_failure = pytest.raises(NotFittedError, match=r".*not fitted.*")
    with expected_failure:
        unfitted.predict(X_reg)
def test_predicted() -> None:
    """Smoke test: ``fit`` followed by ``predict`` runs without raising."""
    base_estimator = DummyRegressor()
    regressor = MapieRegressor(base_estimator)
    regressor.fit(X_reg, y_reg)
    # The return value is deliberately ignored; only the absence of an
    # exception is checked.
    _ = regressor.predict(X_reg)
def test_predinterv_outputshape() -> None:
    """
    Test the shape of the array returned by ``predict``.

    The output must have one row per input observation and exactly three
    columns.
    """
    mapie = MapieRegressor(DummyRegressor())
    mapie.fit(X_reg, y_reg)
    # Predict once and check both dimensions on the same array, instead of
    # re-running the prediction for each assertion.
    y_preds = mapie.predict(X_reg)
    assert y_preds.shape[0] == X_reg.shape[0]
    assert y_preds.shape[1] == 3
ax.plot(X_test, y_pred, label='Prediction intervals') ax.fill_between(X_test, y_pred_low, y_pred_up, alpha=0.3) ax.set_title(title) ax.legend() X_train, y_train, X_test, y_test, y_test_sigma = get_homoscedastic_data( n_samples=200, n_test=200, sigma=0.1) polyn_model = Pipeline([('poly', PolynomialFeatures(degree=4)), ('linear', LinearRegression(fit_intercept=False))]) methods = [ 'jackknife', 'jackknife_plus', 'jackknife_minmax', 'cv', 'cv_plus', 'cv_minmax' ] fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(3 * 6, 12)) axs = [ax1, ax2, ax3, ax4, ax5, ax6] for i, method in enumerate(methods): mapie = MapieRegressor(polyn_model, method=method, alpha=0.05, n_splits=10, return_pred='ensemble') mapie.fit(X_train.reshape(-1, 1), y_train) y_preds = mapie.predict(X_test.reshape(-1, 1)) plot_1d_data(X_train, y_train, X_test, y_test, y_test_sigma, y_preds[:, 0], y_preds[:, 1], y_preds[:, 2], axs[i], method)
cv=n_cv, scoring="neg_root_mean_squared_error", return_train_score=True, verbose=0, n_jobs=-1, random_state=random_state) cv_obj.fit(X_train, y_train) best_est = cv_obj.best_estimator_ mapie_non_nested = MapieRegressor(best_est, alpha=alpha, method='cv_plus', n_splits=n_cv, return_pred='median', random_state=random_state) mapie_non_nested.fit(X_train, y_train) y_preds_non_nested = mapie_non_nested.predict(X_test) widths_non_nested = y_preds_non_nested[:, 2] - y_preds_non_nested[:, 1] coverage_non_nested = coverage_score(y_test, y_preds_non_nested[:, 1], y_preds_non_nested[:, 2]) score_non_nested = mean_squared_error(y_test, y_preds_non_nested[:, 0], squared=False) # Nested approach with the CV+ method using the Random Forest model. cv_obj = RandomizedSearchCV(rf_model, param_distributions=rf_params, n_iter=n_iter, cv=n_cv, scoring="neg_root_mean_squared_error", return_train_score=True, verbose=0,
An example plot of :class:`mapie.estimators.MapieRegressor` used in the Quickstart. """ import numpy as np from matplotlib import pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.datasets import make_regression from mapie.estimators import MapieRegressor from mapie.metrics import coverage_score regressor = LinearRegression() X, y = make_regression(n_samples=500, n_features=1, noise=20, random_state=59) mapie = MapieRegressor(regressor, method="jackknife_plus") mapie.fit(X, y) y_preds = mapie.predict(X) plt.xlabel('x') plt.ylabel('y') plt.scatter(X, y, alpha=0.3) plt.plot(X, y_preds[:, 0], color='C1') order = np.argsort(X[:, 0]) plt.fill_between(X[order].ravel(), y_preds[:, 1][order], y_preds[:, 2][order], alpha=0.3) plt.title( f"Target coverage = 0.9; Effective coverage = {coverage_score(y, y_preds[:, 1], y_preds[:, 2])}" ) plt.show()