def test_split() -> None:
    """Check the train and test indices yielded by ``Subsample.split``."""
    X = np.array([0, 1, 2, 3])
    cv = Subsample(n_resamplings=2, random_state=1)

    # Each side is gathered from its own call to split(), then flattened
    # into a single 1-D array of indices.
    train_indices = np.concatenate([train for train, _ in cv.split(X)])
    test_indices = np.concatenate([test for _, test in cv.split(X)])

    np.testing.assert_equal(
        train_indices, np.array([1, 3, 0, 0, 3, 1, 3, 1])
    )
    np.testing.assert_equal(test_indices, np.array([2, 0, 2]))
def test_no_agg_fx_specified_with_subsample() -> None:
    """
    Test that a ValueError is raised when a Subsample cv is used with
    ``agg_function=None``.
    """
    # ``match`` is applied with ``re.search``; the trailing ``.*`` replaces a
    # glob-style ``*`` that only quantified the final "n" of "aggregation".
    with pytest.raises(
        ValueError, match=r"You need to specify an aggregation.*"
    ):
        mapie_reg = MapieRegressor(
            cv=Subsample(n_resamplings=1), agg_function=None
        )
        # X and y are not defined in this function; they come from the
        # enclosing module.
        mapie_reg.fit(X, y)
def comparison_JAB(
    # NOTE(review): the default estimator instance is created once, at
    # definition time, and shared by every call that omits ``model``.
    model: BaseEstimator = Ridge2(),
    agg_function: str = "mean",
    alpha: float = 0.1,
    trials: int = 10,
    train_size: int = 200,
    boostrap_size: int = 10,
    B_fixed: int = 50,
    random_state: int = 98765,
) -> pd.DataFrame:
    """
    Launch trials of jackknife-plus and jackknife-plus-after-bootstrap,
    with B fixed and random, for a given number of resampling set sizes and
    a given number of trials, and return the results as a DataFrame.

    Parameters
    ----------
    model : BaseEstimator
        Base model. By default, Ridge2.
    agg_function : str
        Aggregation function to test.
    alpha : float
        1 - (target coverage level).
    trials : int
        Number of trials launched for a given bootstrap set size.
    train_size : int
        Size of the train set.
    boostrap_size : int
        Number of bootstrap sizes to test, uniformly distributed between
        10 and 100% of the train set size. The misspelled parameter name is
        kept so that existing keyword callers keep working.
    B_fixed : int
        Number of resamplings used by the "J+aB Fixed B" variant. For the
        "J+aB Random B" variant the number of resamplings is obtained from
        ``B_random_from_B_fixed(B_fixed, train_size, m, itrial=...)``;
        according to the original documentation it is drawn as
        B ~ Binomial(int(B_fixed/(1-1/(n+1))^m), (1-1/(n+1))^m),
        where n is the training set size and m the resampling set size.
    random_state : int
        Random state. By default, 98765 (from [1]).
        The train/test split of trial ``i`` is seeded with
        ``random_state + i``; every ``Subsample`` is seeded with
        ``random_state`` itself.

    Returns
    -------
    pd.DataFrame
        One row per (trial, method, m), with columns:
          - itrial : the number of the trial
          - estimator : the estimator's class name
          - method : "J+", "J+aB Random B" or "J+aB Fixed B"
          - coverage : PIs' coverage
          - width : mean PI's width
          - m : the resampling set size (0 for the "J+" rows)
          - agg_function : aggregation method
          - alpha : the ``alpha`` the trials were run with
    """

    def coverage_and_width(split, cv):
        """Run the "plus" method with ``cv`` on one train/test split and
        return what ``get_coverage_width`` reports for the test set."""
        X_train, X_test, y_train, y_test = split
        PIs = compute_PIs(
            estimator=model,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            method="plus",
            cv=cv,
            alpha=alpha,
            agg_function=agg_function,
        )
        return get_coverage_width(PIs, y_test)

    estimator_name = type(model).__name__
    (X, y) = get_X_y()
    m_vals = np.round(
        train_size * np.linspace(0.1, 1, num=boostrap_size)
    ).astype(int)

    # Rows are accumulated in a list and turned into a DataFrame once, so
    # the frame size never has to be kept in sync with the loops below.
    rows = []
    for itrial in range(trials):
        split = train_test_split(
            X, y, train_size=train_size, random_state=random_state + itrial
        )

        # J+ (cv=-1), recorded with m = 0
        coverage, width = coverage_and_width(split, cv=-1)
        rows.append([itrial, estimator_name, "J+", coverage, width, 0])

        for i_m, m in enumerate(m_vals):
            # J+aB, random B
            # NOTE(review): ``itrial`` receives the index of ``m``, not the
            # trial number, so none of the arguments depend on the trial.
            # Confirm this is intended.
            B_random = B_random_from_B_fixed(
                B_fixed, train_size, m, itrial=i_m
            )
            subsample_B_random = Subsample(
                n_resamplings=B_random,
                n_samples=m,
                replace=True,
                random_state=random_state,
            )
            coverage, width = coverage_and_width(
                split, cv=subsample_B_random
            )
            rows.append(
                [itrial, estimator_name, "J+aB Random B", coverage, width, m]
            )

            # J+aB, fixed B
            subsample_B_fixed = Subsample(
                n_resamplings=B_fixed,
                n_samples=m,
                replace=True,
                random_state=random_state,
            )
            coverage, width = coverage_and_width(split, cv=subsample_B_fixed)
            rows.append(
                [itrial, estimator_name, "J+aB Fixed B", coverage, width, m]
            )

    results = pd.DataFrame(
        rows,
        columns=["itrial", "estimator", "method", "coverage", "width", "m"],
    )
    results["agg_function"] = agg_function
    results["alpha"] = alpha
    return results.astype({
        "itrial": int,
        "estimator": str,
        "method": str,
        "coverage": float,
        "width": float,
        "m": int,
        "agg_function": str,
    })
def test_not_enough_resamplings() -> None:
    """
    Test that a UserWarning is emitted when a MapieRegressor is built and
    fitted with a single resampling.
    """
    # ``match`` is applied with ``re.search``; the trailing ``.*`` replaces a
    # glob-style ``*`` that only quantified the final "f" of "of".
    with pytest.warns(
        UserWarning, match=r"WARNING: at least one point of.*"
    ):
        mapie_reg = MapieRegressor(
            cv=Subsample(n_resamplings=1), agg_function="mean"
        )
        # X and y are not defined in this function; they come from the
        # enclosing module.
        mapie_reg.fit(X, y)
Params( method="plus", agg_function="mean", cv=KFold(n_splits=3, shuffle=True, random_state=1), ), "cv_minmax": Params( method="minmax", agg_function="mean", cv=KFold(n_splits=3, shuffle=True, random_state=1), ), "jackknife_plus_ab": Params( method="plus", agg_function="mean", cv=Subsample(n_resamplings=30, random_state=1), ), "jackknife_minmax_ab": Params( method="minmax", agg_function="mean", cv=Subsample(n_resamplings=30, random_state=1), ), "jackknife_plus_median_ab": Params( method="plus", agg_function="median", cv=Subsample( n_resamplings=30, random_state=1, ),
def test_default_parameters() -> None:
    """Check the attributes of a Subsample built without arguments."""
    default_cv = Subsample()

    # Thirty resamplings by default; the two optional settings stay unset.
    assert default_cv.n_resamplings == 30
    assert default_cv.n_samples is None
    assert default_cv.random_state is None
def test_get_n_splits() -> None:
    """Check that get_n_splits returns the requested n_resamplings."""
    n_requested = 3
    subsample = Subsample(n_resamplings=n_requested)
    assert subsample.get_n_splits() == n_requested