Esempio n. 1
0
def test_split() -> None:
    """Check the train and test indices yielded by ``Subsample.split``."""
    X = np.array([0, 1, 2, 3])
    cv = Subsample(n_resamplings=2, random_state=1)
    # The splitter is iterated twice, once per side of the (train, test)
    # pairs, exactly as many times as it is consumed below.
    train_parts = [train for train, _ in cv.split(X)]
    test_parts = [test for _, test in cv.split(X)]
    np.testing.assert_equal(
        np.concatenate(train_parts),
        np.array([1, 3, 0, 0, 3, 1, 3, 1]),
    )
    np.testing.assert_equal(
        np.concatenate(test_parts),
        np.array([2, 0, 2]),
    )
Esempio n. 2
0
def test_no_agg_fx_specified_with_subsample() -> None:
    """
    Test that an error is raised if ``cv`` is a Subsample
    and no aggregation function is specified.
    """
    # ``match`` is a regular expression: ``.*`` (not a bare ``*``) is the
    # way to say "anything may follow the prefix".
    with pytest.raises(ValueError,
                       match=r"You need to specify an aggregation.*"):
        mapie_reg = MapieRegressor(cv=Subsample(n_resamplings=1),
                                   agg_function=None)
        mapie_reg.fit(X, y)
Esempio n. 3
0
def comparison_JAB(
    model: BaseEstimator = Ridge2(),
    agg_function: str = "mean",
    alpha: float = 0.1,
    trials: int = 10,
    train_size: int = 200,
    boostrap_size: int = 10,
    B_fixed: int = 50,
    random_state: int = 98765,
) -> pd.DataFrame:
    """
    Launch trials of jackknife-plus and jackknife-plus-after-bootstrap,
    with B fixed and random, for a given number of resampling set sizes and
    a given number of trials, and return the results as a DataFrame.

    Parameters
    ----------
    model : BaseEstimator
        Base model. By default, Ridge2. Note that the default instance is
        created once, when the function is defined, and is therefore shared
        by every call that does not pass ``model``.
    agg_function : str
        Aggregation function to test.
    alpha : float
        1 - (target coverage level).
    trials : int
        Number of trials launched for each resampling set size.
    train_size : int
        Size of the train set.
    boostrap_size : int
        Number of resampling set sizes to test,
        uniformly distributed between 10 and 100%
        of the train set size.
    B_fixed : int
        Number of resamplings used for the "fixed B" J+aB runs.
        For the "random B" runs, the number of bootstrap samples
        is drawn as
        B ~ Binomial(int(B_fixed/(1-1/(n+1))^m),(1-1/(n+1))^m),
        where n is the training set size, and m the resampling set size.
    random_state : int
        Random state. By default, 98765 (from [1]).

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per (trial, method, m) and columns:
        - itrial : the number of the trial
        - estimator : the estimator's class name
        - method : "J+", "J+aB Random B" or "J+aB Fixed B"
        - coverage : PIs' coverage
        - width : mean PI's width
        - m : the resampling set size (0 for the "J+" rows)
        - agg_function : aggregation method
        - alpha : the ``alpha`` value used
    """
    # Each trial yields one J+ row plus two J+aB rows (random B and fixed B)
    # for every resampling set size.
    results = pd.DataFrame(
        columns=["itrial", "estimator", "method", "coverage", "width", "m"],
        index=np.arange(trials * (2 * boostrap_size + 1)),
    )

    (X, y) = get_X_y()
    m_vals = np.round(train_size *
                      np.linspace(0.1, 1, num=boostrap_size)).astype(int)
    estimator_name = type(model).__name__

    result_index = 0
    for itrial in range(trials):
        # A different split per trial, reproducible through random_state.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size, random_state=random_state + itrial)
        # Arguments common to every compute_PIs call of this trial;
        # only the ``cv`` strategy changes between methods.
        shared_kwargs = dict(
            estimator=model,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            method="plus",
            alpha=alpha,
            agg_function=agg_function,
        )

        # J+ baseline
        PIs = compute_PIs(cv=-1, **shared_kwargs)
        (coverage, width) = get_coverage_width(PIs, y_test)
        results.iloc[result_index, :] = [
            itrial,
            estimator_name,
            "J+",
            coverage,
            width,
            0,
        ]
        result_index += 1

        for i_m, m in enumerate(m_vals):
            # NOTE(review): the ``itrial`` keyword receives the index of m,
            # not the trial index; kept as in the original, confirm that
            # this is intended.
            B_random = B_random_from_B_fixed(B_fixed,
                                             train_size,
                                             m,
                                             itrial=i_m)
            # J+aB with random B first, then with fixed B.
            for method_name, n_resamplings in (
                ("J+aB Random B", B_random),
                ("J+aB Fixed B", B_fixed),
            ):
                subsample = Subsample(
                    n_resamplings=n_resamplings,
                    n_samples=m,
                    replace=True,
                    random_state=random_state,
                )
                PIs = compute_PIs(cv=subsample, **shared_kwargs)
                (coverage, width) = get_coverage_width(PIs, y_test)
                results.iloc[result_index, :] = [
                    itrial,
                    estimator_name,
                    method_name,
                    coverage,
                    width,
                    m,
                ]
                result_index += 1

    results["agg_function"] = agg_function
    results["alpha"] = alpha
    results = results.astype({
        "itrial": int,
        "estimator": str,
        "method": str,
        "coverage": float,
        "width": float,
        "m": int,
        "agg_function": str,
    })
    return results
Esempio n. 4
0
def test_not_enough_resamplings() -> None:
    """Test that a UserWarning is raised when fitting with one resampling."""
    # ``match`` is a regular expression: ``.*`` (not a bare ``*``) is the
    # way to say "anything may follow the prefix".
    with pytest.warns(UserWarning, match=r"WARNING: at least one point of.*"):
        mapie_reg = MapieRegressor(cv=Subsample(n_resamplings=1),
                                   agg_function="mean")
        mapie_reg.fit(X, y)
Esempio n. 5
0
 Params(
     method="plus",
     agg_function="mean",
     cv=KFold(n_splits=3, shuffle=True, random_state=1),
 ),
 "cv_minmax":
 Params(
     method="minmax",
     agg_function="mean",
     cv=KFold(n_splits=3, shuffle=True, random_state=1),
 ),
 "jackknife_plus_ab":
 Params(
     method="plus",
     agg_function="mean",
     cv=Subsample(n_resamplings=30, random_state=1),
 ),
 "jackknife_minmax_ab":
 Params(
     method="minmax",
     agg_function="mean",
     cv=Subsample(n_resamplings=30, random_state=1),
 ),
 "jackknife_plus_median_ab":
 Params(
     method="plus",
     agg_function="median",
     cv=Subsample(
         n_resamplings=30,
         random_state=1,
     ),
Esempio n. 6
0
def test_default_parameters() -> None:
    """Check the attribute values of a Subsample built with no arguments."""
    subsample = Subsample()
    assert subsample.n_resamplings == 30
    assert subsample.n_samples is None
    assert subsample.random_state is None
Esempio n. 7
0
def test_get_n_splits() -> None:
    """Check that get_n_splits returns the requested n_resamplings."""
    n_splits = Subsample(n_resamplings=3).get_n_splits()
    assert n_splits == 3