Example 1
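# Context assumed for these snippets: the beta-calibration experiment code.
# `calibrate`, `cross_entropy`, `brier_score`, `beta_test` and the global
# `seed` are project-level helpers/constants that are not shown here.
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from betacal import BetaCalibration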
def cv_p_improvement_correct(base_classifier, x_train, y_train, x_test,
                             y_test, cv=2, score_type=None):
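    # Bin the beta_test p-values into 100 equal-width bins on [0, 1] and
    # count, per bin, how often beta calibration lowered the test log-loss.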
    bins = np.linspace(0, 1, 101)
    improv_counts = np.zeros(100)
    total_counts = np.zeros(100)

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for i, (train, cali) in enumerate(skf.split(X=x_train, y=y_train)):
        if i < cv:
            x_t = x_train[train]
            y_t = y_train[train]
            x_c = x_train[cali]
            y_c = y_train[cali]
            classifier = clone(base_classifier)
            classifier.fit(x_t, y_t)
            ccv = calibrate(classifier, x_c, y_c, method="beta",
                            score_type=score_type)
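            # Uncalibrated scores on the calibration fold and on the test
            # set, plus beta-calibrated test scores.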
            scores_c = ccv.base_estimator.predict_proba(x_c)
            scores = ccv.base_estimator.predict_proba(x_test)
            scores_beta = ccv.predict_proba(x_test)[:, 1]

            ll_before = cross_entropy(scores, y_test)
            ll_after = cross_entropy(scores_beta, y_test)

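            # Test whether the fitted beta-calibration map deviates
            # significantly from the identity, using the calibration-fold
            # scores.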
            test = beta_test(ccv.calibrator.calibrator_.map_,
                             test_type="adev", scores=scores_c)
            p_value = test["p-value"]
            idx = np.digitize(p_value, bins) - 1
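            # np.digitize sends p == 1 past the last bin edge; clamp it into
            # the final bin.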
            if idx == 100:
                idx = 99
            total_counts[idx] += 1
            if ll_after < ll_before:
                improv_counts[idx] += 1
    return {"improv": improv_counts, "total": total_counts}
Example 2
def cv_p_improvement(base_classifier, x_train, y_train, x_test,
                     y_test, cv=2, score_type=None):
    p_values = np.array([])
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for i, (train, cali) in enumerate(skf.split(X=x_train, y=y_train)):
        if i < cv:
            x_t = x_train[train]
            y_t = y_train[train]
            x_c = x_train[cali]
            y_c = y_train[cali]
            classifier = clone(base_classifier)
            classifier.fit(x_t, y_t)
            ccv = calibrate(classifier, x_c, y_c, method="beta",
                            score_type=score_type)
            scores = ccv.base_estimator.predict_proba(x_test)
            scores_beta = ccv.predict_proba(x_test)[:, 1]

            ll_before = cross_entropy(scores, y_test)
            ll_after = cross_entropy(scores_beta, y_test)

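            # Record a p-value only when beta calibration improved the test
            # log-loss. Note that this variant passes the test-set scores to
            # beta_test, whereas cv_p_improvement_correct above uses the
            # calibration-fold scores.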
            if ll_after < ll_before:
                test = beta_test(ccv.calibrator.calibrator_.map_,
                                 test_type="adev", scores=scores)
                p_values = np.append(p_values, test["p-value"])
    return p_values
Example 3
    def fit(self, X, y, sample_weight=None):
        """Fit beta calibration only if the classifier is considered 
        uncalibrated.

        Parameters
        ----------
        X : array-like, shape (n_samples,)
            Training data (predicted scores).

        y : array-like, shape (n_samples,)
            Training target.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        self._calibrator = BetaCalibration(parameters="abm").fit(X, y)
        test = beta_test(self._calibrator.calibrator_.map_,
                         test_type="adev",
                         scores=X)
        if test["p-value"] >= 0.05:
            self._calibrator = _DummyCalibration().fit(X, y)
        return self
Example 4
def cv_confidence_intervals(base_classifier, x_train, y_train, x_test,
                            y_test, cv=2, score_type=None):
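    # Uses only the first CV fold: fit the base classifier on the training
    # part and a beta-calibration map on the held-out part, then return the
    # beta_test output for the fitted map together with the change in test
    # log-loss and Brier score after calibration.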
    intervals = None
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for i, (train, cali) in enumerate(skf.split(X=x_train, y=y_train)):
        if i == 0:
            x_t = x_train[train]
            y_t = y_train[train]
            x_c = x_train[cali]
            y_c = y_train[cali]
            classifier = clone(base_classifier)
            classifier.fit(x_t, y_t)
            ccv = calibrate(classifier, x_c, y_c, method=None,
                            score_type=score_type)

            scores = ccv.predict_proba(x_c)[:, 1]
            scores_test = ccv.predict_proba(x_test)[:, 1]
            ll_before = cross_entropy(scores_test, y_test)
            brier_before = brier_score(scores_test, y_test)

            calibrator = BetaCalibration(parameters="abm").fit(scores, y_c)

            ll_after = cross_entropy(calibrator.predict(scores_test), y_test)
            brier_after = brier_score(calibrator.predict(scores_test), y_test)

            original_map = calibrator.calibrator_.map_
            intervals = beta_test(original_map,
                                  test_type="adev", scores=scores)
            intervals["ll_diff"] = ll_after - ll_before
            intervals["bs_diff"] = brier_after - brier_before
    return intervals
Example 5
def cv_all_p(base_classifier,
             x_train,
             y_train,
             x_test,
             y_test,
             cv=2,
             score_type=None):
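    # Per-fold beta_test p-values and a flag for whether beta calibration
    # improved the test log-loss. p_values_dist is only filled by the
    # moment-based variant below, which is left commented out, so it is
    # returned as zeros.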
    p_values = np.zeros(cv)
    improvements = np.zeros(cv)
    p_values_dist = np.zeros(cv)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for i, (train, cali) in enumerate(skf.split(X=x_train, y=y_train)):
        if i < cv:
            x_t = x_train[train]
            y_t = y_train[train]
            x_c = x_train[cali]
            y_c = y_train[cali]
            classifier = clone(base_classifier)
            classifier.fit(x_t, y_t)
            ccv = calibrate(classifier,
                            x_c,
                            y_c,
                            method="beta",
                            score_type=score_type)
            scores_c = ccv.base_estimator.predict_proba(x_c)
            scores = ccv.base_estimator.predict_proba(x_test)

            test = beta_test(ccv.calibrator.calibrator_.map_,
                             test_type="adev",
                             scores=scores_c)
            p_values[i] = test["p-value"]

            scores_beta = ccv.predict_proba(x_test)[:, 1]

            ll_before = cross_entropy(scores, y_test)
            ll_after = cross_entropy(scores_beta, y_test)
            improvements[i] = ll_after < ll_before

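            # Alternative p-value from a moment-based fit of per-class beta
            # distributions (left disabled in the source):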
            # df_pos = scores_c[y_c == 1]
            # df_neg = scores_c[y_c == 0]
            # alpha_pos, beta_pos = fit_beta_moments(df_pos)
            # alpha_neg, beta_neg = fit_beta_moments(df_neg)
            # a = alpha_pos - alpha_neg
            # if np.isnan(a):
            #     a = 0
            # if a > 100:
            #     a = 100
            # b = beta_neg - beta_pos
            # if np.isnan(b):
            #     b = 0
            # if b > 100:
            #     b = 100
            # prior_pos = len(df_pos) / len(scores_c)
            # prior_neg = len(df_neg) / len(scores_c)
            # m = fit_beta_midpoint(prior_pos, alpha_pos, beta_pos, prior_neg,
            #                       alpha_neg, beta_neg)
            # map = [a, b, m]
            # test = beta_test(map, test_type="adev", scores=scores_c)
            # p_values_dist[i] = test["p-value"]
    return p_values, improvements, p_values_dist