예제 #1
0
def evaluate_l2ce(f, calibrator, z_dist, n):
    """Estimate the squared plug-in calibration error of `calibrator`.

    Args:
        f: Callable applied to the sampled inputs; its outputs are paired
            with the calibrator's outputs as the reference values.
        calibrator: Object exposing ``calibrate(zs)``.
        z_dist: Callable invoked as ``z_dist(size=n)`` to draw the inputs.
        n: Number of samples requested from ``z_dist``.

    Returns:
        ``cal.plugin_ce`` of the binned (calibrated output, reference) pairs,
        squared.
    """
    samples = z_dist(size=n)
    targets = f(samples)
    predictions = calibrator.calibrate(samples)
    # Bins are derived from the calibrator's own outputs.
    discrete_bins = cal.get_discrete_bins(predictions)
    pairs = list(zip(predictions, targets))
    plugin_error = cal.plugin_ce(cal.bin(pairs, discrete_bins))
    return plugin_error ** 2
예제 #2
0
def eval_marginal_calibration(probs, logits, labels, plugin=True):
    """Return the squared calibration error averaged over classes.

    For every class ``c`` the column ``probs[:, c]`` is paired with the
    matching one-hot label column, binned with discrete bins built from that
    column, and scored; the per-class scores are then averaged.

    Args:
        probs: 2-D array indexed as ``probs[:, c]``; ``probs.shape[1]`` is
            taken as the number of classes.
        logits: Unused. Kept so the function still takes three positional
            arguments before ``plugin``.
            NOTE(review): the previous header declared ``probs`` twice, which
            Python rejects as a SyntaxError; the name ``logits`` is an
            assumption -- confirm against the callers.
        labels: Class labels, converted with
            ``cal.get_labels_one_hot(np.array(labels), k)``.
        plugin: When true, score each class with ``cal.plugin_ce(...) ** 2``;
            otherwise use ``cal.unbiased_square_ce(...)``.

    Returns:
        ``np.mean`` of the per-class scores.
    """
    k = probs.shape[1]
    labels_one_hot = cal.get_labels_one_hot(np.array(labels), k)
    ces = []  # One calibration-error score per class.
    for c in range(k):
        probs_c = probs[:, c]
        labels_c = labels_one_hot[:, c]
        data_c = list(zip(probs_c, labels_c))
        bins_c = cal.get_discrete_bins(probs_c)
        binned_data_c = cal.bin(data_c, bins_c)
        if plugin:
            ce_c = cal.plugin_ce(binned_data_c) ** 2
        else:
            ce_c = cal.unbiased_square_ce(binned_data_c)
        ces.append(ce_c)
    return np.mean(ces)
def main():
    """Run the advanced calibration example end to end and print the results.

    Steps: generate synthetic data, estimate a lower bound on the calibration
    error of the raw scores, train a ``PlattBinnerCalibrator``, measure the
    recalibrated scores on a fresh synthetic sample, and report bootstrap
    confidence intervals.
    """
    # Make synthetic dataset.
    np.random.seed(0)  # Keep results consistent.
    num_points = 1000
    (zs, ys) = synthetic_data_1d(num_points=num_points)

    # Estimate a lower bound on the calibration error.
    # Here z_i is the confidence of the uncalibrated model, y_i is the true label.
    # In simple_example.py we used get_calibration_error, but for advanced users
    # we recommend using the more explicit lower_bound_scaling_ce to have
    # more control over functionality, and be explicit about the semantics -
    # that we are only estimating a lower bound.
    l2_calibration_error = calibration.lower_bound_scaling_ce(zs, ys)
    print("Uncalibrated model l2 calibration error is > %.2f%%" % (100 * l2_calibration_error))

    # We can break this down into multiple steps. 1. We choose a binning scheme,
    # 2. we bin the data, and 3. we measure the calibration error.
    # Each of these steps can be customized, and users can substitute the component
    # with their own code.
    data = list(zip(zs, ys))
    bins = calibration.get_equal_bins(zs, num_bins=10)
    l2_calibration_error = calibration.unbiased_l2_ce(calibration.bin(data, bins))
    print("Uncalibrated model l2 calibration error is > %.2f%%" % (100 * l2_calibration_error))

    # Use Platt binning to train a recalibrator.
    calibrator = calibration.PlattBinnerCalibrator(num_points, num_bins=10)
    calibrator.train_calibration(np.array(zs), ys)

    # Measure the calibration error of recalibrated model.
    # In this case we have a binning model, so we can estimate the true calibration error.
    # Again, for advanced users we recommend being explicit and using get_binning_ce instead
    # of get_calibration_error.
    (test_zs, test_ys) = synthetic_data_1d(num_points=num_points)
    calibrated_zs = list(calibrator.calibrate(test_zs))
    l2_calibration_error = calibration.get_binning_ce(calibrated_zs, test_ys)
    print("Scaling-binning l2 calibration error is %.2f%%" % (100 * l2_calibration_error))

    # As above we can break this down into 3 steps. Notice here we have a binning model,
    # so we use get_discrete_bins to get all the bins (all possible values the model
    # outputs).
    data = list(zip(calibrated_zs, test_ys))
    bins = calibration.get_discrete_bins(calibrated_zs)
    binned = calibration.bin(data, bins)
    # Reuse the binned data computed above instead of binning a second time.
    l2_calibration_error = calibration.unbiased_l2_ce(binned)
    print("Scaling-binning l2 calibration error is %.2f%%" % (100 * l2_calibration_error))

    # Compute calibration error and confidence interval.
    # In the simple_example.py we just called get_calibration_error_uncertainties.
    # This function uses the bootstrap to estimate confidence intervals.
    # The bootstrap first requires us to define the functional we are trying to
    # estimate, and then resamples the data multiple times to estimate confidence intervals.
    def estimate_ce(samples, estimator):
        # Re-derive the discrete bins from each sample set before scoring it.
        confidences = [z for z, _ in samples]
        binned_data = calibration.bin(samples, calibration.get_discrete_bins(confidences))
        return estimator(binned_data)

    functional = lambda samples: estimate_ce(samples, calibration.plugin_ce)
    [lower, _, upper] = calibration.bootstrap_uncertainty(data, functional, num_samples=100)
    print("  Confidence interval is [%.2f%%, %.2f%%]" % (100 * lower, 100 * upper))

    # Advanced: bootstrap can be used to debias the l1-calibration error (ECE) as well.
    # This is a heuristic, which does not (yet) come with a formal guarantee.
    functional = lambda samples: estimate_ce(
        samples, lambda x: calibration.plugin_ce(x, power=1))
    [lower, mid, upper] = calibration.bootstrap_uncertainty(data, functional, num_samples=100)
    print("Debiased estimate of L1 calibration error is %.2f%%" % (100 * mid))
    print("  Confidence interval is [%.2f%%, %.2f%%]" % (100 * lower, 100 * upper))
 def estimator(data):
     """Bin `data` and return its plug-in calibration error.

     `bins` and `lp` are read from the enclosing scope, which is not visible
     in this excerpt; `lp` is forwarded as the ``power`` argument.
     """
     return cal.plugin_ce(cal.bin(data, bins), power=lp)
 def plugin_estimator(p, l):
     """Pair `p` with `l`, bin the pairs, and return the plug-in calibration error.

     `bins` and `lp` are read from the enclosing scope, which is not visible
     in this excerpt; `lp` is forwarded as the ``power`` argument.
     """
     pairs = list(zip(p, l))
     return cal.plugin_ce(cal.bin(pairs, bins), power=lp)
예제 #6
0
def eval_top_calibration(probs, logits, labels):
    """Return the squared plug-in calibration error of the top prediction.

    Each entry of ``probs`` is paired with whether the corresponding top
    prediction equals its label; the pairs are binned with discrete bins
    built from ``probs`` and scored with ``cal.plugin_ce``.

    Args:
        probs: Per-example values; zipped one-to-one with the correctness
            flags and used to build the discrete bins.
        logits: Input to ``cal.get_top_predictions``.
            NOTE(review): the previous header declared ``probs`` twice, which
            Python rejects as a SyntaxError. Because ``probs`` is consumed
            per example here, the top predictions are taken from this second
            argument -- presumably per-class scores; confirm against callers.
        labels: True labels, compared element-wise with the top predictions.

    Returns:
        ``cal.plugin_ce`` of the binned data, squared.
    """
    correct = (cal.get_top_predictions(logits) == labels)
    data = list(zip(probs, correct))
    bins = cal.get_discrete_bins(probs)
    binned_data = cal.bin(data, bins)
    return cal.plugin_ce(binned_data) ** 2