Code example #1
import numpy as np

from cleanlab import latent_estimation

# `data` is assumed to be a test fixture dict providing noisy labels
# data["s"] and cross-validated predicted probabilities data["psx"].
def test_calibrate_joint():
    cj = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
        calibrate=False,
    )
    calibrated_cj = latent_estimation.calibrate_confident_joint(
        s=data["s"],
        confident_joint=cj,
    )
    s_counts = np.bincount(data["s"])

    # Check calibration: each row sums to the observed count of that label,
    # and the whole matrix sums to the number of examples.
    assert all(calibrated_cj.sum(axis=1).round().astype(int) == s_counts)
    assert len(data["s"]) == int(round(np.sum(calibrated_cj)))

    calibrated_cj2 = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
        calibrate=True,
    )

    # Check equivalence: computing with calibrate=True gives the same result.
    assert np.all(calibrated_cj == calibrated_cj2)
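
A minimal sketch of the same calibration guarantee on hand-made inputs follows (assuming cleanlab v1.x, where latent_estimation exposes compute_confident_joint; the labels s and probabilities psx below are invented purely for illustration):

import numpy as np
from cleanlab import latent_estimation

# Invented toy inputs: 4 examples, 2 classes.
s = np.array([0, 0, 1, 1])
psx = np.array([
    [0.9, 0.1],
    [0.6, 0.4],
    [0.2, 0.8],
    [0.7, 0.3],
])

cj = latent_estimation.compute_confident_joint(s=s, psx=psx, calibrate=True)

# Calibration forces each row to sum to the observed count of that label
# and the whole matrix to sum to the number of examples.
assert all(cj.sum(axis=1).round().astype(int) == np.bincount(s))
assert int(round(cj.sum())) == len(s)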
Code example #2
import numpy as np
from sklearn.metrics import confusion_matrix

# cleanlab v1.x module layout assumed for these imports.
from cleanlab.latent_estimation import calibrate_confident_joint
from cleanlab.pruning import get_noise_indices


def baseline_argmax_confusion_matrix(
    psx,
    s,
    calibrate=False,
    prune_method='prune_by_noise_rate',
):
    '''This is a baseline approach that uses the confusion matrix
    of argmax(psx) and s as the confident joint, then uses cleanlab
    (confident learning) to find the label errors from this matrix.

    Parameters
    ----------

    psx : np.array (shape (N, K))
        P(label=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K classes,
        for each example, of having noisy label s==k, i.e. P(s=k|x).
        psx should have been computed using 3 (or higher) fold cross-validation.

    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.

    Returns
    -------
        A boolean mask that is True if the example at that index is
        estimated to be a label error.'''

    confident_joint = confusion_matrix(np.argmax(psx, axis=1), s).T
    if calibrate:
        confident_joint = calibrate_confident_joint(confident_joint, s)
    return get_noise_indices(
        s=s,
        psx=psx,
        confident_joint=confident_joint,
        prune_method=prune_method,
    )
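
A usage sketch follows (assuming cleanlab v1.x; the random labels and probabilities are invented purely for illustration, so the flagged indices carry no meaning). In practice psx should come from out-of-sample cross-validated predictions of a real model:

import numpy as np

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    N, K = 300, 3
    s = rng.randint(0, K, size=N)              # invented noisy labels
    psx = rng.dirichlet(np.ones(K), size=N)    # invented predicted probabilities

    label_errors = baseline_argmax_confusion_matrix(psx=psx, s=s, calibrate=True)
    print("Flagged %d of %d examples as potential label errors."
          % (label_errors.sum(), N))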