Example 1
import numpy as np

from cleanlab import baseline_methods


def test_baseline_argmax():
    psx = np.array([
        [0.9, 0.1, 0],
        [0.6, 0.2, 0.2],
        [0.3, 0.3, 0.4],
        [0.1, 0.1, 0.8],
        [0.4, 0.5, 0.1],
    ])
    s = np.array([0, 0, 1, 1, 2])
    label_errors = baseline_methods.baseline_argmax(psx, s)
    assert all(label_errors == [False, False, True, True, True])

    # psx_ and s_ are larger module-level fixtures defined elsewhere in the test file.
    label_errors = baseline_methods.baseline_argmax(psx_, s_)
    assert all(label_errors == np.array([False, False, True, False,
                                         False, False, False, False, False, False]))
Example 2
    #         if f1 > best_f1:
    #             print(prune_method)
    #             best_f1 = f1
    #             cl_opt = label_errs

    results.append({
        'noise_amount_acc': acc,
        'noise_amount': noise_amount,
        'frac_zero_noise_rates': frac_zero_noise_rates,
        'argmax': confusion_matrix(
            y_true=true_label_errors,
            y_pred=baseline_methods.baseline_argmax(psx, s),
        ),
        'argmax_cm': confusion_matrix(
            y_true=true_label_errors,
            y_pred=baseline_methods.baseline_argmax_confusion_matrix(psx, s),
        ),
        'argmax_ccm': confusion_matrix(
            y_true=true_label_errors,
            y_pred=baseline_methods.baseline_argmax_calibrated_confusion_matrix(psx, s),
        ),
        'conf_joint_only': confusion_matrix(
            y_true=true_label_errors,
Example 3
# Notebook excerpt: `labels`, `psx`, and the other masks referenced below
# (label_errors_bool_both, label_errors_bool_cj_only) are defined in earlier cells.
import cleanlab.pruning
from cleanlab import baseline_methods

label_errors_bool_pbc = cleanlab.pruning.get_noise_indices(
    s=labels,
    psx=psx,
    prune_method='prune_by_class',
    sorted_index_method=None,
)

label_errors_bool_pbnr = cleanlab.pruning.get_noise_indices(
    s=labels,
    psx=psx,
    prune_method='prune_by_noise_rate',
    sorted_index_method=None,
)

label_errors_bool_argmax = baseline_methods.baseline_argmax(psx, labels)

# In[7]:

le_idx_both = cleanlab.pruning.order_label_errors(label_errors_bool_both, psx,
                                                  labels)
le_idx_pbc = cleanlab.pruning.order_label_errors(label_errors_bool_pbc, psx,
                                                 labels)
le_idx_pbnr = cleanlab.pruning.order_label_errors(label_errors_bool_pbnr, psx,
                                                  labels)
le_idx_argmax = cleanlab.pruning.order_label_errors(label_errors_bool_argmax,
                                                    psx, labels)
le_idx_cj_only = cleanlab.pruning.order_label_errors(label_errors_bool_cj_only,
                                                     psx, labels)

# In[9]:
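order_label_errors returns the indices of the flagged examples ranked by how strongly cleanlab suspects them of being mislabeled, so a later cell can simply inspect the head of each list. A hedged usage sketch (the printed fields are illustrative):

# Inspect the ten most suspicious examples found by the argmax baseline.
for i in le_idx_argmax[:10]:
    print(f"index={i}  given label={labels[i]}  "
          f"argmax prediction={psx[i].argmax()}  confidence={psx[i].max():.3f}")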
Example 4
import json
import os

import numpy as np

import cleanlab.pruning
from cleanlab import baseline_methods
from cleanlab.latent_estimation import compute_confident_joint

# base_dir, noisy_base_dir, and the true labels y are module-level values
# defined elsewhere in the original script.


def main():

    folders = [c for c in os.listdir(base_dir) if 'noise_amount' in c]
    results = []
    for folder in sorted(folders):
        print(folder)
        psx_file = [z for z in os.listdir(base_dir + folder) if 'pyx' in z][0]
        psx = np.load(base_dir + folder + "/" + psx_file)

        # Make sure psx is the right shape
        psx = psx[:, :10]

        # Load noisy labels
        frac_zero_noise_rates = folder.split('_')[-7]
        noise_amount = folder.split('_')[-1]
        base_rfn = 'cifar10_noisy_labels__frac_zero_noise_rates__0'
        rfn = base_rfn + '.{}__noise_amount__0.{}.json'.format(
            frac_zero_noise_rates, noise_amount)
        with open(noisy_base_dir + "cifar10_noisy_labels/" + rfn, 'r') as rf:
            d = json.load(rf)
        s = np.asarray([v for k, v in d.items()])

        true_label_errors = s != y
        acc = np.sum(s == y) / len(y)
        print('accuracy of labels:', acc)

        # Benchmark methods to find label errors using confident learning.
        # psx is the n x m matrix of cross-validated predicted probabilities
        # s is the array of given noisy labels

        # Method: C_{\tilde{y}, y^*}
        label_error_mask = np.zeros(len(s), dtype=bool)
        label_error_indices = compute_confident_joint(
            s, psx, return_indices_of_off_diagonals=True)[1]
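        # Off-diagonal entries of the confident joint C_{noisy label, true label}
        # correspond to examples whose given label differs from the confidently
        # estimated true label; their indices are treated as suspected label errors.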
        for idx in label_error_indices:
            label_error_mask[idx] = True
        baseline_conf_joint_only = label_error_mask

        # Method: C_confusion
        baseline_argmax = baseline_methods.baseline_argmax(psx, s)

        # Method: CL: PBC
        baseline_cl_pbc = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_class')

        # Method: CL: PBNR
        baseline_cl_pbnr = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_noise_rate')

        # Method: CL: C+NR
        baseline_cl_both = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='both')

        # Create folders and store clean label np.array bool masks for training.
        clean_labels = {
            'conf_joint_only': ~baseline_conf_joint_only,
            'pruned_argmax': ~baseline_argmax,
            'cl_pbc': ~baseline_cl_pbc,
            'cl_pbnr': ~baseline_cl_pbnr,
            'cl_both': ~baseline_cl_both,
        }
        for name, labels in clean_labels.items():
            new_folder = base_dir + folder + "/train_pruned_" + name + "/"
            try:
                os.mkdir(new_folder)
            except FileExistsError:
                pass
            np.save(new_folder + "train_mask.npy", labels)
        print()
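Each saved train_mask.npy is a boolean array marking the examples to keep, i.e. the complement of the detected label errors for that method. A hedged sketch of how a downstream training script might consume one of these masks (the path and data stand-ins are illustrative):

import numpy as np

# Stand-ins for the real CIFAR-10 training data and its noisy labels.
X_train = np.random.rand(50000, 3, 32, 32)
s = np.random.randint(0, 10, size=50000)

# One of the masks written by main() above (hypothetical path).
keep = np.load("train_pruned_cl_pbnr/train_mask.npy")

# Train only on the examples that were not flagged as label errors.
X_clean, s_clean = X_train[keep], s[keep]
print(f"training on {keep.sum()} of {len(keep)} examples")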