def test_baseline_argmax():
    """Sanity-check ``baseline_methods.baseline_argmax`` on a tiny hand-built example.

    An example is flagged as a label error exactly when the argmax class of
    its predicted-probability row disagrees with the given label ``s``.

    NOTE(review): ``psx_`` and ``s_`` used in the second check are not defined
    in this chunk — presumably module-level fixtures elsewhere in the file;
    confirm before moving this test.
    """
    psx = np.array([
        [0.9, 0.1, 0],
        [0.6, 0.2, 0.2],
        # NOTE(review): the 4 below looks like a typo for 0.4 (other rows sum
        # to 1), but the argmax — and therefore the assertion — is identical
        # either way, so the value is preserved here.
        [0.3, 0.3, 4],
        [0.1, 0.1, 0.8],
        [0.4, 0.5, 0.1],
    ])
    s = np.array([0, 0, 1, 1, 2])

    # Rows 0-1 agree with s; rows 2-4 disagree with the argmax class.
    label_errors = baseline_methods.baseline_argmax(psx, s)
    assert all(label_errors == [False, False, True, True, True])

    # Same check against the module-level fixtures.
    label_errors = baseline_methods.baseline_argmax(psx_, s_)
    expected = np.array(
        [False, False, True, False, False, False, False, False, False, False])
    assert all(label_errors == expected)
# NOTE(review): this chunk is a fragment truncated at both ends — the
# commented-out best-f1 tracking below was dead code in the original, and
# the `results.append({...})` dict literal is cut off mid-expression after
# `'conf_joint_only': confusion_matrix( y_true=true_label_errors,`.
# It records one confusion matrix per label-error-detection method; see the
# surrounding file for the complete call. Left byte-identical here.
# if f1 > best_f1: # print(prune_method) # best_f1 = f1 # cl_opt = label_errs results.append({ 'noise_amount_acc': acc, 'noise_amount': noise_amount, 'frac_zero_noise_rates': frac_zero_noise_rates, 'argmax': confusion_matrix( y_true=true_label_errors, y_pred=baseline_methods.baseline_argmax(psx, s), ), 'argmax_cm': confusion_matrix( y_true=true_label_errors, y_pred=baseline_methods.baseline_argmax_confusion_matrix(psx, s), ), 'argmax_ccm': confusion_matrix( y_true=true_label_errors, y_pred=baseline_methods. baseline_argmax_calibrated_confusion_matrix(psx, s), ), 'conf_joint_only': confusion_matrix( y_true=true_label_errors,
# Compute boolean label-error masks for each pruning method, then turn every
# mask into a ranked ordering of flagged indices for comparison.
# NOTE(review): `labels`, `psx`, `label_errors_bool_both`, and
# `label_errors_bool_cj_only` come from earlier cells of this notebook-style
# script — confirm they are in scope before running this cell standalone.
label_errors_bool_pbc = cleanlab.pruning.get_noise_indices(
    s=labels,
    psx=psx,
    prune_method='prune_by_class',
    sorted_index_method=None,
)
label_errors_bool_pbnr = cleanlab.pruning.get_noise_indices(
    s=labels,
    psx=psx,
    prune_method='prune_by_noise_rate',
    sorted_index_method=None,
)
label_errors_bool_argmax = baseline_methods.baseline_argmax(psx, labels)


# In[7]:


# Rank the flagged examples of every method by self-confidence.
le_idx_both = cleanlab.pruning.order_label_errors(
    label_errors_bool_both, psx, labels)
le_idx_pbc = cleanlab.pruning.order_label_errors(
    label_errors_bool_pbc, psx, labels)
le_idx_pbnr = cleanlab.pruning.order_label_errors(
    label_errors_bool_pbnr, psx, labels)
le_idx_argmax = cleanlab.pruning.order_label_errors(
    label_errors_bool_argmax, psx, labels)
le_idx_cj_only = cleanlab.pruning.order_label_errors(
    label_errors_bool_cj_only, psx, labels)


# In[9]:
def main():
    """Benchmark label-error detection methods on each CIFAR-10 noise setting.

    For every ``*noise_amount*`` folder under ``base_dir``: load the
    cross-validated predicted probabilities (psx) and the matching noisy
    labels, flag label errors with each baseline / confident-learning
    method, and write one boolean "keep this example" train mask per method
    into a ``train_pruned_<method>/`` subfolder.

    NOTE(review): relies on module-level ``base_dir``, ``noisy_base_dir``,
    and true labels ``y`` defined elsewhere in this file.
    """
    folders = [c for c in os.listdir(base_dir) if 'noise_amount' in c]
    results = []
    for folder in sorted(folders):
        print(folder)

        # Cross-validated predicted probabilities for this noise setting.
        psx_file = [z for z in os.listdir(base_dir + folder) if 'pyx' in z][0]
        psx = np.load(base_dir + folder + "/" + psx_file)
        # Make sure psx is the right shape (n x 10 classes).
        psx = psx[:, :10]

        # Load noisy labels; the noise parameters are encoded in the folder name.
        frac_zero_noise_rates = folder.split('_')[-7]
        noise_amount = folder.split('_')[-1]
        base_rfn = 'cifar10_noisy_labels__frac_zero_noise_rates__0'
        rfn = base_rfn + '.{}__noise_amount__0.{}.json'.format(
            frac_zero_noise_rates, noise_amount)
        with open(noisy_base_dir + "cifar10_noisy_labels/" + rfn, 'r') as rf:
            d = json.load(rf)
        s = np.asarray([v for k, v in d.items()])

        # Ground truth: which given labels actually disagree with y.
        true_label_errors = s != y
        acc = np.sum(s == y) / len(y)
        print('accuracy of labels:', acc)

        # Benchmark methods to find label errors using confident learning.
        # psx is the n x m matrix of cross-validated predicted probabilities;
        # s is the array of given noisy labels.

        # Method: C_{\tilde{y}, y^*} — off-diagonal entries of the confident
        # joint are the flagged label errors.
        label_error_mask = np.zeros(len(s), dtype=bool)
        label_error_indices = compute_confident_joint(
            s, psx, return_indices_of_off_diagonals=True)[1]
        for idx in label_error_indices:
            label_error_mask[idx] = True
        baseline_conf_joint_only = label_error_mask

        # Method: C_confusion
        baseline_argmax = baseline_methods.baseline_argmax(psx, s)
        # Method: CL: PBC
        baseline_cl_pbc = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_class')
        # Method: CL: PBNR
        baseline_cl_pbnr = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_noise_rate')
        # Method: CL: C+NR
        baseline_cl_both = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='both')

        # Create folders and store clean-label np.array bool masks for
        # training (True = keep the example, i.e. NOT flagged as an error).
        clean_labels = {
            'conf_joint_only': ~baseline_conf_joint_only,
            'pruned_argmax': ~baseline_argmax,
            'cl_pbc': ~baseline_cl_pbc,
            'cl_pbnr': ~baseline_cl_pbnr,
            'cl_both': ~baseline_cl_both,
        }
        for name, labels in clean_labels.items():
            new_folder = base_dir + folder + "/train_pruned_" + name + "/"
            try:
                os.mkdir(new_folder)
            except FileExistsError:
                pass
            np.save(new_folder + "train_mask.npy", labels)

        # Blank line between folders.
        # NOTE(review): the collapsed source makes the original indent of this
        # print() ambiguous (per-folder vs per-method); per-folder assumed.
        print()