def test_compute_confident_joint():
    """Sanity-check the total count and shape of the confident joint."""
    confident_joint = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
    )

    # The confident joint must never count more examples than exist.
    total_counted = np.sum(confident_joint)
    assert total_counted <= data["n"]

    # The confident joint is square: one row and one column per class.
    expected_shape = (data["m"], data["m"])
    assert np.shape(confident_joint) == expected_shape
def test_confident_learning_baseline():
    """Off-diagonal indices must match the uncalibrated off-diagonal count."""
    confident_joint, off_diagonal_indices = (
        latent_estimation.compute_confident_joint(
            s=data["s"],
            psx=data["psx"],
            calibrate=False,
            return_indices_of_off_diagonals=True,
        )
    )

    # Everything counted outside the diagonal of the uncalibrated confident
    # joint is a 'label error', so the two quantities must agree.
    off_diagonal_total = np.sum(confident_joint) - np.trace(confident_joint)
    assert len(off_diagonal_indices) == off_diagonal_total
def test_calibrate_joint():
    """Calibrating a raw confident joint must agree with ``calibrate=True``."""
    raw_joint = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
        calibrate=False,
    )
    calibrated = latent_estimation.calibrate_confident_joint(
        s=data["s"],
        confident_joint=raw_joint,
    )
    label_counts = np.bincount(data["s"])

    # Check calibration: rounded row totals equal the per-class label counts,
    # and the rounded grand total equals the number of labels.
    rounded_row_totals = calibrated.sum(axis=1).round().astype(int)
    assert all(rounded_row_totals == label_counts)
    grand_total = int(round(np.sum(calibrated)))
    assert len(data["s"]) == grand_total

    # Check equivalency with the joint calibrated inside
    # compute_confident_joint itself.
    calibrated_directly = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
        calibrate=True,
    )
    assert np.all(calibrated == calibrated_directly)
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    sorted_index_method=None,
    multi_label=False,
    n_jobs=None,
    verbose=0,
):
    """Returns the indices of most likely (confident) label errors in s. The
    number of indices returned is specified by frac_noise. When
    frac_noise = 1.0, all "confident" estimated noise indices are returned.
    * If you encounter the error 'psx is not defined', try setting n_jobs = 1.

    Parameters
    ----------

    s : np.array
      A binary vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \\tilde(y), for ASCII encoding reasons.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x. This is the probability distribution over all K classes, for
      each example, regarding whether the example has label s==k P(s=k|x).
      psx should have been computed using 3+ fold cross-validation.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction observed examples in each class k_s, that are
      mislabeled examples from every other class k_y. If None, the
      inverse_noise_matrix will be computed from psx and s.
      Assumes columns of inverse_noise_matrix sum to 1.
      NOTE(review): this argument is not read anywhere in this function body.

    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=k, y=k). Estimates a confident subset of
      the joint distribution of the noisy and true labels P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into every pair (s=j, y=k) classes. If None, it is computed
      from s and psx with compute_confident_joint.

    frac_noise : float
      When frac_noise = 1.0, return all "confident" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return based on the following formula for example class k.
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k

    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the most likely mislabeled example in class s = 1.

      Note
      ----
      Only set this parameter if ``prune_method == 'prune_by_class'``
      You may use with ``prune_method == 'prune_by_noise_rate'``, but
      if ``num_to_remove_per_class == k``, then either k-1, k, or k+1
      examples may be removed for any class. This is because noise rates
      are floats, and rounding may cause a one-off. If you need exactly
      'k' examples removed from every class, you should use
      ``'prune_by_class'``

    prune_method : str (default: 'prune_by_noise_rate')
      Possible Values: 'prune_by_class', 'prune_by_noise_rate', or 'both'.
      Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with
      *high probability* of being mislabeled for every non-diagonal
      in the prune_counts_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with *smallest
      probability* of belonging to their given class label for every class.
      3. 'both': Finds the examples satisfying (1) AND (2) and
      removes their set conjunction.

    sorted_index_method : {:obj:`None`, :obj:`prob_given_label`, :obj:`normalized_margin`}
      If None, returns a boolean mask (true if example at index is label error)
      If not None, returns an array of the label error indices
      (instead of a bool mask) where error indices are ordered by the either:
      ``'normalized_margin' := normalized margin (p(s = k) - max(p(s != k)))``
      ``'prob_given_label' := [psx[i][labels[i]] for i in label_errors_idx]``

    multi_label : bool
      If true, s should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.

    n_jobs : int (Windows users may see a speed-up with n_jobs = 1)
      Number of processing threads used by multiprocessing. Default None
      sets to the number of processing threads on your CPU.
      Set this to 1 to REMOVE parallel processing (if it's causing issues).

    verbose : int
      If 0, no print statements. If 1, prints when multiprocessing happens.

    Returns
    -------
    If ``sorted_index_method`` is None, the boolean label-error mask.
    Otherwise, whatever ``order_label_errors`` returns for that mask.

    Raises
    ------
    ValueError
      If ``prune_method`` is not one of 'prune_by_class',
      'prune_by_noise_rate', or 'both'.
    AssertionError
      If ``n_jobs`` is given and is smaller than 1."""

    # Fail fast on an unknown prune_method. Without this check neither pruning
    # branch below runs and the function only fails at the very end, after all
    # of the expensive work, with an unhelpful unbound-variable error.
    valid_prune_methods = ('prune_by_class', 'prune_by_noise_rate', 'both')
    if prune_method not in valid_prune_methods:
        raise ValueError(
            "prune_method must be one of {}, but got {!r}".format(
                valid_prune_methods, prune_method)
        )

    # Set-up number of multiprocessing threads
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    else:
        assert (n_jobs >= 1)

    # Number of examples in each class of s
    if multi_label:
        s_counts = value_counts([i for lst in s for i in lst])
    else:
        s_counts = value_counts(s)
    # Number of classes s
    K = len(psx.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(s) > 1e8
    # Ensure labels are of type np.array()
    s = np.asarray(s)

    if confident_joint is None:
        # Imported here rather than at module level, as in the original code.
        from cleanlab.latent_estimation import compute_confident_joint
        confident_joint = compute_confident_joint(
            s=s,
            psx=psx,
            multi_label=multi_label,
        )

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    # NOTE prune_count_matrix is transposed (relative to confident_joint)
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=confident_joint.T,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = round_preserving_row_totals(tmp)

    if n_jobs > 1:  # Prepare multiprocessing shared data
        if multi_label:
            _s = RawArray('I', int2onehot(s).flatten())
        else:
            _s = RawArray('I', s)
        _s_counts = RawArray('I', s_counts)
        _prune_count_matrix = RawArray('I', prune_count_matrix.flatten())
        _psx = RawArray('f', psx.flatten())
    else:  # Multiprocessing is turned off. Create tuple with all parameters
        args = (s, s_counts, prune_count_matrix, psx, multi_label)

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    if prune_method == 'prune_by_class' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                n_jobs,
                initializer=_init,
                initargs=(_s, _s_counts, _prune_count_matrix,
                          prune_count_matrix.shape, _psx, psx.shape,
                          multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by class.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_class, range(K)), total=K),
                    )
                else:
                    noise_masks_per_class = p.map(_prune_by_class, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_class(k, args) for k in range(K)
            ]
        # An example is flagged if any per-class mask flags it.
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        # Keep the by-class result; label_errors_mask is overwritten below.
        label_errors_mask_by_class = label_errors_mask

    if prune_method == 'prune_by_noise_rate' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                n_jobs,
                initializer=_init,
                initargs=(_s, _s_counts, _prune_count_matrix,
                          prune_count_matrix.shape, _psx, psx.shape,
                          multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by noise rate.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K))
                else:
                    noise_masks_per_class = p.map(_prune_by_count, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_count(k, args) for k in range(K)
            ]
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        # 'both' keeps only examples flagged by BOTH pruning methods.
        label_errors_mask = label_errors_mask & label_errors_mask_by_class

    # Remove label errors if given label == model prediction
    if multi_label:
        pred = multiclass_crossval_predict(psx, s)
        s = MultiLabelBinarizer().fit_transform(s)
    else:
        pred = psx.argmax(axis=1)
    for i, pred_label in enumerate(pred):
        if multi_label and np.all(pred_label == s[i]) or \
                not multi_label and pred_label == s[i]:
            label_errors_mask[i] = False

    if sorted_index_method is not None:
        er = order_label_errors(label_errors_mask, psx, s, sorted_index_method)
        return er

    return label_errors_mask
if noise_amount == '8': continue rfn = 'cifar10_noisy_labels__frac_zero_noise_rates__0.{}__noise_amount__0.{}.json'.format( frac_zero_noise_rates, noise_amount) with open(noisy_base_dir + "cifar10_noisy_labels/" + rfn, 'r') as rf: d = json.load(rf) s = np.asarray([v for k, v in d.items()]) true_label_errors = s != y acc = np.sum(s == y) / len(y) print('accuracy of labels:', acc) # Benchmarks label_error_mask = np.zeros(len(s), dtype=bool) label_error_indices = compute_confident_joint( s, psx, return_indices_of_off_diagonals=True)[1] for idx in label_error_indices: label_error_mask[idx] = True conf_joint_only = label_error_mask # # Confident learning optimized # best_f1 = -1 # cl_opt = None # for prune_method in ['prune_by_class', 'prune_by_noise_rate', 'both']: # label_errs = cleanlab.pruning.get_noise_indices( # s, # psx, # prune_method=prune_method, # ) # f1 = precision_recall_fscore_support( # y_true=true_label_errors,
def main():
    """Build and save clean-label training masks for every noisy-label folder.

    For each folder in ``base_dir`` whose name contains 'noise_amount', this
    loads the first file whose name contains 'pyx' as ``psx`` (keeping its
    first 10 columns), loads the matching noisy-label JSON file from
    ``noisy_base_dir``, prints the label accuracy against ``y``, and runs five
    label-error detection methods. The inverse of each resulting mask (i.e.
    the examples considered clean) is saved to
    ``<base_dir><folder>/train_pruned_<method>/train_mask.npy``.
    """
    folders = [c for c in os.listdir(base_dir) if 'noise_amount' in c]
    for folder in sorted(folders):
        print(folder)
        psx_file = [z for z in os.listdir(base_dir + folder) if 'pyx' in z][0]
        psx = np.load(base_dir + folder + "/" + psx_file)
        # Make sure psx is the right shape
        psx = psx[:, :10]

        # Load noisy labels. Both settings are read from the folder name.
        name_parts = folder.split('_')
        frac_zero_noise_rates = name_parts[-7]
        noise_amount = name_parts[-1]
        base_rfn = 'cifar10_noisy_labels__frac_zero_noise_rates__0'
        rfn = base_rfn + '.{}__noise_amount__0.{}.json'.format(
            frac_zero_noise_rates, noise_amount)
        with open(noisy_base_dir + "cifar10_noisy_labels/" + rfn, 'r') as rf:
            d = json.load(rf)
        # Only the values are needed; they are kept in the file's key order.
        s = np.asarray(list(d.values()))
        acc = np.sum(s == y) / len(y)
        print('accuracy of labels:', acc)

        # Benchmark methods to find label errors using confident learning.
        # psx is the n x m matrix of cross-validated predicted probabilities
        # s is the array of given noisy labels

        # Method: C_{\tilde{y}, y^*}
        label_error_mask = np.zeros(len(s), dtype=bool)
        label_error_indices = compute_confident_joint(
            s, psx, return_indices_of_off_diagonals=True)[1]
        for idx in label_error_indices:
            label_error_mask[idx] = True
        baseline_conf_joint_only = label_error_mask

        # Method: C_confusion
        baseline_argmax = baseline_methods.baseline_argmax(psx, s)

        # Method: CL: PBC
        baseline_cl_pbc = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_class')

        # Method: CL: PBNR
        baseline_cl_pbnr = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_noise_rate')

        # Method: CL: C+NR
        baseline_cl_both = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='both')

        # Create folders and store clean label np.array bool masks for
        # training. Each mask is inverted so that True marks a clean example.
        clean_labels = {
            'conf_joint_only': ~baseline_conf_joint_only,
            'pruned_argmax': ~baseline_argmax,
            'cl_pbc': ~baseline_cl_pbc,
            'cl_pbnr': ~baseline_cl_pbnr,
            'cl_both': ~baseline_cl_both,
        }
        for name, labels in clean_labels.items():
            new_folder = base_dir + folder + "/train_pruned_" + name + "/"
            # Re-running the script must not fail on existing folders.
            os.makedirs(new_folder, exist_ok=True)
            np.save(new_folder + "train_mask.npy", labels)
        print()