Example #1
def test_compute_confident_joint():
    cj = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
    )

    # Check that confident joint doesn't overcount number of examples.
    assert (np.sum(cj) <= data["n"])
    # Check that confident joint is correct shape
    assert (np.shape(cj) == (data["m"], data["m"]))
Example #2
def test_confident_learning_baseline():
    cj, indices = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
        calibrate=False,
        return_indices_of_off_diagonals=True,
    )
    # Check that the number of 'label errors' found in off diagonals
    # matches the off diagonals of the uncalibrated confident joint
    assert (len(indices) == (np.sum(cj) - np.trace(cj)))
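
The assertion rests on a counting identity: for any square count matrix, the total sum minus the trace equals the number of off-diagonal counts. A minimal numpy sketch with toy values (hypothetical, not from the test data):

import numpy as np

# Hypothetical 2x2 confident joint: diagonal entries count confidently
# correct labels, off-diagonal entries count confident label errors.
cj = np.array([[20, 3],
               [5, 30]])
num_off_diag = np.sum(cj) - np.trace(cj)  # (20+3+5+30) - (20+30) = 8
assert num_off_diag == 3 + 5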
Example #3
def test_calibrate_joint():
    cj = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
        calibrate=False,
    )
    calibrated_cj = latent_estimation.calibrate_confident_joint(
        s=data["s"],
        confident_joint=cj,
    )
    s_counts = np.bincount(data["s"])

    # Check calibration
    assert (all(calibrated_cj.sum(axis=1).round().astype(int) == s_counts))
    assert (len(data["s"]) == int(round(np.sum(calibrated_cj))))

    calibrated_cj2 = latent_estimation.compute_confident_joint(
        s=data["s"],
        psx=data["psx"],
        calibrate=True,
    )

    # Check equivalency
    assert (np.all(calibrated_cj == calibrated_cj2))
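
What calibration guarantees, matching the two assertions above: row k of the calibrated joint sums to the number of examples labeled k, and the whole matrix sums to the total number of examples. A minimal sketch with hypothetical toy counts; the row rescaling below is an assumption about the approach, and calibrate_confident_joint itself may differ in details such as integer rounding:

import numpy as np

s = np.array([0, 0, 0, 1, 1])   # 3 examples labeled 0, 2 labeled 1
cj = np.array([[2., 2.],        # toy uncalibrated confident counts
               [1., 1.]])
s_counts = np.bincount(s)       # [3, 2]

# Rescale each row so it sums to the observed count of its label.
calibrated = (cj.T / cj.sum(axis=1) * s_counts).T
assert np.allclose(calibrated.sum(axis=1), s_counts)  # rows match s_counts
assert np.isclose(calibrated.sum(), len(s))           # total matches len(s)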
Example #4
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    sorted_index_method=None,
    multi_label=False,
    n_jobs=None,
    verbose=0,
):
    """Returns the indices of most likely (confident) label errors in s. The
    number of indices returned is specified by frac_of_noise. When
    frac_of_noise = 1.0, all "confident" estimated noise indices are returned.
    * If you encounter the error 'psx is not defined', try setting n_jobs = 1.

    Parameters
    ----------

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling. "s"
      denotes the noisy label instead of \\tilde{y}, for ASCII encoding
      reasons.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x.
      This is the probability distribution over all K classes, for each
      example, of the example having label s==k, i.e. P(s=k|x).
      psx should have been computed using 3+ fold cross-validation.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction of observed examples in each class k_s that are
      mislabeled examples from every other class k_y. If None, the
      inverse_noise_matrix will be computed from psx and s.
      Assumes columns of inverse_noise_matrix sum to 1.

    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=k, y=k). Estimates a confident
      subset of the joint distribution of the noisy and true labels P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into each pair of (s=j, y=k) classes.

    frac_noise : float
      When frac_noise = 1.0, return all "confident" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return, based on the following formula for class k:
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently,
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k

    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the single most likely mislabeled example in class s = 2.

      Note
      ----
      Only set this parameter if ``prune_method == 'prune_by_class'``.
      You may use it with ``prune_method == 'prune_by_noise_rate'``, but
      if ``num_to_remove_per_class == k``, then either k-1, k, or k+1
      examples may be removed for any class, because noise rates are
      floats and rounding may introduce an off-by-one. If you need exactly
      'k' examples removed from every class, use ``'prune_by_class'``.

    prune_method : str (default: 'prune_by_noise_rate')
      Possible Values: 'prune_by_class', 'prune_by_noise_rate', or 'both'.
      Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with
      *high probability* of being mislabeled for every non-diagonal entry
      in the prune_count_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with the *smallest
      probability* of belonging to their given class label, for every class.
      3. 'both': finds the examples satisfying (1) AND (2) and
      removes their set intersection.

    sorted_index_method : {:obj:`None`, :obj:`prob_given_label`, :obj:`normalized_margin`}
      If None, returns a boolean mask (True if the example at that index is
      a label error).
      If not None, returns an array of the label error indices
      (instead of a boolean mask), where the error indices are ordered by either:
      ``'normalized_margin' := normalized margin (p(s = k) - max(p(s != k)))``
      ``'prob_given_label' := [psx[i][labels[i]] for i in label_errors_idx]``

    multi_label : bool
      If True, s should be an iterable (e.g. a list) of iterables, containing
      a list of labels for each example, instead of a single label.

    n_jobs : int (Windows users may see a speed-up with n_jobs = 1)
      Number of processing threads used by multiprocessing. The default,
      None, sets it to the number of processing threads on your CPU.
      Set this to 1 to disable parallel processing (if it's causing issues).

    verbose : int
      If 0, no print statements. If 1, prints when multiprocessing happens."""

    # Set-up number of multiprocessing threads
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    else:
        assert (n_jobs >= 1)

    # Number of examples in each class of s
    if multi_label:
        s_counts = value_counts([i for lst in s for i in lst])
    else:
        s_counts = value_counts(s)
    # Number of classes in s
    K = len(psx.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(s) > 1e8
    # Ensure labels are of type np.array()
    s = np.asarray(s)

    if confident_joint is None:
        from cleanlab.latent_estimation import compute_confident_joint
        confident_joint = compute_confident_joint(
            s=s,
            psx=psx,
            multi_label=multi_label,
        )

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    # NOTE prune_count_matrix is transposed (relative to confident_joint)
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=confident_joint.T,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = round_preserving_row_totals(tmp)

    if n_jobs > 1:  # Prepare multiprocessing shared data
        if multi_label:
            _s = RawArray('I', int2onehot(s).flatten())
        else:
            _s = RawArray('I', s)
        _s_counts = RawArray('I', s_counts)
        _prune_count_matrix = RawArray('I', prune_count_matrix.flatten())
        _psx = RawArray('f', psx.flatten())
    else:  # Multiprocessing is turned off. Create tuple with all parameters
        args = (s, s_counts, prune_count_matrix, psx, multi_label)

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    if prune_method == 'prune_by_class' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                    n_jobs,
                    initializer=_init,
                    initargs=(_s, _s_counts, _prune_count_matrix,
                              prune_count_matrix.shape, _psx, psx.shape,
                              multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by class.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_class, range(K)),
                                  total=K))
                else:
                    noise_masks_per_class = p.map(_prune_by_class, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_class(k, args) for k in range(K)
            ]
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        label_errors_mask_by_class = label_errors_mask

    if prune_method == 'prune_by_noise_rate' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                    n_jobs,
                    initializer=_init,
                    initargs=(_s, _s_counts, _prune_count_matrix,
                              prune_count_matrix.shape, _psx, psx.shape,
                              multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by noise rate.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K))
                else:
                    noise_masks_per_class = p.map(_prune_by_count, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_count(k, args) for k in range(K)
            ]
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        label_errors_mask = label_errors_mask & label_errors_mask_by_class

    # Remove label errors if given label == model prediction
    if multi_label:
        pred = multiclass_crossval_predict(psx, s)
        s = MultiLabelBinarizer().fit_transform(s)
    else:
        pred = psx.argmax(axis=1)
    for i, pred_label in enumerate(pred):
        if (multi_label and np.all(pred_label == s[i])) or \
                (not multi_label and pred_label == s[i]):
            label_errors_mask[i] = False

    if sorted_index_method is not None:
        er = order_label_errors(label_errors_mask, psx, s, sorted_index_method)
        return er

    return label_errors_mask
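
A minimal usage sketch of get_noise_indices on synthetic data. The data below are hypothetical, the import path assumes cleanlab 1.x, and in real use psx must come from out-of-sample (cross-validated) predictions rather than being constructed by hand:

import numpy as np
from cleanlab.pruning import get_noise_indices

rng = np.random.RandomState(0)
N, K = 200, 2
y = rng.randint(K, size=N)       # hidden true labels
s = y.copy()
flip = rng.rand(N) < 0.1         # flip ~10% of labels to create noise
s[flip] = 1 - s[flip]
psx = np.full((N, K), 0.1)
psx[np.arange(N), y] = 0.9       # model confidently predicts the true label

mask = get_noise_indices(s, psx, prune_method='prune_by_noise_rate', n_jobs=1)
print('flagged %d candidates; %d labels were actually flipped'
      % (mask.sum(), flip.sum()))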
Example #5
    if noise_amount == '8':
        continue
    rfn = 'cifar10_noisy_labels__frac_zero_noise_rates__0.{}__noise_amount__0.{}.json'.format(
        frac_zero_noise_rates, noise_amount)
    with open(noisy_base_dir + "cifar10_noisy_labels/" + rfn, 'r') as rf:
        d = json.load(rf)
    s = np.asarray([v for k, v in d.items()])

    true_label_errors = s != y
    acc = np.sum(s == y) / len(y)
    print('accuracy of labels:', acc)

    # Benchmarks

    label_error_mask = np.zeros(len(s), dtype=bool)
    label_error_indices = compute_confident_joint(
        s, psx, return_indices_of_off_diagonals=True)[1]
    for idx in label_error_indices:
        label_error_mask[idx] = True
    conf_joint_only = label_error_mask

    #     # Confident learning optimized
    #     best_f1 = -1
    #     cl_opt = None
    #     for prune_method in ['prune_by_class', 'prune_by_noise_rate', 'both']:
    #         label_errs = cleanlab.pruning.get_noise_indices(
    #             s,
    #             psx,
    #             prune_method=prune_method,
    #         )
    #         f1 = precision_recall_fscore_support(
    #             y_true=true_label_errors,
def main():

    folders = [c for c in os.listdir(base_dir) if 'noise_amount' in c]
    results = []
    for folder in sorted(folders):
        print(folder)
        psx_file = [z for z in os.listdir(base_dir + folder) if 'pyx' in z][0]
        psx = np.load(base_dir + folder + "/" + psx_file)

        # Make sure psx has the right shape (keep only the 10 CIFAR-10 classes)
        psx = psx[:, :10]

        # Load noisy labels
        frac_zero_noise_rates = folder.split('_')[-7]
        noise_amount = folder.split('_')[-1]
        base_rfn = 'cifar10_noisy_labels__frac_zero_noise_rates__0'
        rfn = base_rfn + '.{}__noise_amount__0.{}.json'.format(
            frac_zero_noise_rates, noise_amount)
        with open(noisy_base_dir + "cifar10_noisy_labels/" + rfn, 'r') as rf:
            d = json.load(rf)
        s = np.asarray([v for k, v in d.items()])

        true_label_errors = s != y
        acc = np.sum(s == y) / len(y)
        print('accuracy of labels:', acc)

        # Benchmark methods to find label errors using confident learning.
        # psx is the N x K matrix of cross-validated predicted probabilities.
        # s is the array of given noisy labels.

        # Method: C_{\tilde{y}, y^*}
        label_error_mask = np.zeros(len(s), dtype=bool)
        label_error_indices = compute_confident_joint(
            s, psx, return_indices_of_off_diagonals=True)[1]
        for idx in label_error_indices:
            label_error_mask[idx] = True
        baseline_conf_joint_only = label_error_mask

        # Method: C_confusion
        baseline_argmax = baseline_methods.baseline_argmax(psx, s)

        # Method: CL: PBC
        baseline_cl_pbc = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_class')

        # Method: CL: PBNR
        baseline_cl_pbnr = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='prune_by_noise_rate')

        # Method: CL: C+NR
        baseline_cl_both = cleanlab.pruning.get_noise_indices(
            s, psx, prune_method='both')

        # Create folders and store clean label np.array bool masks for training.
        clean_labels = {
            'conf_joint_only': ~baseline_conf_joint_only,
            'pruned_argmax': ~baseline_argmax,
            'cl_pbc': ~baseline_cl_pbc,
            'cl_pbnr': ~baseline_cl_pbnr,
            'cl_both': ~baseline_cl_both,
        }
        for name, labels in clean_labels.items():
            new_folder = base_dir + folder + "/train_pruned_" + name + "/"
            try:
                os.mkdir(new_folder)
            except FileExistsError:
                pass
            np.save(new_folder + "train_mask.npy", labels)
        print()
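
The listing defines main() but never invokes it; presumably the original script closes with a standard entry-point guard (an assumption, since the example is truncated):

if __name__ == '__main__':
    main()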