Exemple #1
0
def test_cj_from_probs():
    """Compare confident-joint sums (axis=1) across `force_ps` settings.

    The confident joint is estimated three times from the shared `data`
    fixture, with force_ps=10, force_ps=1 and force_ps=False, and the
    axis-1 sums of each estimate are compared against
    data["ps"] * data["n"]. The whole body runs under pytest.warns, so a
    UserWarning must be emitted somewhere inside it.
    """

    def summed_joint(force_ps):
        # Estimate the confident joint for this force_ps setting and
        # collapse it along axis 1.
        joint = latent_estimation.estimate_confident_joint_from_probabilities(
            s=data["s"],
            psx=data["psx"],
            force_ps=force_ps,
        )
        return joint.sum(axis=1)

    with pytest.warns(UserWarning):
        expected_counts = data["ps"] * data["n"]
        sums_forced_10 = summed_joint(10)
        sums_forced_1 = summed_joint(1)
        sums_unforced = summed_joint(False)

        # NOTE(review): both checks below use the mean of the *signed*
        # differences, not of absolute errors.
        gap_forced_10 = np.mean(expected_counts - sums_forced_10)
        gap_forced_1 = np.mean(expected_counts - sums_forced_1)
        gap_unforced = np.mean(expected_counts - sums_unforced)

        # Forcing ps should bring the sums at least as close to the true ps.
        assert gap_forced_10 <= gap_unforced
        # A single forcing iteration should match the unforced estimate.
        assert gap_forced_1 - gap_unforced < 2e-4
Exemple #2
0
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    converge_latent_estimates=False,
    sorted_index_method=None,
    multi_label=False,
):
    r'''Returns the indices of most likely (confident) label errors in s. The
    number of indices returned is specified by frac_noise. When
    frac_noise = 1.0, all "confident" estimated noise indices are returned.

    Parameters
    ----------

    s : np.array
      A vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \tilde(y), for ASCII encoding reasons.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x.
      This is the probability distribution over all K classes, for each
      example, regarding whether the example has label s==k P(s=k|x).
      psx should have been computed using 3+ fold cross-validation.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction observed examples in each class k_s, that are
      mislabeled examples from every other class k_y.
      Assumes columns of inverse_noise_matrix sum to 1.
      NOTE: this argument is accepted but is not referenced anywhere in the
      body of this function.

    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=j, y=k). Estimates a confident
      subset of the joint distribution of the noisy and true labels P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into every pair (s=j, y=k) classes. If None, it is computed
      with estimate_confident_joint_from_probabilities(s, psx).

    frac_noise : float
      When frac_noise = 1.0, return all "confident" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return based on the following formula for example class k.
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k

    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the most likely mislabeled example in class s = 2.
      ***Only set this parameter if prune_method == 'prune_by_class'
      You may use with prune_method == 'prune_by_noise_rate', but
      if num_to_remove_per_class == k, then either k-1, k, or k+1
      examples may be removed for any class. This is because noise rates
      are floats, and rounding may cause a one-off. If you need exactly
      'k' examples removed from every class, you should use 'prune_by_class'.

    prune_method : str (default: 'prune_by_noise_rate')
      Possible Values: 'prune_by_class', 'prune_by_noise_rate', or 'both'.
      Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with
      *high probability* of being mislabeled for every non-diagonal
      in the prune_counts_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with *smallest
      probability* of belonging to their given class label for every class.
      3. 'both': Finds the examples satisfying (1) AND (2) and
      removes their set intersection.

    converge_latent_estimates : bool (Default: False)
      If true, forces numerical consistency of estimates. Each is estimated
      independently, but they are related mathematically with closed form
      equivalences. This will iteratively enforce mathematical consistency.
      NOTE: this argument is accepted but is not referenced anywhere in the
      body of this function.

    sorted_index_method : str [None, 'prob_given_label', 'normalized_margin']
      If not None, returns an array of the label error indices
      (instead of a bool mask) where error indices are ordered by either:
        'normalized_margin' := normalized margin (p(s = k) - max(p(s != k)))
        'prob_given_label' := [psx[i][labels[i]] for i in label_errors_idx]

    multi_label : bool
      If true, s should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.

    Returns
    -------
    If sorted_index_method is None, a boolean mask with one entry per example
    that is True where the example is flagged as a label error. Otherwise,
    the result of order_label_errors(mask, psx, s, sorted_index_method).

    Raises
    ------
    ValueError
      If prune_method is not one of 'prune_by_class', 'prune_by_noise_rate'
      or 'both'.'''

    # Fail fast on an unknown prune_method. Without this check neither pruning
    # branch below runs and the function would only fail later, with an
    # unrelated-looking UnboundLocalError on label_errors_mask.
    valid_prune_methods = ('prune_by_class', 'prune_by_noise_rate', 'both')
    if prune_method not in valid_prune_methods:
        raise ValueError(
            "prune_method must be one of {}, but got {!r}".format(
                valid_prune_methods, prune_method))

    # Number of examples in each class of s
    if multi_label:
        s_counts = value_counts([i for l in s for i in l])
    else:
        s_counts = value_counts(s)
    # Number of classes s
    K = len(psx.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(s) > 1e8
    # Ensure labels are of type np.array()
    s = np.asarray(s)

    if confident_joint is None:
        from cleanlab.latent_estimation import (
            estimate_confident_joint_from_probabilities)
        confident_joint = estimate_confident_joint_from_probabilities(s, psx)

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    # NOTE prune_count_matrix is transposed (relative to confident_joint)
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=confident_joint.T,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        # NOTE(review): np.sum(..., axis=1) is 1-D, so for a 2-D matrix this
        # division broadcasts across columns (column j is divided by the sum
        # of row j) - confirm this is the intended normalization.
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = np.round(tmp).astype(int)

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    initargs = (s, s_counts, prune_count_matrix, psx, multi_label)

    if prune_method in ('prune_by_class', 'both'):
        label_errors_mask_by_class = _parallel_noise_mask(
            _prune_by_class,
            'Parallel processing label errors by class.',
            K,
            big_dataset,
            initargs,
        )

    if prune_method in ('prune_by_noise_rate', 'both'):
        label_errors_mask_by_noise_rate = _parallel_noise_mask(
            _prune_by_count,
            'Parallel processing label errors by noise rate.',
            K,
            big_dataset,
            initargs,
        )

    if prune_method == 'both':
        # Keep only the examples flagged by both pruning strategies.
        label_errors_mask = (
            label_errors_mask_by_noise_rate & label_errors_mask_by_class)
    elif prune_method == 'prune_by_class':
        label_errors_mask = label_errors_mask_by_class
    else:
        label_errors_mask = label_errors_mask_by_noise_rate

    # Remove label errors if given label == model prediction
    if multi_label:
        pred = multiclass_crossval_predict(psx, s)
    else:
        pred = psx.argmax(axis=1)
    for i, pred_label in enumerate(pred):
        if label_errors_mask[i] and pred_label == s[i]:
            label_errors_mask[i] = False

    if sorted_index_method is not None:
        er = order_label_errors(label_errors_mask, psx, s, sorted_index_method)
        return er

    return label_errors_mask


def _parallel_noise_mask(prune_fn, message, K, big_dataset, initargs):
    '''Run prune_fn for every class index in range(K) in a worker pool and
    OR the per-class masks together into a single mask.

    The pool is created with one process per CPU and each worker is set up
    via _multiprocessing_initialization(*initargs). `message` is printed
    once the pool is open. A tqdm progress bar is shown only when
    big_dataset is true and tqdm is available.'''
    with multiprocessing_context(
            multiprocessing.cpu_count(),
            initializer=_multiprocessing_initialization,
            initargs=initargs,
    ) as p:
        print(message)
        sys.stdout.flush()
        if big_dataset and tqdm_exists:
            noise_masks_per_class = list(
                tqdm.tqdm(p.imap(prune_fn, range(K)), total=K))
        else:
            noise_masks_per_class = p.map(prune_fn, range(K))
    # An example is flagged if any class's mask flags it.
    return np.stack(noise_masks_per_class).any(axis=0)