Example #1
import numpy as np
from cleanlab import util  # assumes the cleanlab v1.x module layout

def test_round_preserving_row_totals():
    mat = np.array([
        [1.7, 1.8, 1.5],
        [1.1, 1.4, 1.5],
        [1.3, 1.3, 1.4],
    ])
    mat_int = util.round_preserving_row_totals(mat)
    # Check that row sums are preserved
    assert np.all(mat_int.sum(axis=1) == mat.sum(axis=1))
Example #2
def calibrate_confident_joint(confident_joint, s, multi_label=False):
    """Calibrates any confident joint estimate P(s=i, y=j) such that
    np.sum(cj) == len(s) and np.sum(cj, axis = 1) == np.bincount(s).

    In other words, this function forces the confident joint to have the
    true noisy prior p(s) (summed over columns for each row) and also
    forces the confident joint to add up to the total number of examples.

    This method makes the confident joint a valid counts estimate
    of the actual joint of noisy and true labels.

    Parameters
    ----------

    confident_joint : np.array (shape (K, K))
        A K,K integer matrix of count(s=k, y=k). Estimates a confident subset of
        the joint distribution of the noisy and true labels P_{s,y}.
        Each entry in the matrix contains the number of examples confidently
        counted into each (s=j, y=k) pair of classes.

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \\tilde(y), for ASCII reasons.

    multi_label : bool
        If true, s should be an iterable (e.g. list) of iterables, containing a
        list of labels for each example, instead of just a single label.
        The major difference in how this is calibrated versus single-label
        is that the total number of errors considered is based on the number
        of labels, not the number of examples. So the calibrated
        confident_joint will sum to the total number of labels.


    Returns
    -------
        An np.array of shape (K, K) of type float representing a valid
        estimate of the joint COUNTS of noisy and true labels.
    """

    if multi_label:
        s_counts = value_counts([x for lst in s for x in lst])
    else:
        s_counts = value_counts(s)
    # Calibrate confident joint to have correct p(s) prior on noisy labels.
    calibrated_cj = (
            confident_joint.T / confident_joint.sum(axis=1) * s_counts
    ).T
    # Calibrate confident joint to sum to:
    # The number of examples (for single labeled datasets)
    # The number of total labels (for multi-labeled datasets)
    calibrated_cj = calibrated_cj / np.sum(calibrated_cj) * sum(s_counts)
    return round_preserving_row_totals(calibrated_cj)
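
Usage (a minimal sketch, assuming cleanlab v1.x, where calibrate_confident_joint is importable from cleanlab.latent_estimation; the toy inputs are illustrative):

import numpy as np
from cleanlab.latent_estimation import calibrate_confident_joint

# Toy 2-class input: 6 noisy labels and an uncalibrated confident joint.
s = np.array([0, 0, 0, 1, 1, 1])
cj = np.array([[2, 1],
               [1, 1]])  # row sums (3, 2) != np.bincount(s) == (3, 3)

calibrated = calibrate_confident_joint(cj, s)
print(calibrated.sum())        # == len(s) == 6
print(calibrated.sum(axis=1))  # == np.bincount(s) == [3, 3]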
Example #3
def keep_at_least_n_per_class(prune_count_matrix, n, frac_noise=1.0):
    """Make sure every class has at least n examples after removing noise.
    Functionally, increase each column, increases the diagonal term #(y=k,s=k)
    of prune_count_matrix until it is at least n, distributing the amount
    increased by subtracting uniformly from the rest of the terms in the
    column. When frac_of_noise = 1.0, return all "confidently" estimated
    noise indices, otherwise this returns frac_of_noise fraction of all
    the noise counts, with diagonal terms adjusted to ensure column
    totals are preserved.

    Parameters
    ----------

    prune_count_matrix : np.array of shape (K, K), K = number of classes
        Counts of mislabeled examples in every class. For this function,
        NOTE that prune_count_matrix is transposed relative to confident_joint.

    n : int
        Number of examples to make sure are left in each class.

    frac_noise : float
        When frac_noise = 1.0, return all estimated noise indices.
        Value in range (0, 1] that determines the fraction of noisy example
        indices to return, based on the following formula for each class k:
        frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently
        frac_noise * inverse_noise_rate_class_k * num_examples_s_equal_k

    Returns
    -------

    prune_count_matrix : np.array of shape (K, K), K = number of classes
        Number of examples to remove from each class, for every other class."""

    prune_count_matrix_diagonal = np.diagonal(prune_count_matrix)

    # Set diagonal terms less than n, to n.
    new_diagonal = np.maximum(prune_count_matrix_diagonal, n)

    # Find how much diagonal terms were increased.
    diff_per_col = new_diagonal - prune_count_matrix_diagonal

    # Count non-zero, non-diagonal items per column
    # np.maximum(*, 1) makes this never 0 (we divide by this next)
    num_noise_rates_per_col = np.maximum(
        np.count_nonzero(prune_count_matrix, axis=0) - 1.,
        1.,
    )

    # Uniformly decrease non-zero noise rates by the same amount
    # that the diagonal items were increased
    new_mat = prune_count_matrix - diff_per_col / num_noise_rates_per_col

    # Originally zero noise rates will now be negative, fix them back to zero
    new_mat[new_mat < 0] = 0

    # Restore the (possibly increased) diagonal terms (correctly labeled examples)
    np.fill_diagonal(new_mat, new_diagonal)

    # Reduce (multiply) all noise rates (non-diagonal) by frac_noise and
    # increase diagonal by the total amount reduced in each column
    # to preserve column counts.
    new_mat = reduce_prune_counts(new_mat, frac_noise)

    # These are counts, so return a matrix of ints.
    return round_preserving_row_totals(new_mat).astype(int)
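
Usage (a minimal sketch, assuming cleanlab v1.x, where keep_at_least_n_per_class is importable from cleanlab.pruning; the matrix below is illustrative):

import numpy as np
from cleanlab.pruning import keep_at_least_n_per_class

# 2-class prune count matrix; class 0's diagonal entry (3) is below n = 5.
pcm = np.array([[3, 4],
                [6, 7]])
out = keep_at_least_n_per_class(pcm, n=5)
print(out)              # diagonal entries are now >= 5
print(out.sum(axis=0))  # column totals preserved: == pcm.sum(axis=0) == [9, 11]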
Example #4
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    sorted_index_method=None,
    multi_label=False,
    n_jobs=None,
    verbose=0,
):
    """Returns the indices of most likely (confident) label errors in s. The
    number of indices returned is specified by frac_of_noise. When
    frac_of_noise = 1.0, all "confident" estimated noise indices are returned.
    * If you encounter the error 'psx is not defined', try setting n_jobs = 1.

    Parameters
    ----------

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \\tilde(y), for ASCII encoding reasons.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x.
      This is the probability distribution over all K classes, for each
      example, regarding whether the example has label s==k P(s=k|x).
      psx should have been computed using 3+ fold cross-validation.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction of observed examples in each class k_s that are
      mislabeled examples from every other class k_y. If None, the
      inverse_noise_matrix will be computed from psx and s.
      Assumes columns of inverse_noise_matrix sum to 1.

    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=k, y=k). Estimates a confident
      subset of the joint distribution of the noisy and true labels P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into every pair (s=j, y=k) classes.

    frac_noise : float
      When frac_noise = 1.0, return all "confident" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return, based on the following formula for each class k:
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k

    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the index of the single most likely mislabeled example in class s = 2.

      Note
      ----
      Only set this parameter if ``prune_method == 'prune_by_class'``.
      You may use it with ``prune_method == 'prune_by_noise_rate'``, but
      if ``num_to_remove_per_class == k``, then either k-1, k, or k+1
      examples may be removed for any class. This is because noise rates
      are floats, and rounding may cause an off-by-one. If you need exactly
      'k' examples removed from every class, use ``'prune_by_class'``.

    prune_method : str (default: 'prune_by_noise_rate')
      Possible Values: 'prune_by_class', 'prune_by_noise_rate', or 'both'.
      Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with
      *high probability* of being mislabeled for every non-diagonal entry
      in the prune_count_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with the *smallest
      probability* of belonging to their given class label, for every class.
      3. 'both': finds the examples satisfying both (1) and (2) and
      removes the intersection of the two sets.

    sorted_index_method : {:obj:`None`, :obj:`prob_given_label`, :obj:`normalized_margin`}
      If None, returns a boolean mask (True if the example at that index is a
      label error). If not None, returns an array of the label error indices
      (instead of a bool mask), where error indices are ordered by either:
      ``'normalized_margin' := normalized margin (p(s = k) - max(p(s != k)))``
      ``'prob_given_label' := [psx[i][labels[i]] for i in label_errors_idx]``

    multi_label : bool
      If true, s should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.

    n_jobs : int (Windows users may see a speed-up with n_jobs = 1)
      Number of processing threads used by multiprocessing. The default,
      None, uses the number of processing threads on your CPU.
      Set this to 1 to disable parallel processing (if it is causing issues).

    verbose : int
      If 0, no print statements. If 1, prints when multiprocessing happens."""

    # Set-up number of multiprocessing threads
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    else:
        assert (n_jobs >= 1)

    # Number of examples in each class of s
    if multi_label:
        s_counts = value_counts([i for lst in s for i in lst])
    else:
        s_counts = value_counts(s)
    # Number of classes in s
    K = len(psx.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(s) > 1e8
    # Ensure labels are of type np.array()
    s = np.asarray(s)

    if confident_joint is None:
        from cleanlab.latent_estimation import compute_confident_joint
        confident_joint = compute_confident_joint(
            s=s,
            psx=psx,
            multi_label=multi_label,
        )

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    # NOTE prune_count_matrix is transposed (relative to confident_joint)
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=confident_joint.T,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = round_preserving_row_totals(tmp)

    if n_jobs > 1:  # Prepare multiprocessing shared data
        if multi_label:
            _s = RawArray('I', int2onehot(s).flatten())
        else:
            _s = RawArray('I', s)
        _s_counts = RawArray('I', s_counts)
        _prune_count_matrix = RawArray('I', prune_count_matrix.flatten())
        _psx = RawArray('f', psx.flatten())
    else:  # Multiprocessing is turned off. Create tuple with all parameters
        args = (s, s_counts, prune_count_matrix, psx, multi_label)

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    if prune_method == 'prune_by_class' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                    n_jobs,
                    initializer=_init,
                    initargs=(_s, _s_counts, _prune_count_matrix,
                              prune_count_matrix.shape, _psx, psx.shape,
                              multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by class.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_class, range(K)),
                                  total=K), )
                else:
                    noise_masks_per_class = p.map(_prune_by_class, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_class(k, args) for k in range(K)
            ]
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        label_errors_mask_by_class = label_errors_mask

    if prune_method == 'prune_by_noise_rate' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                    n_jobs,
                    initializer=_init,
                    initargs=(_s, _s_counts, _prune_count_matrix,
                              prune_count_matrix.shape, _psx, psx.shape,
                              multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by noise rate.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K))
                else:
                    noise_masks_per_class = p.map(_prune_by_count, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_count(k, args) for k in range(K)
            ]
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        label_errors_mask = label_errors_mask & label_errors_mask_by_class

    # Remove label errors if given label == model prediction
    if multi_label:
        pred = multiclass_crossval_predict(psx, s)
        s = MultiLabelBinarizer().fit_transform(s)
    else:
        pred = psx.argmax(axis=1)
    for i, pred_label in enumerate(pred):
        if (multi_label and np.all(pred_label == s[i])) or \
                (not multi_label and pred_label == s[i]):
            label_errors_mask[i] = False

    if sorted_index_method is not None:
        er = order_label_errors(label_errors_mask, psx, s, sorted_index_method)
        return er

    return label_errors_mask
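
End-to-end usage (a minimal sketch, assuming cleanlab v1.x, where get_noise_indices is importable from cleanlab.pruning; the toy dataset and classifier are illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from cleanlab.pruning import get_noise_indices

# Illustrative toy data: 300 examples, 3 well-separated classes.
rng = np.random.RandomState(0)
X = np.vstack([rng.randn(100, 2) + mu for mu in ([0, 0], [4, 0], [0, 4])])
y = np.repeat([0, 1, 2], 100)
s = y.copy()
s[:10] = 1  # inject label noise: flip 10 class-0 labels to class 1

# psx must be out-of-sample predicted probabilities, e.g. from 3+ fold CV.
psx = cross_val_predict(LogisticRegression(max_iter=1000), X, s,
                        cv=3, method='predict_proba')

# Boolean mask of likely label errors; n_jobs=1 disables multiprocessing.
label_errors = get_noise_indices(s=s, psx=psx, n_jobs=1)
print(label_errors.sum(), 'examples flagged as likely label errors')

# Alternatively, return indices ordered by normalized margin.
ordered_idx = get_noise_indices(s=s, psx=psx, n_jobs=1,
                                sorted_index_method='normalized_margin')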