def calibrate_confident_joint(confident_joint, s, multi_label=False):
    """Calibrate a confident joint estimate so that it is a valid estimate
    of the joint COUNTS of noisy and true labels.

    The rows of the confident joint are first rescaled so that every row
    sums to the number of times its noisy label occurs in ``s`` (i.e. the
    matrix takes on the noisy prior p(s)). The whole matrix is then rescaled
    so that it sums to the total number of labels, and finally rounded with
    ``round_preserving_row_totals``.

    Parameters
    ----------

    confident_joint : np.array (shape (K, K))
        A K,K integer matrix of count(s=k, y=k). Estimates a confident subset
        of the joint distribution of the noisy and true labels P_{s,y}.
        Each entry in the matrix contains the number of examples confidently
        counted into every pair (s=j, y=k) classes.

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \\tilde(y), for ASCII reasons.

    multi_label : bool
        If true, s should be an iterable (e.g. list) of iterables, containing a
        list of labels for each example, instead of just a single label.
        The MAJOR DIFFERENCE in how this is calibrated versus single_label,
        is the total number of errors considered is based on the number
        of labels, not the number of examples. So, the calibrated
        confident_joint will sum to the number of total labels.

    Returns
    -------
        The result of ``round_preserving_row_totals`` applied to the
        calibrated (K, K) matrix: an estimate of the joint COUNTS of noisy
        and true labels.

    Notes
    -----
    A row of ``confident_joint`` whose entries sum to zero is left as a row
    of zeros instead of being divided by zero. Without this guard a single
    empty row produces NaN entries which the global rescaling step then
    spreads to the entire matrix.
    """

    # Occurrences of every noisy label; multi-label input is flattened first.
    if multi_label:
        s_counts = value_counts([x for lst in s for x in lst])
    else:
        s_counts = value_counts(s)

    # Guard against rows without any confident counts (0 / 0 -> NaN).
    row_totals = confident_joint.sum(axis=1)
    safe_row_totals = np.where(row_totals == 0, 1, row_totals)

    # Calibrate confident joint to have correct p(s) prior on noisy labels.
    calibrated_cj = (confident_joint.T / safe_row_totals * s_counts).T
    # Calibrate confident joint to sum to:
    # The number of examples (for single labeled datasets)
    # The number of total labels (for multi-labeled datasets)
    calibrated_cj = calibrated_cj / np.sum(calibrated_cj) * sum(s_counts)
    return round_preserving_row_totals(calibrated_cj)
Example #2
0
def test_exact_prune_count():
    """Check that requesting a fixed number of removals per class flags
    exactly that many examples for every noisy label."""
    per_class = 5
    noisy_labels = data['s']
    flagged = pruning.get_noise_indices(
        s=noisy_labels,
        psx=data['psx'],
        num_to_remove_per_class=per_class,
    )
    # Count the flagged examples per noisy label; each must equal per_class.
    flagged_counts = value_counts(noisy_labels[flagged])
    assert all(flagged_counts == per_class)
Example #3
0
def compute_ps_py_inv_noise_matrix(s, noise_matrix):
    """Compute ps := P(s=k), py := P(y=k), and the inverse noise matrix.

    Parameters
    ----------

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \\tilde(y), for ASCII encoding
        reasons.

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y) containing
        the fraction of examples in every class, labeled as every other class.
        Assumes columns of noise_matrix sum to 1.

    Returns
    -------
    tuple
        ``(ps, py, inverse_noise_matrix)`` where ``ps`` is computed here from
        the label frequencies in ``s`` and the other two values are whatever
        ``compute_py_inv_noise_matrix(ps, noise_matrix)`` returns.
    """

    # 'ps' is p(s=k): frequency of every noisy label divided by len(s).
    ps = value_counts(s) / float(len(s))

    py, inverse_noise_matrix = compute_py_inv_noise_matrix(ps, noise_matrix)
    return ps, py, inverse_noise_matrix
Example #4
0
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        """Estimate the label noise in (X, s) and flag the examples to prune.

        Any of the noise matrices / predicted probabilities that are not
        supplied are estimated, the results are stored on ``self`` and the
        label-error mask from ``get_noise_indices`` is computed.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A discrete vector of labels, s, which may contain mislabeling.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the
          N examples x. psx should have been computed using 3 (or higher)
          fold cross-validation. If left as None (default) it is computed
          here using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). Only forwarded to the estimation helpers when neither
          noise matrix is supplied. Default is None.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y).
          Its trace must exceed 1.

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s).
          Its trace must exceed 1.

        Returns
        -------
        tuple
          (noise_mask, noise_matrix, inverse_noise_matrix, confident_joint,
          psx). The first four are the values stored on ``self``;
          ``confident_joint`` is None when a noise matrix was supplied.

        Raises
        ------
        ValueError
          If a supplied noise_matrix or inverse_noise_matrix has trace <= 1.
        """

        # Validate inputs before any state on self is touched.
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None:
            noise_trace = np.trace(noise_matrix)
            if noise_trace <= 1:
                raise ValueError(
                    "Trace(noise_matrix) is {}, but must exceed 1.".format(
                        np.round(noise_trace, 2)))
        if inverse_noise_matrix is not None:
            inverse_trace = np.trace(inverse_noise_matrix)
            if inverse_trace <= 1:
                raise ValueError(
                    "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(
                        np.round(inverse_trace, 2)))

        has_noise_matrix = noise_matrix is not None
        has_inverse = inverse_noise_matrix is not None

        # Number of classes, noisy prior p(s=k), and a reset confident joint.
        self.K = len(np.unique(s))
        self.ps = value_counts(s) / float(len(s))
        self.confident_joint = None

        # Adopt whichever matrices were supplied and derive the missing one.
        if has_noise_matrix:
            self.noise_matrix = noise_matrix
            if not has_inverse:
                derived = compute_py_inv_noise_matrix(self.ps, self.noise_matrix)
                self.py, self.inverse_noise_matrix = derived

        if has_inverse:
            self.inverse_noise_matrix = inverse_noise_matrix
            if not has_noise_matrix:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps,
                    self.inverse_noise_matrix,
                )

        # Nothing supplied: estimate everything (and psx too, if missing).
        if not (has_noise_matrix or has_inverse):
            if psx is None:
                estimates = estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
                (self.py, self.noise_matrix, self.inverse_noise_matrix,
                 self.confident_joint, psx) = estimates
            else:
                # psx is provided by user (assumed holdout probabilities)
                estimates = estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )
                (self.py, self.noise_matrix, self.inverse_noise_matrix,
                 self.confident_joint) = estimates

        # A noise matrix was supplied but psx was not: still need psx.
        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # pulearning = the integer specifying the class without noise.
        if self.pulearning is not None:  # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix,
                class_without_noise=self.pulearning,
            )
            # TODO: apply remove_noise_from_class to
            # self.inverse_noise_matrix as well.

        # This is the actual work of this function:
        # find the examples we wish to prune.
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
        )
        if self.pulearning is not None:
            # Only examples labeled with the pulearning class stay flagged.
            self.noise_mask[s != self.pulearning] = False

        return (
            self.noise_mask,
            self.noise_matrix,
            self.inverse_noise_matrix,
            self.confident_joint,
            psx,
        )
Example #5
0
    _ = ax.get_yaxis().set_ticks([])
    _ = plt.title("Iris dataset (feature 3 vs feature 1)", fontsize=30)
except Exception as e:
    print(e)
    print("Plotting is only supported in an iPython interface.")

# In[3]:

# Generate lots of noise.
# Every column sums to 1 and the trace is 2.0. Read column-wise (following
# the P(s=k_s|y=k_y) convention used for noise matrices in this code base):
# half of true class 0 is relabeled as 1, class 1 is never relabeled, and
# half of true class 2 is relabeled as 1.
noise_matrix = np.array([
    [0.5, 0.0, 0.0],
    [0.5, 1.0, 0.5],
    [0.0, 0.0, 0.5],
])

# NOTE(review): despite the name this is presumably per-class counts rather
# than a probability (no normalization is applied here) - confirm against
# value_counts. It is not used again in the visible part of this cell.
py = value_counts(y_train)
# Create noisy labels
s = generate_noisy_labels(y_train, noise_matrix)

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt

    _ = plt.figure(figsize=(15, 8))
    color_list = plt.cm.tab10(np.linspace(0, 1, 6))
    for k in range(len(np.unique(y_train))):
        X_k = X_train[y_train == k]  # data for class k
        _ = plt.scatter(
            X_k[:, 1],
            X_k[:, 3],
            color=[color_list[noisy_label] for noisy_label in s[y_train == k]],
Example #6
0
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        """This method implements the confident learning. It counts examples
        that are likely labeled correctly and incorrectly and uses their ratio
        to create a predicted confusion matrix.
        This function fits the classifier (self.clf) to (X, s) after pruning
        the examples flagged as label errors.

        Parameters
        ----------
        X : :obj:`np.array`
          Input feature matrix (N, D), 2D numpy array

        s : :obj:`np.array`
          A discrete vector of labels, s, which may contain mislabeling.

        psx : :obj:`np.array` (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
          examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx
          should have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : :obj:`iterable` (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). List of probabilities used to determine the cutoff
          predicted probability necessary to consider an example as a given
          class label.
          Default is ``None``. These are computed for you automatically.
          If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. Values in list should be between 0 and 1.

        noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1.

        inverse_noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s). Contains
          the estimated fraction observed examples in each class k_s, that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Returns
        -------
        object
          ``self.clf`` after it has been fit on the pruned data. The noise
          mask and (when the classifier's ``fit`` accepts it) the sample
          weights are stored on ``self.noise_mask`` / ``self.sample_weight``.

        Raises
        ------
        ValueError
          If a supplied noise_matrix or inverse_noise_matrix has trace <= 1."""

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and (np.trace(inverse_noise_matrix)
                                                 <= 1):
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}. Must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = (
                    compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps,
                    self.inverse_noise_matrix,
                )
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                        X=X,
                        s=s,
                        clf=self.clf,
                        cv_n_folds=self.cv_n_folds,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                        seed=self.seed,
                    )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                        s=s,
                        psx=psx,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                    )

        # A noise matrix was supplied but psx was not: psx is still needed
        # for pruning, so compute it with cross-validation.
        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # if pulearning == the integer specifying the class without noise.
        # NOTE: these assignments modify the matrices in place; when the
        # caller passed noise_matrix / inverse_noise_matrix, self.* refers to
        # the caller's own object.
        if self.K == 2 and self.pulearning is not None:  # pragma: no cover
            # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
            self.noise_matrix[self.pulearning][1 - self.pulearning] = 0
            self.noise_matrix[1 - self.pulearning][1 - self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
            self.inverse_noise_matrix[1 - self.pulearning][self.pulearning] = 0
            self.inverse_noise_matrix[self.pulearning][self.pulearning] = 1
            # The confident joint is only estimated when neither noise matrix
            # was supplied; otherwise it is still None and cannot be indexed.
            if self.confident_joint is not None:
                # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
                self.confident_joint[self.pulearning][1 - self.pulearning] = 0
                self.confident_joint[1 - self.pulearning][
                    1 - self.pulearning] = 1

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            n_jobs=self.n_jobs,
        )

        # Keep only the examples that were not flagged as label errors.
        x_mask = ~self.noise_mask
        x_pruned = X[x_mask]
        s_pruned = s[x_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        if hasattr(inspect, 'getfullargspec') and \
                'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
                or hasattr(inspect, 'getargspec') and \
                'sample_weight' in inspect.getargspec(self.clf.fit).args:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                # Weight of class k is the inverse of noise_matrix[k][k].
                sample_weight_k = 1.0 / self.noise_matrix[k][k]
                self.sample_weight[s_pruned == k] = sample_weight_k

            self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but best we can do if no sample_weight.
            self.clf.fit(x_pruned, s_pruned)

        return self.clf
Example #7
0
def generate_noisy_labels(y, noise_matrix, verbose=False):
    """Generates noisy labels s from perfect labels y so that the flips
    between s and y follow the provided noise_matrix.

    For every true class k, the number of examples to relabel as each other
    class is ``noise_matrix[:, k] * p(y=k) * len(y)`` truncated to an
    integer. That many examples of class k are drawn at random (without
    replacement) and assigned the new labels. If class k has no examples,
    nothing to flip, or fewer examples than requested flips, class k is left
    untouched.

    Parameters
    ----------

    y : np.array (shape (N, 1))
        Perfect labels, without any noise. Contains K distinct natural number
        classes, e.g. 0, 1,..., K-1

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y) containing
        the fraction of examples in every class, labeled as every other class.
        Assumes columns of noise_matrix sum to 1.

    verbose : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    np.array
        A copy of y in which the selected examples carry their noisy label.
    """

    # Work on an array view of the true labels.
    y = np.asarray(y)
    num_classes = len(noise_matrix)

    # p(y=k) for every class.
    prior_y = value_counts(y) / float(len(y))

    # Number of examples for every (noisy label, true label) pair. The
    # diagonal is cleared because it does not involve flipping of labels.
    flip_counts = (noise_matrix * prior_y * len(y)).astype(int)
    np.fill_diagonal(flip_counts, 0)

    noisy = np.array(y)
    for true_class in range(num_classes):
        column = flip_counts[:, true_class]
        # Noisy labels that receive at least one example of this true class.
        targets = np.where(column != 0)[0]
        # One entry per example to flip, holding its new noisy label.
        new_labels = [
            target for target in targets for _ in range(column[target])
        ]
        # Examples of this class that still carry their true label.
        candidates = np.where((noisy == true_class) & (y == true_class))[0]
        if (not len(candidates) or not len(new_labels)
                or len(candidates) < len(new_labels)):
            continue
        chosen = np.random.choice(candidates, len(new_labels), replace=False)
        noisy[chosen] = new_labels  # pragma: no cover

    return noisy
Example #8
0
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    sorted_index_method=None,
    multi_label=False,
    n_jobs=None,
    verbose=0,
):
    """Returns the indices of most likely (confident) label errors in s. The
    number of indices returned is specified by frac_noise. When
    frac_noise = 1.0, all "confident" estimated noise indices are returned.
    * If you encounter the error 'psx is not defined', try setting n_jobs = 1.

    Parameters
    ----------

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling. "s"
      denotes the noisy label instead of \\tilde(y), for ASCII encoding
      reasons.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x.
      This is the probability distribution over all K classes, for each
      example, regarding whether the example has label s==k P(s=k|x).
      psx should have been computed using 3+ fold cross-validation.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction observed examples in each class k_s, that are
      mislabeled examples from every other class k_y.
      Accepted for interface compatibility; this function does not read it.

    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=k, y=k). Estimates a confident
      subset of the joint distribution of the noisy and true labels P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into every pair (s=j, y=k) classes. If None, it is computed
      from s and psx with ``compute_confident_joint``.

    frac_noise : float
      When frac_noise = 1.0, return all "confident" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return based on the following formula for example class k.
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k

    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the most likely mislabeled example in class s = 1.

      Note
      ----
      Only set this parameter if ``prune_method == 'prune_by_class'``
      You may use with ``prune_method == 'prune_by_noise_rate'``, but
      if ``num_to_remove_per_class == k``, then either k-1, k, or k+1
      examples may be removed for any class. This is because noise rates
      are floats, and rounding may cause a one-off. If you need exactly
      'k' examples removed from every class, you should use ``'prune_by_class'``

    prune_method : str (default: 'prune_by_noise_rate')
      Possible Values: 'prune_by_class', 'prune_by_noise_rate', or 'both'.
      Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with
      *high probability* of being mislabeled for every non-diagonal
      in the prune_counts_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with *smallest
      probability* of belonging to their given class label for every class.
      3. 'both': Finds the examples satisfying (1) AND (2) and
      removes their set conjunction.

    sorted_index_method : {:obj:`None`, :obj:`prob_given_label`, :obj:`normalized_margin`}
      If None, returns a boolean mask (true if example at index is label error)
      If not None, returns an array of the label error indices
      (instead of a bool mask) where error indices are ordered by the either:
      ``'normalized_margin' := normalized margin (p(s = k) - max(p(s != k)))``
      ``'prob_given_label' := [psx[i][labels[i]] for i in label_errors_idx]``

    multi_label : bool
      If true, s should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.

    n_jobs : int (Windows users may see a speed-up with n_jobs = 1)
      Number of processing threads used by multiprocessing. Default None
      sets to the number of processing threads on your CPU.
      Set this to 1 to REMOVE parallel processing (if its causing issues).

    verbose : int
      If 0, no print statements. If 1, prints when multiprocessing happens.

    Returns
    -------
      If ``sorted_index_method`` is None, the boolean label-error mask.
      Otherwise the result of ``order_label_errors`` for that mask.

    Raises
    ------
    ValueError
      If ``prune_method`` is not one of the three supported values."""

    # Fail early on an unknown prune_method. Without this check none of the
    # pruning branches below runs and the function only fails at the very end
    # with an UnboundLocalError on label_errors_mask.
    valid_prune_methods = ('prune_by_class', 'prune_by_noise_rate', 'both')
    if prune_method not in valid_prune_methods:
        raise ValueError(
            "prune_method must be one of {}, but got {!r}.".format(
                valid_prune_methods, prune_method))

    # Set-up number of multiprocessing threads
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    else:
        assert (n_jobs >= 1)

    # Number of examples in each class of s
    if multi_label:
        s_counts = value_counts([i for lst in s for i in lst])
    else:
        s_counts = value_counts(s)
    # Number of classes s
    K = len(psx.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(s) > 1e8
    # Ensure labels are of type np.array()
    s = np.asarray(s)

    if confident_joint is None:
        from cleanlab.latent_estimation import compute_confident_joint
        confident_joint = compute_confident_joint(
            s=s,
            psx=psx,
            multi_label=multi_label,
        )

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    # NOTE prune_count_matrix is transposed (relative to confident_joint)
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=confident_joint.T,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        # The diagonal holds the examples that are kept for each class.
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = round_preserving_row_totals(tmp)

    if n_jobs > 1:  # Prepare multiprocessing shared data
        if multi_label:
            _s = RawArray('I', int2onehot(s).flatten())
        else:
            _s = RawArray('I', s)
        _s_counts = RawArray('I', s_counts)
        _prune_count_matrix = RawArray('I', prune_count_matrix.flatten())
        _psx = RawArray('f', psx.flatten())
    else:  # Multiprocessing is turned off. Create tuple with all parameters
        args = (s, s_counts, prune_count_matrix, psx, multi_label)

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    if prune_method == 'prune_by_class' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                    n_jobs,
                    initializer=_init,
                    initargs=(_s, _s_counts, _prune_count_matrix,
                              prune_count_matrix.shape, _psx, psx.shape,
                              multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by class.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_class, range(K)),
                                  total=K), )
                else:
                    noise_masks_per_class = p.map(_prune_by_class, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_class(k, args) for k in range(K)
            ]
        # An example is an error if any per-class mask flags it.
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        # Remember the by-class result; it is intersected with the
        # by-noise-rate result below.
        label_errors_mask_by_class = label_errors_mask

    if prune_method == 'prune_by_noise_rate' or prune_method == 'both':
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                    n_jobs,
                    initializer=_init,
                    initargs=(_s, _s_counts, _prune_count_matrix,
                              prune_count_matrix.shape, _psx, psx.shape,
                              multi_label),
            ) as p:
                if verbose:
                    print('Parallel processing label errors by noise rate.')
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    noise_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K))
                else:
                    noise_masks_per_class = p.map(_prune_by_count, range(K))
        else:  # n_jobs = 1, so no parallelization
            noise_masks_per_class = [
                _prune_by_count(k, args) for k in range(K)
            ]
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        label_errors_mask = label_errors_mask & label_errors_mask_by_class

    # Remove label errors if given label == model prediction
    if multi_label:
        pred = multiclass_crossval_predict(psx, s)
        s = MultiLabelBinarizer().fit_transform(s)
    else:
        pred = psx.argmax(axis=1)
    for i, pred_label in enumerate(pred):
        if multi_label and np.all(pred_label == s[i]) or \
                not multi_label and pred_label == s[i]:
            label_errors_mask[i] = False

    if sorted_index_method is not None:
        er = order_label_errors(label_errors_mask, psx, s, sorted_index_method)
        return er

    return label_errors_mask
Example #9
0
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        '''Fit the wrapped classifier (self.clf) to (X, s) after pruning the
        examples estimated to be label errors.

        This method implements confident learning. It counts examples that are
        likely labeled correctly and incorrectly and uses their ratio to create
        a predicted confusion matrix. The noise matrices (and psx) are estimated
        when they are not supplied, the likely label errors are located with
        get_noise_indices, and self.clf is then fit on the remaining examples.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A binary vector of labels, s, which may contain mislabeling.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
          examples x. This is the probability distribution over all K classes,
          for each example, regarding whether the example has label s==k
          P(s=k|x). psx should have been computed using 3 (or higher) fold
          cross-validation. If you are not sure, leave psx = None (default)
          and it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. This value should be between 0 and 1.
          Default is None. It is only forwarded to the estimators, i.e. it is
          ignored when noise_matrix or inverse_noise_matrix is given.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probablity matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other
          class. Assumes columns of noise_matrix sum to 1. Its trace must
          exceed 1.

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probablity matrix of the form P(y=k_y|s=k_s)
          representing the estimated fraction observed examples in each class
          k_s, that are mislabeled examples from every other class k_y.
          If None, the inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1. Its trace must
          exceed 1.

        Returns
        -------
          self.clf after fitting on the pruned data. The boolean pruning mask
          is stored in self.noise_mask; when self.clf.fit accepts a
          `sample_weight` argument, the per-example weights that were used are
          stored in self.sample_weight.

        Raises
        ------
          ValueError if a provided noise_matrix or inverse_noise_matrix has
          trace <= 1.'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and np.trace(
                inverse_noise_matrix) <= 1:
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(
                    t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        # Only the estimators below produce a confident joint; it stays None
        # when the caller supplies a noise matrix.
        self.confident_joint = None

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                w += "\nwhen .fit(noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the noise matrix you provde."
                warnings.warn(w)
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                    self.ps, self.noise_matrix)
        if inverse_noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                w += "\nwhen .fit(inverse_noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' inv noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the inverse noise matrix you provde."
                warnings.warn(w)
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps, self.inverse_noise_matrix)
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                # Estimate everything, including the cross-validated psx.
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx = estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint = estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

        # A noise matrix was supplied but psx was not: psx is still needed
        # to locate the label errors.
        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None:  # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix, class_without_noise=self.pulearning)
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            prune_count_method=self.prune_count_method,
            converge_latent_estimates=self.converge_latent_estimates,
        )

        X_mask = ~self.noise_mask
        X_pruned = X[X_mask]
        s_pruned = s[X_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3:
        # prefer getfullargspec and only fall back to the legacy getargspec
        # when it is the sole option.
        if hasattr(inspect, 'getfullargspec'):
            fit_arg_names = inspect.getfullargspec(self.clf.fit).args
        else:
            fit_arg_names = inspect.getargspec(self.clf.fit).args

        if 'sample_weight' in fit_arg_names:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                self.sample_weight[s_pruned ==
                                   k] = 1.0 / self.noise_matrix[k][k]

            self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but its all we can do if sample_weight isn't available.
            self.clf.fit(X_pruned, s_pruned)

        return self.clf
Example #10
0
def estimate_confident_joint_from_probabilities(
    s,
    psx,
    thresholds=None,
    force_ps=False,
    return_list_of_converging_cj_matrices=False,
):
    r'''DEPRECATED AS OF VERSION 0.0.8.
    REMOVED AS OF VERSION 0.0.10.

    Estimates P(s,y), the confident counts of the latent
    joint distribution of true and noisy labels
    using observed s and predicted probabilities psx.

    Emits a deprecation warning on every call and delegates the counting
    to compute_confident_joint.

    Parameters
    ----------

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s" denotes
        the noisy label instead of \tilde(y), for ASCII encoding reasons.

    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
        This is the probability distribution over all K classes, for each
        example, regarding whether the example has label s==k P(s=k|x). psx should
        have been computed using 3 (or higher) fold cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
        P(s^=k|s=k). If an example has a predicted probability "greater" than
        this threshold, it is counted as having hidden label y = k. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None,
        in which case the mean of psx[:, k] over the examples with s == k
        is used for every class k. The caller's object is never modified.

    force_ps : bool or int
        If true, forces the output confident_joint matrix to have p(s) closer to the true
        p(s). The method used is SGD with a learning rate of eta = 0.5.
        If force_ps is an integer, it represents the number of epochs
        (True alone means 5 epochs).
        Setting this to True is not always good. To make p(s) match, fewer confident
        examples are used to estimate the confident_joint, resulting in poorer estimation of
        the overall matrix even if p(s) is more accurate.

    return_list_of_converging_cj_matrices : bool (default = False)
        When force_ps is true, it converges the joint count matrix that is returned.
        Setting this to true will return the list of the converged matrices. The first
        item in the list is the original and the last item is the final result.

    Output
    ------
        confident_joint matrix count(s, y) : np.array (shape (K, K)),
        or the list of every matrix computed when
        return_list_of_converging_cj_matrices is true.'''

    w = '''WARNING! THIS METHOD IS DEPRICATED.
    USE compute_confident_joint INSTEAD.
    THIS METHOD WILL BE ~REMOVED~ in cleanlab version 0.0.10.'''
    warnings.warn(w)

    # Number of classes
    K = len(np.unique(s))
    # 'ps' is p(s=k)
    ps = value_counts(s) / float(len(s))
    # Estimate the probability thresholds for confident counting
    s = np.asarray(s)
    if thresholds is None:
        thresholds = [np.mean(psx[:, k][s == k])
                      for k in range(K)]  # P(s^=k|s=k)
    # Work on a private float copy: the SGD step below updates the thresholds
    # in place, which must neither leak into the caller's array nor fail on
    # integer-typed input.
    thresholds = np.array(thresholds, dtype=float)
    # joint counts
    cjs = []
    sgd_epochs = 5 if force_ps is True else 1  # Default 5 epochs if force_ps
    # Exact type check on purpose: bool is a subclass of int, and
    # force_ps=True must keep the 5-epoch default chosen above.
    if type(force_ps) is int:
        sgd_epochs = force_ps
    for sgd_iteration in range(sgd_epochs):  # ONLY 1 iteration by default.
        # Compute the confident joint.
        confident_joint = compute_confident_joint(s, psx, K, thresholds)
        cjs.append(confident_joint)

        if not force_ps:
            # Do not converge p(s) of joint with actual p(s)
            break

        joint_ps = confident_joint.sum(axis=1) / float(
            np.sum(confident_joint))
        # Update thresholds (SGD) to converge p(s) of joint with actual p(s)
        eta = 0.5  # learning rate
        thresholds += eta * (joint_ps - ps)

    return cjs if return_list_of_converging_cj_matrices else confident_joint
Example #11
0
def estimate_confident_joint_and_cv_pred_proba(
    X,
    s,
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    seed=None,
    calibrate=True,
):
    r'''Estimates P(s,y), the confident counts of the latent
    joint distribution of true and noisy labels
    using observed s and out-of-sample predicted probabilities psx,
    which are computed here with stratified cross-validation.

    The confident joint returned by this function is a numpy array of
    shape (K, K).

    Under certain conditions, estimates are exact, and in many
    conditions, estimates are within one percent of actual.

    Notes: There are two ways to compute the confident joint with pros/cons.
    1. For each holdout set, we compute the confident joint, then sum them up.
    2. We get all the pred_proba, combine them, compute the confident joint on all.
    (1) is more accurate because it computes the appropriate thresholds for each fold
    (2) is more accurate when you have only a little data because it computes
    the confident joint using all the probabilities. For example if you had only 100
    examples, with 5-fold cross validation and uniform p(y) you would only have 20
    examples to compute each confident joint for (1). Such small amounts of data
    is bound to result in estimation errors. For this reason, we implement (2).

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \tilde(y), for ASCII encoding reasons.

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined. The object passed in is never
      fitted itself: a deep copy is fitted for every fold.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
      P(s^=k|s=k). If an example has a predicted probability "greater" than
      this threshold, it is counted as having hidden label y = k. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is None.

    seed : int (default = None)
        Number to set the default state of the random number generator used to split
        the cross-validated folds. If None, uses np.random current random state.

    calibrate : bool (default: True)
        Calibrates confident joint estimate P(s=i, y=j) such that
        np.sum(cj) == len(s) and np.sum(cj, axis = 1) == np.bincount(s).

    Returns
    ------
      Returns a tuple of two numpy array matrices in the form:
      (joint counts matrix, predicted probability matrix)'''

    assert_inputs_are_valid(X, s)
    # Number of classes
    K = len(np.unique(s))

    # Ensure labels are of type np.array()
    s = np.asarray(s)

    # Create cross-validation object for out-of-sample predicted probabilities.
    # CV folds preserve the fraction of noisy positive and
    # noisy negative examples in each class.
    kf = StratifiedKFold(n_splits=cv_n_folds, shuffle=True, random_state=seed)

    # Intialize psx array; every row is filled exactly once below, by the
    # fold in which that example is held out.
    psx = np.zeros((len(s), K))

    # Split X and s into "cv_n_folds" stratified folds.
    for cv_train_idx, cv_holdout_idx in kf.split(X, s):

        # Fit a fresh copy so state never leaks between folds or into `clf`.
        clf_copy = copy.deepcopy(clf)

        # Select the training and holdout cross-validated sets.
        X_train_cv, X_holdout_cv = X[cv_train_idx], X[cv_holdout_idx]
        s_train_cv = s[cv_train_idx]

        # Fit the clf classifier to the training set and
        # predict on the holdout set and update psx.
        clf_copy.fit(X_train_cv, s_train_cv)
        psx[cv_holdout_idx] = clf_copy.predict_proba(X_holdout_cv)  # P(s = k|x)

    # Compute the confident counts of all pairwise label-flipping mislabeling rates.
    confident_joint = compute_confident_joint(
        s=s,
        psx=psx,  # P(s = k|x)
        thresholds=thresholds,
        calibrate=calibrate,
    )

    return confident_joint, psx
Example #12
0
def estimate_latent(
    confident_joint,
    s,
    py_method='cnt',
    converge_latent_estimates=False,
):
    r'''Computes the latent prior p(y), the noise matrix P(s|y) and the
    inverse noise matrix P(y|s) from the `confident_joint` count(s, y). The
    `confident_joint` estimated by `compute_confident_joint`
    by counting confident examples.

    Parameters
    ----------

    confident_joint : np.array (shape (K, K), type int)
        A K,K integer matrix of count(s=k, y=k). Estimates a confident subset of
        the joint distribution of the noisy and true labels P_{s,y}.
        Each entry in the matrix contains the number of examples confidently
        counted into every pair (s=j, y=k) classes.

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s" denotes
        the noisy label instead of \tilde(y), for ASCII encoding reasons.
        Only used to compute the noisy prior p(s).

    py_method : str (Options: ["cnt", "eqn", "marginal", "marginal_ps"])
        How to compute the latent prior p(y=k). Default is "cnt" as it often
        works well even when the noise matrices are estimated poorly by using
        the matrix diagonals instead of all the probabilities.

    converge_latent_estimates : bool
      If true, forces numerical consistency of estimates. Each is estimated
      independently, but they are related mathematically with closed form
      equivalences. This will iteratively make them mathematically consistent.

    Returns
    ------
        A tuple containing (py, noise_matrix, inv_noise_matrix).'''

    # 'ps' is p(s=k)
    ps = value_counts(s) / float(len(s))
    # Number of training examples confidently counted from each noisy class
    s_count = confident_joint.sum(axis=1).astype(float)
    # Number of training examples confidently counted into each true class
    y_count = confident_joint.sum(axis=0).astype(float)
    # Confident Counts Estimator for p(s=k_s|y=k_y) ~ |s=k_s and y=k_y| / |y=k_y|
    # (each column k_y of the joint is divided by its column total).
    noise_matrix = confident_joint / y_count
    # Confident Counts Estimator for p(y=k_y|s=k_s) ~ |y=k_y and s=k_s| / |s=k_s|
    # (transposed first so that the broadcast divides by the per-s totals).
    inv_noise_matrix = confident_joint.T / s_count
    # Compute the prior p(y), the latent (uncorrupted) class distribution.
    # NOTE: this uses the *unclipped* matrices; clipping happens afterwards.
    py = compute_py(ps, noise_matrix, inv_noise_matrix, py_method, y_count)
    # Clip noise rates to be valid probabilities.
    noise_matrix = clip_noise_rates(noise_matrix)
    inv_noise_matrix = clip_noise_rates(inv_noise_matrix)
    # Make latent estimates mathematically agree in their algebraic relations.
    if converge_latent_estimates:
        py, noise_matrix, inv_noise_matrix = converge_estimates(
            ps, py, noise_matrix, inv_noise_matrix)
        # Again clip py and noise rates into proper range [0,1)
        py = clip_values(py, low=1e-5, high=1.0, new_sum=1.0)
        noise_matrix = clip_noise_rates(noise_matrix)
        inv_noise_matrix = clip_noise_rates(inv_noise_matrix)

    return py, noise_matrix, inv_noise_matrix
Example #13
0
def test_value_counts_str():
    '''util.value_counts on the string labels ['a', 'b', 'a'] must yield
    the counts [2, 1].'''
    r = util.value_counts(['a', 'b', 'a'])
    # Compare the absolute difference: a signed difference would also accept
    # counts that are larger than expected.
    assert np.all(np.abs(np.array([2, 1]) - r) < 1e-4)
Example #14
0
def estimate_confident_joint_from_probabilities(
    s,
    psx,
    thresholds=None,
    force_ps=False,
    return_list_of_converging_cj_matrices=False,
):
    r'''Estimates P(s,y), the confident counts of the latent
    joint distribution of true and noisy labels
    using observed s and predicted probabilities psx.

    UNLIKE compute_confident_joint, this function calibrates
    the confident joint estimate P(s=i, y=j) such that
    np.sum(cj) == len(s) and np.sum(cj, axis = 1) == np.bincount(s).

    In other words, this function forces the confident joint to have the
    true noisy prior p(s) (summed over columns for each row) and also
    forces the confident joint to add up to the total number of examples.
    This method RETURNS A VALID COUNT ESTIMATE
    of the actual joint of noisy and true labels.

    Important! This function assumes that psx are out-of-sample
    holdout probabilities. This can be done with cross validation. If
    the probabilities are not computed out-of-sample, overfitting may occur.

    This function estimates the joint of shape (K, K). This is the
    confident counts of examples in every class, labeled as every other class.

    Under certain conditions, estimates are exact, and in most
    conditions, the estimate is within 1 percent of the truth.

    We provide a for-loop based simplification of the confident joint
    below. This implementation is not efficient, not used in practice, and
    not complete, but covers the gist of how the confident joint is computed:

    # Confident examples are those that we are confident have label y = k
    # Estimate the (K, K) matrix of confident examples with s = k_s and y = k_y
    cj_ish = np.zeros((K, K))
    for k_s in range(K): # k_s is the class value k of noisy label s
        for k_y in range(K): # k_y is the (guessed) class value k of true label y
            cj_ish[k_s][k_y] = sum((psx[:,k_y] >= (thresholds[k_y] - 1e-8)) & (s == k_s))

    Parameters
    ----------

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s" denotes
        the noisy label instead of \tilde(y), for ASCII encoding reasons.

    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
        This is the probability distribution over all K classes, for each
        example, regarding whether the example has label s==k P(s=k|x). psx should
        have been computed using 3 (or higher) fold cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
        P(s^=k|s=k). If an example has a predicted probability "greater" than
        this threshold, it is counted as having hidden label y = k. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None,
        in which case the mean of psx[:, k] over the examples with s == k
        is used for every class k. The caller's object is never modified.

    force_ps : bool or int
        If true, forces the output confident_joint matrix to have p(s) closer to the true
        p(s). The method used is SGD with a learning rate of eta = 0.5.
        If force_ps is an integer, it represents the number of epochs
        (True alone means 5 epochs).
        Setting this to True is not always good. To make p(s) match, fewer confident
        examples are used to estimate the confident_joint, resulting in poorer estimation of
        the overall matrix even if p(s) is more accurate.

    return_list_of_converging_cj_matrices : bool (default = False)
        When force_ps is true, it converges the joint count matrix that is returned.
        Setting this to true will return the list of the converged matrices. The first
        item in the list is the original and the last item is the final result.

    Output
    ------
        confident_joint matrix count(s, y) : np.array (shape (K, K))
        where np.sum(confident_joint) ~ len(s) and rows sum to np.bincount(s),
        or the list of every calibrated matrix computed when
        return_list_of_converging_cj_matrices is true.'''

    # Number of classes
    K = len(np.unique(s))
    # 'ps' is p(s=k)
    ps = value_counts(s) / float(len(s))
    # Estimate the probability thresholds for confident counting
    s = np.asarray(s)
    if thresholds is None:
        thresholds = [np.mean(psx[:, k][s == k])
                      for k in range(K)]  # P(s^=k|s=k)
    # Work on a private float copy: the SGD step below updates the thresholds
    # in place, which must neither leak into the caller's array nor fail on
    # integer-typed input.
    thresholds = np.array(thresholds, dtype=float)
    # joint counts
    cjs = []
    sgd_epochs = 5 if force_ps is True else 1  # Default 5 epochs if force_ps
    # Exact type check on purpose: bool is a subclass of int, and
    # force_ps=True must keep the 5-epoch default chosen above.
    if type(force_ps) is int:
        sgd_epochs = force_ps
    for sgd_iteration in range(sgd_epochs):
        # Compute the confident joint.
        confident_joint = compute_confident_joint(s, psx, K, thresholds)
        # calibrate_confident_joint takes (confident_joint, s, multi_label);
        # psx must not be passed, it would be interpreted as multi_label.
        confident_joint = calibrate_confident_joint(confident_joint, s)
        cjs.append(confident_joint)

        if not force_ps:
            # Do not converge p(s) of joint with actual p(s)
            break

        joint_ps = confident_joint.sum(axis=1) / float(
            np.sum(confident_joint))
        # Update thresholds (SGD) to converge p(s) of joint with actual p(s)
        eta = 0.5  # learning rate
        thresholds += eta * (joint_ps - ps)

    return cjs if return_list_of_converging_cj_matrices else confident_joint
Example #15
0
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    converge_latent_estimates=False,
    sorted_index_method=None,
    multi_label=False,
):
    r'''Finds the most likely (confident) label errors in s. How many are
    reported is controlled by frac_noise. When frac_noise = 1.0, all
    "confident" estimated label errors are reported.

    Parameters
    ----------

    s : np.array
      A binary vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \tilde(y), for ASCII encoding reasons.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x.
      This is the probability distribution over all K classes, for each
      example, regarding whether the example has label s==k P(s=k|x).
      psx should have been computed using 3+ fold cross-validation.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probablity matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction observed examples in each class k_s, that are
      mislabeled examples from every other class k_y.
      Assumes columns of inverse_noise_matrix sum to 1.
      NOTE: accepted for interface compatibility; this implementation does
      not read it.

    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=k, y=k). Estimates a confident
      subset of the joint distribution of the noisy and true labels P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into every pair (s=j, y=k) classes. If None, it is estimated
      from s and psx with estimate_confident_joint_from_probabilities.

    frac_noise : float
      When frac_noise = 1.0, return all "confident" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return based on the following formula for example class k.
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k

    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the most likely mislabeled example in class s = 1.
      ***Only set this parameter if prune_method == 'prune_by_class'
      You may use with prune_method == 'prune_by_noise_rate', but
      if num_to_remove_per_class == k, then either k-1, k, or k+1
      examples may be removed for any class. This is because noise rates
      are floats, and rounding may cause a one-off. If you need exactly
      'k' examples removed from every class, you should use 'prune_by_class'.

    prune_method : str (default: 'prune_by_noise_rate')
      Possible Values: 'prune_by_class', 'prune_by_noise_rate', or 'both'.
      Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with
      *high probability* of being mislabeled for every non-diagonal
      in the prune_counts_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with *smallest
      probability* of belonging to their given class label for every class.
      3. 'both': Finds the examples satisfying (1) AND (2) and
      removes their set conjunction.

    converge_latent_estimates : bool (Default: False)
      If true, forces numerical consistency of estimates. Each is estimated
      independently, but they are related mathematically with closed form
      equivalences. This will iteratively enforce mathematically consistency.
      NOTE: accepted for interface compatibility; this implementation does
      not read it.

    sorted_index_method : str [None, 'prob_given_label', 'normalized_margin']
      If not None, returns an array of the label error indices
      (instead of a bool mask) where error indices are ordered by the either:
        'normalized_margin' := normalized margin (p(s = k) - max(p(s != k)))
        'prob_given_label' := [psx[i][labels[i]] for i in label_errors_idx]

    multi_label : bool
      If true, s should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.

    Returns
    -------
      A boolean mask over the examples (True marks an estimated label error)
      when sorted_index_method is None; otherwise the result of
      order_label_errors for that mask.

    Raises
    ------
      ValueError if prune_method is not one of the supported values.'''

    # Fail fast on an unknown method: otherwise neither pruning branch runs
    # and the function only crashes later on an unbound label_errors_mask.
    valid_prune_methods = ('prune_by_class', 'prune_by_noise_rate', 'both')
    if prune_method not in valid_prune_methods:
        raise ValueError(
            "prune_method must be one of {}, but got {!r}.".format(
                valid_prune_methods, prune_method))

    # Number of examples in each class of s
    if multi_label:
        s_counts = value_counts([i for l in s for i in l])
    else:
        s_counts = value_counts(s)
    # Number of classes s
    K = len(psx.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(s) > 1e8
    # Ensure labels are of type np.array()
    s = np.asarray(s)

    if confident_joint is None:
        from cleanlab.latent_estimation import (
            estimate_confident_joint_from_probabilities)
        confident_joint = estimate_confident_joint_from_probabilities(s, psx)

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    # NOTE prune_count_matrix is transposed (relative to confident_joint)
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=confident_joint.T,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = np.round(tmp).astype(int)

    # Peform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes

    if prune_method in ('prune_by_class', 'both'):
        with multiprocessing_context(
                multiprocessing.cpu_count(),
                initializer=_multiprocessing_initialization,
                initargs=(s, s_counts, prune_count_matrix, psx, multi_label),
        ) as p:
            print('Parallel processing label errors by class.')
            sys.stdout.flush()
            if big_dataset and tqdm_exists:
                noise_masks_per_class = list(
                    tqdm.tqdm(p.imap(_prune_by_class, range(K)), total=K))
            else:
                noise_masks_per_class = p.map(_prune_by_class, range(K))
        # An example is flagged if any per-class mask flags it.
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        # Keep the by-class result; label_errors_mask is rebound below.
        label_errors_mask_by_class = label_errors_mask

    if prune_method in ('prune_by_noise_rate', 'both'):
        with multiprocessing_context(
                multiprocessing.cpu_count(),
                initializer=_multiprocessing_initialization,
                initargs=(s, s_counts, prune_count_matrix, psx, multi_label),
        ) as p:
            print('Parallel processing label errors by noise rate.')
            sys.stdout.flush()
            if big_dataset and tqdm_exists:
                noise_masks_per_class = list(
                    tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K))
            else:
                noise_masks_per_class = p.map(_prune_by_count, range(K))
        label_errors_mask = np.stack(noise_masks_per_class).any(axis=0)

    if prune_method == 'both':
        # Only examples flagged by both methods remain label errors.
        label_errors_mask = label_errors_mask & label_errors_mask_by_class

    # Remove label errors if given label == model prediction
    if multi_label:
        pred = multiclass_crossval_predict(psx, s)
    else:
        pred = psx.argmax(axis=1)
    # NOTE(review): in multi_label mode `pred_label == s[i]` compares label
    # collections; confirm it yields a single truth value for flagged rows.
    for i, pred_label in enumerate(pred):
        if label_errors_mask[i] and pred_label == s[i]:
            label_errors_mask[i] = False

    if sorted_index_method is not None:
        er = order_label_errors(label_errors_mask, psx, s, sorted_index_method)
        return er

    return label_errors_mask
Example #16
0
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    prune_count_method='inverse_nm_dot_s',
    converge_latent_estimates=False,
    return_sorted_index=False,
    multi_label=False,
):
    r"""Returns the indices of most likely (confident) label errors in s. The
    number of indices returned is specified by frac_noise. When
    frac_noise = 1.0, all "confidently" estimated noise indices are returned.

    Parameters
    ----------

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \tilde(y), for ASCII encoding reasons.
      When multi_label is True, s is instead an iterable of iterables (one
      collection of labels per example).

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
      This is the probability distribution over all K classes, for each
      example, regarding whether the example has label s==k P(s=k|x). psx should
      have been computed using 3 (or higher) fold cross-validation.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction observed examples in each class k_s, that are
      mislabeled examples from every other class k_y. If None, the
      inverse_noise_matrix will be computed from psx and s.
      Assumes columns of inverse_noise_matrix sum to 1.

    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=k, y=k). Estimates a confident subset of
      the joint distribution of the noisy and true labels P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into every pair (s=j, y=k) classes.

    frac_noise : float
      When frac_noise = 1.0, return all "confidently" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return based on the following formula for example class k.
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k

    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the most likely mislabeled example in class s = 2.
      ***Only set this parameter if prune_method == 'prune_by_class'

    prune_method : str (default: 'prune_by_noise_rate')
      'prune_by_class', 'prune_by_noise_rate', or 'both'. Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with *high probability* of
      being mislabeled for every non-diagonal in the prune_counts_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with *smallest probability* of
      belonging to their given class label for every class.
      3. 'both': Finds the examples satisfying (1) AND (2) and removes their set conjunction.
      Any other value leaves the returned mask all False.

    prune_count_method : str (default 'inverse_nm_dot_s')
      Options are 'inverse_nm_dot_s' or 'calibrate_confident_joint'.
        !DO NOT USE! 'calibrate_confident_joint' if you already know the noise matrix
      and will call .fit(noise_matrix = known_noise_matrix) or
      .fit(inverse_noise_matrix = known_inverse_noise_matrix) because
      'calibrate_confident_joint' will estimate the noise without using this information.
        !IN ALL OTHER CASES! We recommend always using 'calibrate_confident_joint'
      because it is faster and more robust when no noise matrix info is given.
        Determines the method used to estimate the counts of the joint P(s, y) that will
      be used to determine how many examples to prune
      for every class that are flipped to every other class, as follows:
        if prune_count_method == 'inverse_nm_dot_s':
          prune_count_matrix = inverse_noise_matrix * s_counts # Matrix of counts(y=k and s=l)
        elif prune_count_method == 'calibrate_confident_joint':# calibrate
          prune_count_matrix = confident_joint.T / float(confident_joint.sum()) * len(s)

    converge_latent_estimates : bool (Default: False)
      If true, forces numerical consistency of estimates. Each is estimated
      independently, but they are related mathematically with closed form
      equivalences. This will iteratively enforce mathematically consistency.

    return_sorted_index : bool
      If true, returns an array of the label error indices (instead of a bool mask)
      where error indices are ordered by the normalized margin (p(s = k) - max(p(s != k)))

    multi_label : bool
      If true, s should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.
      Class counts are then taken over all labels of all examples, and an
      example belongs to class k whenever k is among its labels.

    Returns
    -------
      A boolean np.array of length len(psx) that is True at every example
      flagged as a label error, or, if return_sorted_index is True, the result
      of order_label_errors(noise_mask, psx, s).

    Raises
    ------
      ValueError
        If prune_count_method is not one of the two supported options."""

    # Number of examples in each class of s. Multi-label inputs are flattened
    # first so that every individual label is counted (same convention as
    # calibrate_confident_joint).
    if multi_label:
        s_counts = value_counts([label for labels in s for label in labels])
    else:
        s_counts = value_counts(s)
    # Number of classes
    K = len(psx.T)

    # Ensure labels are of type np.array()
    s = np.asarray(s)

    # Estimate the number of examples to confidently prune for each (s=j, y=k) pair.
    if (inverse_noise_matrix is None
            and prune_count_method == 'inverse_nm_dot_s') or (
                confident_joint is None
                and prune_count_method == 'calibrate_confident_joint'):
        # Imported lazily, only when the latent estimates must be computed here.
        from cleanlab.latent_estimation import estimate_py_and_noise_matrices_from_probabilities
        _, _, inverse_noise_matrix, confident_joint = estimate_py_and_noise_matrices_from_probabilities(
            s,
            psx,
            converge_latent_estimates=converge_latent_estimates,
        )
    if prune_count_method == 'inverse_nm_dot_s':
        prune_count_matrix = inverse_noise_matrix * s_counts  # Matrix of counts(y=k and s=l)
    elif prune_count_method == 'calibrate_confident_joint':
        prune_count_matrix = confident_joint.T / float(
            confident_joint.sum()) * len(s)  # calibrate
    else:
        raise ValueError(
            "prune_count_method should be 'inverse_nm_dot_s' or " +
            "'calibrate_confident_joint', but '" + prune_count_method +
            "' was given.")

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=prune_count_matrix,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        # NOTE(review): the 1-D row sums broadcast across *columns* here, so
        # column j is divided by the sum of row j - confirm this normalisation
        # is intended. Left unchanged to preserve existing behavior.
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = np.round(tmp).astype(int)

    # Initialize the boolean mask of noise indices.
    noise_mask = np.zeros(len(psx), dtype=bool)

    # Perform pruning with threshold probabilities found by np.partition.

    if prune_method == 'prune_by_class' or prune_method == 'both':
        for k in range(K):
            # Don't prune if not MIN_NUM_PER_CLASS
            if s_counts[k] > MIN_NUM_PER_CLASS:
                num2prune = s_counts[k] - prune_count_matrix[k][k]
                # Examples whose noisy label is (or includes) class k.
                s_filter = np.array(
                    [k in labels for labels in s]) if multi_label else s == k
                # (num2prune + 1)'th smallest probability of class k among
                # examples with noisy label k; everything strictly below it
                # is flagged.
                threshold = np.partition(psx[:, k][s_filter],
                                         num2prune)[num2prune]
                noise_mask = noise_mask | (s_filter & (psx[:, k] < threshold))

    if prune_method == 'both':
        noise_mask_by_class = noise_mask

    if prune_method == 'prune_by_noise_rate' or prune_method == 'both':
        noise_mask = np.zeros(len(psx), dtype=bool)
        # The noisy-label index j is iterated in the outer loop so the mask of
        # examples labelled j is built at most once per j. The masks are
        # OR-combined, so the visiting order of (k, j) pairs does not affect
        # the result.
        for j in range(K):  # noisy label index
            s_filter = None
            for k in range(K):  # true hidden label index
                if k == j:  # Only prune for noise rates
                    continue
                # Don't prune if not MIN_NUM_PER_CLASS
                if s_counts[k] > MIN_NUM_PER_CLASS:
                    num2prune = prune_count_matrix[k][j]
                    if num2prune > 0:
                        if s_filter is None:
                            s_filter = np.array(
                                [j in labels for labels in s]
                            ) if multi_label else s == j
                        # num2prune'th largest p(class k) - p(class j) for x with noisy label j
                        margin = psx[:, k] - psx[:, j]
                        threshold = -np.partition(
                            -margin[s_filter],
                            num2prune - 1)[num2prune - 1]
                        noise_mask = noise_mask | (s_filter &
                                                   (margin >= threshold))

    # 'both' keeps only the examples flagged by the two methods.
    if prune_method == 'both':
        noise_mask = noise_mask & noise_mask_by_class

    if return_sorted_index:
        return order_label_errors(noise_mask, psx, s)

    return noise_mask