Example #1
0
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        '''Fit self.clf to (X, s) after pruning examples flagged as label errors.

        This method implements confident learning. It counts examples that are
        likely labeled correctly and incorrectly and uses their ratio to create
        a predicted confusion matrix. The noise estimates are then used to
        build a mask of suspected label errors; the wrapped classifier is
        trained on the remaining examples, re-weighted per class when the
        classifier's fit() accepts a sample_weight argument.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array. Must support boolean
          mask indexing (X[mask]).

        s : np.array
          A vector of labels, s, which may contain mislabeling. Must support
          boolean mask indexing. The per-class weighting compares labels with
          the integers 0..K-1, where K is the number of unique values in s.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
          examples x. This is the probability distribution over all K classes,
          for each example, regarding whether the example has label s==k.
          psx should have been computed using 3 (or higher) fold
          cross-validation. If you are not sure, leave psx = None (default)
          and it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. This value should be between 0 and 1.
          Default is None. Only forwarded to the estimators when neither
          noise matrix is provided.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y)
          containing the fraction of examples in every class, labeled as every
          other class. Assumes columns of noise_matrix sum to 1.
          Its trace must exceed 1.

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s)
          representing the estimated fraction observed examples in each class
          k_s, that are mislabeled examples from every other class k_y.
          If None, it is computed from noise_matrix when that is given,
          otherwise estimated from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.
          Its trace must exceed 1.

        Returns
        -------
          self.clf, after it has been fit on the pruned data.

        Raises
        ------
          ValueError
            If a provided noise_matrix or inverse_noise_matrix has trace <= 1.

        Side effects
        ------------
          Sets self.K, self.ps, self.confident_joint, self.noise_matrix,
          self.inverse_noise_matrix and self.noise_mask. self.py is set on
          every path except when inverse_noise_matrix is provided by the
          caller. self.sample_weight is set only when self.clf.fit accepts
          sample_weight.'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and np.trace(
                inverse_noise_matrix) <= 1:
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(
                    t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        # Stays None unless the noise is estimated from scratch below.
        self.confident_joint = None
        # If needed, compute noise rates (fraction of mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                w += "\nwhen .fit(noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the noise matrix you provide."
                warnings.warn(w)
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                    self.ps, self.noise_matrix)
        if inverse_noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                w += "\nwhen .fit(inverse_noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' inv noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the inverse noise matrix you provide."
                warnings.warn(w)
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps, self.inverse_noise_matrix)
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                # Estimates the noise and also returns the cross-validated psx.
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx = estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint = estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

        # Still None only when a noise matrix was supplied without psx.
        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None:  # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix, class_without_noise=self.pulearning)
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            prune_count_method=self.prune_count_method,
            converge_latent_estimates=self.converge_latent_estimates,
        )

        # Keep only the examples that were NOT flagged as label errors.
        X_mask = ~self.noise_mask
        X_pruned = X[X_mask]
        s_pruned = s[X_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        # Prefer getfullargspec and fall back to getargspec only when it is
        # missing (Python 2). On Python 3, getargspec is deprecated and raises
        # ValueError for functions with annotations or keyword-only
        # parameters, so it must not be consulted as a second opinion.
        if hasattr(inspect, 'getfullargspec'):
            fit_arg_names = inspect.getfullargspec(self.clf.fit).args
        else:
            fit_arg_names = inspect.getargspec(self.clf.fit).args

        if 'sample_weight' in fit_arg_names:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                self.sample_weight[s_pruned ==
                                   k] = 1.0 / self.noise_matrix[k][k]

            self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but its all we can do if sample_weight isn't available.
            self.clf.fit(X_pruned, s_pruned)

        return self.clf
Example #2
0
    def fit(
            self,
            X,
            s,
            psx=None,
            thresholds=None,
            noise_matrix=None,
            inverse_noise_matrix=None,
    ):
        """Fit self.clf to (X, s) after pruning examples flagged as errors.

        This method implements confident learning. It counts examples
        that are likely labeled correctly and incorrectly and uses their ratio
        to create a predicted confusion matrix. The noise estimates are then
        used to build a mask of suspected label errors; the wrapped classifier
        is trained on the remaining examples, re-weighted per class when the
        classifier's fit() accepts a sample_weight argument.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array. Must support boolean
          mask indexing (X[mask]).

        s : np.array
          A vector of labels, s, which may contain mislabeling. Must support
          boolean mask indexing. The per-class weighting compares labels with
          the integers 0..K-1, where K is the number of unique values in s.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
          examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx
          should have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). List of probabilities used to determine the cutoff
          predicted probability necessary to consider an example as a given
          class label.
          Default is None. These are computed for you automatically.
          If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. Values in list should be between 0 and 1.
          Only forwarded to the estimators when neither noise matrix is
          provided.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y)
          containing the fraction of examples in every class, labeled as every
          other class. Assumes columns of noise_matrix sum to 1.
          Its trace must exceed 1.

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s).
          Contains the estimated fraction observed examples in each class k_s,
          that are mislabeled examples from every other class k_y. If None,
          it is computed from noise_matrix when that is given, otherwise
          estimated from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.
          Its trace must exceed 1.

        Returns
        -------
          self.clf, after it has been fit on the pruned data.

        Raises
        ------
          ValueError
            If a provided noise_matrix or inverse_noise_matrix has trace <= 1.

        Side effects
        ------------
          Sets self.K, self.ps, self.confident_joint, self.noise_matrix,
          self.inverse_noise_matrix and self.noise_mask. self.py is set on
          every path except when inverse_noise_matrix is provided by the
          caller. self.sample_weight is set only when self.clf.fit accepts
          sample_weight. When self.pulearning is set and K == 2, the noise
          matrices are modified in place; if they were passed in by the
          caller, the caller's arrays are the ones modified."""

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and (
                np.trace(inverse_noise_matrix) <= 1
        ):
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}. Must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        # Stays None unless the noise is estimated from scratch below.
        self.confident_joint = None
        # If needed, compute noise rates (mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = (
                    compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps,
                    self.inverse_noise_matrix,
                )
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                # Estimates the noise and also returns cross-validated psx.
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                        X=X,
                        s=s,
                        clf=self.clf,
                        cv_n_folds=self.cv_n_folds,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                        seed=self.seed,
                    )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                        s=s,
                        psx=psx,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                    )

        # Still None only when a noise matrix was supplied without psx.
        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # if pulearning == the integer specifying the class without noise.
        if self.K == 2 and self.pulearning is not None:  # pragma: no cover
            # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
            self.noise_matrix[self.pulearning][1 - self.pulearning] = 0
            self.noise_matrix[1 - self.pulearning][1 - self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
            self.inverse_noise_matrix[1 - self.pulearning][self.pulearning] = 0
            self.inverse_noise_matrix[self.pulearning][self.pulearning] = 1
            # The confident joint only exists when the noise was estimated
            # above; it is None when the caller supplied a noise matrix, and
            # item assignment on None would raise TypeError.
            if self.confident_joint is not None:
                # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
                self.confident_joint[self.pulearning][1 - self.pulearning] = 0
                self.confident_joint[
                    1 - self.pulearning][1 - self.pulearning] = 1

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            n_jobs=self.n_jobs,
        )

        # Keep only the examples that were NOT flagged as label errors.
        x_mask = ~self.noise_mask
        x_pruned = X[x_mask]
        s_pruned = s[x_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        # Prefer getfullargspec and fall back to getargspec only when it is
        # missing (Python 2). On Python 3, getargspec is deprecated and raises
        # ValueError for functions with annotations or keyword-only
        # parameters, so it must not be consulted as a second opinion.
        if hasattr(inspect, 'getfullargspec'):
            fit_arg_names = inspect.getfullargspec(self.clf.fit).args
        else:
            fit_arg_names = inspect.getargspec(self.clf.fit).args

        if 'sample_weight' in fit_arg_names:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                sample_weight_k = 1.0 / self.noise_matrix[k][k]
                self.sample_weight[s_pruned == k] = sample_weight_k

            self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but best we can do if no sample_weight.
            self.clf.fit(x_pruned, s_pruned)

        return self.clf
Example #3
0
def converge_estimates(
    ps,
    py,
    noise_matrix,
    inverse_noise_matrix,
    inv_noise_matrix_iterations=5,
    noise_matrix_iterations=3,
):
    '''Iteratively reconcile py, noise_matrix and inverse_noise_matrix.

    The three quantities are estimated independently but are related by
    closed-form identities. This routine forces numerical consistency by
    repeatedly recomputing each one from the others.

    py := P(y=k) and the inverse noise matrix P(y=k_y|s=k_s) specify one
    another, so when they disagree because of poor estimation they can be made
    to agree by alternately computing one from the other for a small number of
    iterations (3-10 works fine). Do not set the iteration counts too high:
    small deviations get perturbed over and over and may be magnified.

    Order of updates: the inverse noise matrix and py are converged first
    (inner loop), and only then is the noise matrix refreshed (outer loop).
    The inverse noise matrix depends on py (unknown/latent) whereas the noise
    matrix depends on ps (known), so the noise matrix only needs updating once
    the other two have settled.

    Parameters
    ----------

    ps : np.array (shape (K, ) or (1, K))
        The fraction (prior probability) of each observed, noisy class label,
        P(s = k).

    py : np.array
        Current estimate of the latent prior P(y = k); used as the starting
        point and refined on every inner iteration.

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y) containing
        the fraction of examples in every class, labeled as every other class.
        Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s)
        representing the estimated fraction observed examples in each class
        k_s, that are mislabeled examples from every other class k_y.
        Assumes columns of inverse_noise_matrix sum to 1. It is recomputed
        before it is first read whenever both iteration counts are at least 1.

    inv_noise_matrix_iterations : int
        Number of (inverse_noise_matrix, py) updates per outer iteration.

    noise_matrix_iterations : int
        Number of outer iterations, each ending with a noise_matrix update.

    Returns
    ------
        Three np.arrays of the form (py, noise_matrix, inverse_noise_matrix)
        with py and inverse_noise_matrix and noise_matrix having numerical
        agreement.'''

    # Work on local aliases so the update order is easy to follow.
    est_py = py
    est_nm = noise_matrix
    est_inv_nm = inverse_noise_matrix

    for _ in range(noise_matrix_iterations):
        # Inner loop: alternate between the inverse noise matrix and py.
        for _ in range(inv_noise_matrix_iterations):
            est_inv_nm = compute_inv_noise_matrix(est_py, est_nm, ps)
            est_py = compute_py(ps, est_nm, est_inv_nm)
        # Refresh the noise matrix from the estimates just converged.
        est_nm = compute_noise_matrix_from_inverse(ps, est_inv_nm, est_py)

    return est_py, est_nm, est_inv_nm
Example #4
0
def test_latent_nm():
    """Check that the noise matrix rebuilt from its inverse matches ``nm``.

    Takes (ps, py, inverse noise matrix) from test_latent_py_ps_inv(),
    reconstructs the noise matrix from them, and requires every entry to be
    within 1e-3 of ``nm``.

    NOTE(review): ``nm`` is not defined in this function; presumably it is a
    module-level fixture -- confirm it exists in the test module.
    """
    ps, py, inverse_nm = test_latent_py_ps_inv()
    rebuilt_nm = latent_algebra.compute_noise_matrix_from_inverse(
        ps, inverse_nm, py)
    deviation = abs(nm - rebuilt_nm)
    assert np.all(deviation < 1e-3)