def test_pu_remove_noise():
    """Check that removing noise from class 0 leaves this matrix unchanged.

    Row 0 of ``nm`` already has zeros off the diagonal and every column sums
    to 1, so the test expects ``util.remove_noise_from_class(nm, 0)`` to
    return a matrix equal to ``nm`` within a tolerance of 1e-4.
    """
    nm = np.array([
        [0.9, 0.0, 0.0],
        [0.0, 0.7, 0.4],
        [0.1, 0.3, 0.6],
    ])
    r = util.remove_noise_from_class(nm, 0)
    # Compare the absolute difference: a signed comparison (r - nm < tol)
    # would also pass for any result whose entries are far *below* nm.
    assert np.all(np.abs(r - nm) < 1e-4)
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    """Run confident learning on (X, s) and flag likely label errors.

    Confident learning counts examples that are likely labeled correctly and
    incorrectly and uses those counts to estimate the label-noise process.
    This method obtains (or accepts) the noise matrices and the predicted
    probabilities psx, then asks get_noise_indices for a mask over the
    examples judged to be mislabeled.

    Parameters
    ----------
    X : np.array
      Feature matrix of shape (N, D).

    s : np.array
      Vector of observed (possibly mislabeled) labels, one per example.

    psx : np.array of shape (N, K), optional
      P(s=k|x): for each of the N examples, the predicted probability of
      each of the K noisy labels. Should come from 3 (or higher) fold
      cross-validation. When None (default) it is computed here by
      cross-validation with self.clf.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,), optional
      P(s^=k|s=k). An example whose predicted probability exceeds the
      threshold for class k is counted as having hidden label y = k. Only
      forwarded to the noise-estimation helpers (i.e. used when neither
      noise matrix is supplied); not used for pruning. Values should lie
      between 0 and 1. Default is None.

    noise_matrix : np.array of shape (K, K), optional
      Conditional probability matrix P(s=k_s|y=k_y): the fraction of
      examples of every true class labeled as every other class. Columns
      are assumed to sum to 1. Its trace must exceed 1.

    inverse_noise_matrix : np.array of shape (K, K), optional
      Conditional probability matrix P(y=k_y|s=k_s): the estimated fraction
      of examples observed in class k_s that truly belong to class k_y.
      Columns are assumed to sum to 1. Its trace must exceed 1. When only
      one of the two matrices is given, the other is derived from it and
      self.ps; when neither is given, both are estimated.

    Returns
    -------
    tuple
      (self.noise_mask, self.noise_matrix, self.inverse_noise_matrix,
      self.confident_joint, psx). self.confident_joint stays None unless
      the noise matrices were estimated inside this call.

    Raises
    ------
    ValueError
      If a supplied noise_matrix or inverse_noise_matrix has trace <= 1.
    """

    # --- Input validation -------------------------------------------------
    assert_inputs_are_valid(X, s, psx)

    if noise_matrix is not None:
        nm_trace = np.trace(noise_matrix)
        if nm_trace <= 1:
            t = np.round(nm_trace, 2)
            raise ValueError("Trace(noise_matrix) is {}, but must exceed 1.".format(t))

    if inverse_noise_matrix is not None:
        inv_trace = np.trace(inverse_noise_matrix)
        if inv_trace <= 1:
            t = np.round(inv_trace, 2)
            raise ValueError("Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # --- Basic label statistics -------------------------------------------
    # K is the number of distinct observed labels; ps is p(s=k).
    self.K = len(np.unique(s))
    self.ps = value_counts(s) / float(len(s))
    self.confident_joint = None

    # --- Noise matrices (and psx, when it has to be estimated) ------------
    has_nm = noise_matrix is not None
    has_inv_nm = inverse_noise_matrix is not None

    if has_nm and has_inv_nm:
        # Both supplied by the caller: take them as they are.
        self.noise_matrix = noise_matrix
        self.inverse_noise_matrix = inverse_noise_matrix
    elif has_nm:
        # Only the noise matrix: derive py and the inverse from it.
        self.noise_matrix = noise_matrix
        self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
            self.ps,
            self.noise_matrix,
        )
    elif has_inv_nm:
        # Only the inverse: derive the noise matrix from it.
        self.inverse_noise_matrix = inverse_noise_matrix
        self.noise_matrix = compute_noise_matrix_from_inverse(
            self.ps,
            self.inverse_noise_matrix,
        )
    elif psx is None:
        # Nothing supplied: estimate everything, including psx.
        (
            self.py,
            self.noise_matrix,
            self.inverse_noise_matrix,
            self.confident_joint,
            psx,
        ) = estimate_py_noise_matrices_and_cv_pred_proba(
            X=X,
            s=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            thresholds=thresholds,
            converge_latent_estimates=self.converge_latent_estimates,
            seed=self.seed,
        )
    else:
        # psx is provided by user (assumed holdout probabilities).
        (
            self.py,
            self.noise_matrix,
            self.inverse_noise_matrix,
            self.confident_joint,
        ) = estimate_py_and_noise_matrices_from_probabilities(
            s=s,
            psx=psx,
            thresholds=thresholds,
            converge_latent_estimates=self.converge_latent_estimates,
        )

    # A noise matrix was supplied but psx was not: still need psx for pruning.
    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    pu_class = self.pulearning

    # Zero out noise matrix entries if pulearning = the integer specifying
    # the class without noise.
    if pu_class is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix,
            class_without_noise=pu_class,
        )
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # --- The actual work: which examples should be pruned? ----------------
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
    )

    if pu_class is not None:
        # NOTE(review): this clears the flags on every example whose label is
        # NOT the pulearning class, although that class is the one passed as
        # class_without_noise above -- confirm the comparison is not inverted.
        self.noise_mask[s != pu_class] = False

    return (
        self.noise_mask,
        self.noise_matrix,
        self.inverse_noise_matrix,
        self.confident_joint,
        psx,
    )
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    '''This method implements the confident learning. It counts examples
    that are likely labeled correctly and incorrectly and uses their ratio
    to create a predicted confusion matrix.
    This function prunes the examples flagged as label errors and then fits
    the classifier (self.clf) to the remaining (X, s), accounting for the
    noise in the labels.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array. It is indexed with a
      boolean mask, so it must support numpy-style boolean indexing.

    s : np.array
      A vector of observed labels, s, which may contain mislabeling. Labels
      are compared against the integers 0..K-1 when computing sample
      weights.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x. This is the probability distribution over all K classes,
      for each example, regarding whether the example has label s==k
      P(s=k|x). psx should have been computed using 3 (or higher) fold
      cross-validation. If you are not sure, leave psx = None (default) and
      it will be computed for you using cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
      P(s^=k|s=k). If an example has a predicted probability "greater" than
      this threshold, it is counted as having hidden label y = k. This is
      not used for pruning, only for estimating the noise rates using
      confident counts (i.e. only when neither noise matrix is provided).
      This value should be between 0 and 1. Default is None.

    noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(s=k_s|y=k_y) containing
      the fraction of examples in every class, labeled as every other class.
      Assumes columns of noise_matrix sum to 1. Its trace must exceed 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s)
      representing the estimated fraction observed examples in each class
      k_s, that are mislabeled examples from every other class k_y. If None,
      the inverse_noise_matrix will be computed from noise_matrix (when
      given) or estimated from psx and s. Assumes columns of
      inverse_noise_matrix sum to 1. Its trace must exceed 1.

    Output
    ------
      Returns self.clf after fitting it on the pruned data. The noise mask
      and (when clf.fit accepts it) the sample weights are stored on
      self.noise_mask and self.sample_weight.

    Raises
    ------
      ValueError if a provided noise_matrix or inverse_noise_matrix has
      trace <= 1.'''

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and np.trace(
            inverse_noise_matrix) <= 1:
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))

    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))

    self.confident_joint = None
    # If needed, compute noise rates (fraction of mislabeling) for all classes.
    # Also, if needed, compute P(s=k|x), denoted psx.

    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            # 'calibrate_confident_joint' ignores user-supplied matrices, so
            # warn that the provided noise_matrix will not drive the pruning.
            w = (
                "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                "\nwhen .fit(noise_matrix = something) because"
                "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                "\nnot use your 'something' noise matrix information. Instead, use"
                "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                "\nby using the noise matrix you provide."
            )
            warnings.warn(w)
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                self.ps, self.noise_matrix)
    if inverse_noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            w = (
                "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                "\nwhen .fit(inverse_noise_matrix = something) because"
                "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                "\nnot use your 'something' inv noise matrix information. Instead, use"
                "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                "\nby using the inverse noise matrix you provide."
            )
            warnings.warn(w)
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps, self.inverse_noise_matrix)
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx = estimate_py_noise_matrices_and_cv_pred_proba(
                X=X,
                s=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                thresholds=thresholds,
                converge_latent_estimates=self.converge_latent_estimates,
                seed=self.seed,
            )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint = estimate_py_and_noise_matrices_from_probabilities(
                s=s,
                psx=psx,
                thresholds=thresholds,
                converge_latent_estimates=self.converge_latent_estimates,
            )

    # A noise matrix was supplied but psx was not: psx is still required by
    # get_noise_indices below.
    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
    if self.pulearning is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix, class_without_noise=self.pulearning)
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # This is the actual work of this function.

    # Get the indices of the examples we wish to prune
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
        prune_count_method=self.prune_count_method,
        converge_latent_estimates=self.converge_latent_estimates,
    )

    # Keep only the examples that were NOT flagged as label errors.
    X_mask = ~self.noise_mask
    X_pruned = X[X_mask]
    s_pruned = s[X_mask]

    # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
    # Use exactly one introspection function: falling back to the legacy
    # inspect.getargspec on Python 3 raises for signatures with annotations
    # or keyword-only parameters (and it no longer exists on recent versions).
    if hasattr(inspect, 'getfullargspec'):
        fit_spec = inspect.getfullargspec(self.clf.fit)
        # A keyword-only sample_weight can be passed by keyword just as well.
        fit_params = list(fit_spec.args) + list(fit_spec.kwonlyargs)
    else:
        fit_params = inspect.getargspec(self.clf.fit).args

    if 'sample_weight' in fit_params:
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in each class
        # is preserved, even though the pruned sets may differ.
        self.sample_weight = np.ones(np.shape(s_pruned))
        for k in range(self.K):
            self.sample_weight[s_pruned == k] = 1.0 / self.noise_matrix[k][k]

        self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
    else:
        # This is less accurate, but its all we can do if sample_weight isn't available.
        self.clf.fit(X_pruned, s_pruned)

    return self.clf