Example 1
def test_pruning_both(n_jobs):
    remove = 5
    s = data['s']
    class_idx = pruning.get_noise_indices(
        s=s,
        psx=data['psx'],
        num_to_remove_per_class=remove,
        prune_method='prune_by_class',
        n_jobs=n_jobs,
    )
    nr_idx = pruning.get_noise_indices(
        s=s,
        psx=data['psx'],
        num_to_remove_per_class=remove,
        prune_method='prune_by_noise_rate',
        n_jobs=n_jobs,
    )
    both_idx = pruning.get_noise_indices(
        s=s,
        psx=data['psx'],
        num_to_remove_per_class=remove,
        prune_method='both',
        n_jobs=n_jobs,
    )
    assert (all(s[both_idx] == s[class_idx & nr_idx]))
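The assertion above relies on 'both' being the intersection of the two pruning masks. Below is a minimal, hedged sketch (assuming cleanlab 1.x, where get_noise_indices lives in cleanlab.pruning) that builds synthetic labels and probabilities so the returned boolean mask is easy to inspect:

import numpy as np
from cleanlab import pruning

K, n = 3, 30
y = np.repeat(np.arange(K), n)         # true labels, 30 per class
s = y.copy()
s[::10] = (s[::10] + 1) % K            # flip every 10th label to inject noise
psx = np.full((K * n, K), 0.05)        # fake out-of-sample probabilities...
psx[np.arange(K * n), y] = 0.9         # ...confident in the *true* class
psx /= psx.sum(axis=1, keepdims=True)

mask = pruning.get_noise_indices(s=s, psx=psx, prune_method='both')
print(mask.sum(), 'examples flagged as likely label errors')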
Example 2
def test_prune_count_err():
    try:
        pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            prune_count_method='INVALID_METHOD',
        )
    except ValueError as e:
        assert ('should be' in str(e))
    with pytest.raises(ValueError) as e:
        pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            prune_count_method='INVALID_METHOD',
        )
Example 3
def test_exact_prune_count():
    remove = 5
    s = data['s']
    noise_idx = pruning.get_noise_indices(s=s,
                                          psx=data['psx'],
                                          num_to_remove_per_class=remove)
    assert (all(value_counts(s[noise_idx]) == remove))
Example 4
    def find_noise(self, labeled_sample_list):
        # Keep only samples that have a human label, so features and labels
        # stay aligned (the original filtered labels but not features, which
        # could desynchronize s and X and the indices returned below).
        labeled_sample_list = [i for i in labeled_sample_list if i.human_label]
        labeled_data_label = [i.human_label for i in labeled_sample_list]
        labeled_data_feature = [
            i.feature.toarray().tolist()[0] for i in labeled_sample_list
        ]
        # Find noisy (possibly mislabeled) indices.
        s = np.array([self.label_id[i] for i in labeled_data_label])
        X = np.array(labeled_data_feature)
        psx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
            X,
            s,
            clf=LogisticRegression(max_iter=1000,
                                   multi_class='auto',
                                   solver='lbfgs'))
        ordered_label_errors = get_noise_indices(
            s=s,
            psx=psx,
            sorted_index_method='normalized_margin',  # Orders label errors
        )
        logger.debug(
            '[find_noise] ordered_label_errors index: {}, size: {}'.format(
                ordered_label_errors, len(ordered_label_errors)))
        noise_samples = [labeled_sample_list[i] for i in ordered_label_errors]
        return noise_samples
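The same two-step pattern (cross-validated probabilities, then ranked label errors) works standalone. A hedged sketch on a scikit-learn toy dataset, assuming the cleanlab 1.x API used above:

from cleanlab import latent_estimation
from cleanlab.pruning import get_noise_indices
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

X, s = load_digits(return_X_y=True)   # treat the given labels as noisy s
psx = latent_estimation.estimate_cv_predicted_probabilities(
    X, s, clf=LogisticRegression(max_iter=1000))
errors = get_noise_indices(s=s, psx=psx,
                           sorted_index_method='normalized_margin')
print(errors[:10])                    # most suspicious label indices first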
Example 5
def test_prune_on_small_data():
    data = make_data(sizes=[4, 4, 4])
    for pm in ['prune_by_noise_rate', 'prune_by_class', 'both']:
        noise_idx = pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            prune_method=pm,
        )
        # Num in each class < 5. Nothing should be pruned.
        assert (not any(noise_idx))
Example 6
def test_pruning_order_method():
    order_methods = ["prob_given_label", "normalized_margin"]
    results = []
    for method in order_methods:
        results.append(pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            sorted_index_method=method,
        ))
    assert(len(results[0]) == len(results[1]))
Example 7
def main(output_filepath, noice_pct):
    '''
    Evaluation script; writes the noisy indices to a separate CSV file.
    '''

    logging.info(f"Default config : {json.dumps(config)}")

    # Loading the data
    logging.info('Loading the data...')
    source, df = load_data(URLs.IMAGENETTE)

    # Build the DataLoaders with the requested noise percentage
    dls: DataLoaders = get_dls(df, noice_pct=noice_pct, pref=source, size=224)

    learn: Learner = train(dls, f'resnet34_5ep_3frzep_1e_3_{noice_pct}np.pkl')


    # Predict using a single image file
    # learn = load_learner('export.pkl')
    # learn.predict('TEST_IMAGE_FILE')

    # Determine the noisy indices on training/validation or an unknown test dataset
    # Training 
    logging.info('Getting the predictions for training data...')
    train_preds = learn.get_preds(ds_idx=0, with_decoded=True)

    # Decode the predicted labels and compute a per-example confidence score
    decoded_train_preds = get_inverse_transform(L(list(train_preds[2])).map(dls.vocab))
    confidence = torch.max(train_preds[0], axis=-1).values

    # Noisy indices from the training dataset.
    # 'prune_by_noise_rate' removes examples with a *high probability* of being
    # mislabeled for every off-diagonal entry in the prune_counts_matrix (see
    # pruning.py); 'prune_by_class' removes the examples with the *smallest
    # probability* of belonging to their given class label, for every class;
    # "both" keeps only examples flagged by both methods.
    train_ordered_label_errors = get_noise_indices(
        s=train_preds[1].numpy(),    # targets
        psx=train_preds[0].numpy(),  # predicted probabilities
        prune_method="both",
        sorted_index_method='normalized_margin')

    # Actual Noise in the training dataset
    train_df = df[df.is_valid == False].copy()
    print("We found {} label errors in the training dataset of size {}.".format(len(train_ordered_label_errors), len(train_df)))

    train_df['predictions'] = np.array(decoded_train_preds)
    train_df['confidence'] = confidence
    noisy_train = train_df.iloc[train_ordered_label_errors]

    PREDICTIONS_NAME = f'noisy{noice_pct}_train_predictions.csv'
    logging.info(f'Saving the noisy training data with predictions and confidence at {output_filepath}/{PREDICTIONS_NAME}')
    noisy_train.to_csv(f'{output_filepath}/{PREDICTIONS_NAME}', index=False)
Example 8
def test_get_noise_indices_multi_label():
    s_ml = [[z, data['y_train'][i]] for i, z in enumerate(data['s'])]
    for multi_label in [True, False]:
        for prune_method in ['prune_by_class', 'prune_by_noise_rate']:
            noise_idx = pruning.get_noise_indices(
                s=s_ml if multi_label else data['s'],
                psx=data['psx'],
                prune_method=prune_method,
                multi_label=multi_label,
            )
            acc = np.mean((data['s'] != data['y_train']) == noise_idx)
            # Make sure cleanlab does reasonably well finding the errors.
            # acc is the accuracy of detecting a label error.
            assert (acc > 0.85)
Example 9
def evaluate(df, data_path):

    # preprocess test data
    test_df = remove_stop_add_hashtag(df)

    # tokenize
    tokenized_df = tokenize_df(test_df,
                               text_cols=["HashTag", "Text"],
                               mark_fields=True,
                               tok_text_col='text')  #returns a tuple

    # load the saved model
    learn = load_learner(f'{MODELS}/{cfg.model_name}')

    # test dataloader
    test_dl = learn.dls.test_dl(tokenized_df[0])

    # predictions
    result = learn.get_preds(dl=test_dl)

    confidence = torch.max(result[0], axis=1).values
    _, y = learn.dls.valid.vocab
    y_predicted = np.array(y[result[0].argmax(axis=1)])

    test_df['predicted'] = y_predicted
    test_df['confidence'] = confidence

    # metrics
    # _, metric_value = learn.validate(dl=test_dl) #loss, metrics used
    # metrics = {each.name: metric_value[idx] for idx, each in enumerate(learn.metrics)}
    # print(f"Metrics on the test dataset : {metrics}")

    # Noisy labels on the test dataframe.
    # 'prune_by_noise_rate' removes examples with a *high probability* of being
    # mislabeled for every off-diagonal entry in the prune_counts_matrix (see
    # pruning.py); 'prune_by_class' removes the examples with the *smallest
    # probability* of belonging to their given class label, for every class.
    test_ordered_label_errors = get_noise_indices(
        s=Numericalize(vocab=['INFORMATIVE', 'UNINFORMATIVE'])(
            test_df.Label).numpy(),
        psx=result[0].numpy(),
        prune_method="both",
        sorted_index_method='normalized_margin')

    print(test_ordered_label_errors)
    test_df.iloc[test_ordered_label_errors].to_csv(
        f'{data_path}/noisy_text.csv')
Example 10
def del_mis_label(use_cuda, testloader):
    model.eval()
    correctAll = 0
    totalAll = 0
    with torch.no_grad():
        for batch, (inputs, targets) in enumerate(testloader):
            # NOTE: pred/label are re-created each batch, so only the *last*
            # batch reaches get_noise_indices below.
            pred = np.zeros(shape=(targets.shape[0], 10))
            label = targets.numpy()
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            # torch.autograd.Variable is a deprecated no-op on modern PyTorch;
            # kept as written in the original code.
            inputs, targets = torch.autograd.Variable(
                inputs), torch.autograd.Variable(targets)

            # Score in chunks of 100; any remainder (< 100 examples) is skipped.
            for i in range(targets.shape[0] // 100):
                outputs = model(inputs[i * 100:(i + 1) * 100, :, :, :])
                pred[i * 100:(i + 1) * 100, :] = outputs.data.cpu()
                # loss = criterion(outputs, targets) + mcloss(feature, targets) * 10
                # loss = bi_tempered_logistic_loss(activations=activations, labels=labels, t1=0.7, t2=1.3)
                correct, total = accuracy(outputs.data,
                                          targets[i * 100:(i + 1) * 100].data)
                correctAll += correct
                totalAll += total
                print('using:    batch=%04d' % batch,
                      '  accuracy=%.2f' % (correct / total * 100.0),
                      end='\r')
        print('USING:    batch=%04d' % batch,
              '  accuracy=%.2f' % (correctAll / totalAll * 100.0))
    with open(log_txt, 'a+') as f:
        line = str(batch) + "  " + str(correctAll / totalAll * 100.0) + '\n'
        f.write(line)

    print('')
    from cleanlab.pruning import get_noise_indices
    ordered_label_errors = get_noise_indices(
        s=label,
        psx=pred,
        sorted_index_method='normalized_margin',  # Orders label errors
        n_jobs=1)

    return correctAll / totalAll * 100.0, ordered_label_errors
Example 11
def baseline_argmax_confusion_matrix(
    psx,
    s,
    calibrate=False,
    prune_method='prune_by_noise_rate',
):
    '''This is a baseline approach that uses the confusion matrix
    of argmax(psx) and s as the confident joint, and then uses cleanlab
    (confident learning) to find the label errors using this matrix.

    Parameters
    ----------

    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.

    psx : np.array (shape (N, K))
        P(label=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k P(s=k|x).
        psx should have been computed using 3 (or higher) fold cross-validation.

    Returns
    -------
        A boolean mask that is true if the example at that index is a
        label error.'''

    confident_joint = confusion_matrix(np.argmax(psx, axis=1), s).T
    if calibrate:
        confident_joint = calibrate_confident_joint(confident_joint, s)
    return get_noise_indices(
        s=s,
        psx=psx,
        confident_joint=confident_joint,
        prune_method=prune_method,
    )
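A hypothetical smoke test for this baseline (the variable values here are illustrative assumptions, not from the source):

import numpy as np

rng = np.random.RandomState(0)
s = np.repeat(np.arange(3), 20)              # 60 noisy labels, 20 per class
psx = rng.dirichlet(np.ones(3), size=60)     # random probability rows
baseline_mask = baseline_argmax_confusion_matrix(psx, s, calibrate=True)
cleanlab_mask = get_noise_indices(s=s, psx=psx)
print(baseline_mask.sum(), cleanlab_mask.sum())  # the two masks can differ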
Example 12
        if not os.path.exists('results'):
            os.makedirs('results')
        with open(result_latent_vars, 'wb') as output:
            pickle.dump(est_py, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(est_nm, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(est_inv, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(confident_joint, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(psx, output, pickle.HIGHEST_PROTOCOL)
    else:
        with open(result_latent_vars, 'rb') as inf:
            est_py = pickle.load(inf)
            est_nm = pickle.load(inf)
            est_inv = pickle.load(inf)
            confident_joint = pickle.load(inf)
            psx = pickle.load(inf)

    # print flipped labels
    label_errors = get_noise_indices(
        s=train_labels_with_errors,  # required
        psx=psx,  # required
        inverse_noise_matrix=est_inv,  # not required; avoids recomputing
        confident_joint=confident_joint,  # not required; avoids recomputing
    )
    print(
        pd.concat([
            train_labels_with_errors, train_true_labels,
            pd.DataFrame(data=label_errors, columns=['flipped_label'])
        ],
                  axis=1))
Example 13
    os.system(f'mv {old_path} {new_path}')

# Cell
mnist = DataBlock(blocks=(ImageBlock(cls=PILImageBW), CategoryBlock),
                  get_items=get_image_files,
                  splitter=GrandparentSplitter(train_name='training',
                                               valid_name='testing'),
                  get_y=parent_label)
dls = mnist.dataloaders(path, bs=16)
dls.show_batch(max_n=36, figsize=(6, 6))

# Cell
learn = cnn_learner(dls,
                    resnet18,
                    metrics=accuracy,
                    loss_func=LabelSmoothingCrossEntropyFlat())

# Cell
learn.fine_tune(1, 1e-3)

# Cell
val_preds = learn.get_preds(ds_idx=1, with_decoded=True)

# Cell
from cleanlab.pruning import get_noise_indices

# Cell
val_ordered_label_errors = get_noise_indices(
    s=val_preds[1].numpy(),
    psx=val_preds[0].numpy(),
    sorted_index_method='normalized_margin')
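
# Cell
# A hedged follow-up (not in the original notebook): map the flagged indices
# back to the validation items for inspection. Assumes the fastai `dls` built
# above, whose validation set was created by GrandparentSplitter.
items = dls.valid_ds.items
for i in val_ordered_label_errors[:10]:
    print(items[int(i)])  # path of a likely mislabeled image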
Example 14
    def fit(
        self, 
        X,
        s,
        psx = None,
        thresholds = None,
        noise_matrix = None,
        inverse_noise_matrix = None, 
    ):
        '''This method implements confident learning. It counts examples that are likely
        labeled correctly and incorrectly and uses their ratio to create a predicted
        confusion matrix.
        This function fits the classifier (self.clf) to (X, s) accounting for the noise in
        both the positive and negative sets.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A binary vector of labels, s, which may contain mislabeling.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx should
          have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. This value should be between 0 and 1. Default is None.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1.

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s) representing
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Output
        ------
          Returns (noise_mask, noise_matrix, inverse_noise_matrix,
          confident_joint, psx).'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError("Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and np.trace(inverse_noise_matrix) <= 1:
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError("Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (fraction of mislabeling) for all classes. 
        # Also, if needed, compute P(s=k|x), denoted psx.
        
        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(self.ps, self.noise_matrix)

        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(self.ps, self.inverse_noise_matrix)

        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                    X = X,
                    s = s,
                    clf = self.clf,
                    cv_n_folds = self.cv_n_folds,
                    thresholds = thresholds,
                    converge_latent_estimates = self.converge_latent_estimates,
                    seed = self.seed,
                )
            else: # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                    s = s, 
                    psx = psx,
                    thresholds = thresholds,
                    converge_latent_estimates = self.converge_latent_estimates,
                )

        if psx is None: 
            psx = estimate_cv_predicted_probabilities(
                X = X,
                labels = s,
                clf = self.clf,
                cv_n_folds = self.cv_n_folds,
                seed = self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None: # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix,
                class_without_noise=self.pulearning,
            )
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix = self.inverse_noise_matrix,
            confident_joint = self.confident_joint,
            prune_method = self.prune_method,
        ) 
        if self.pulearning is not None:
            self.noise_mask[s != self.pulearning] = False
        return self.noise_mask, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx
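The docstring above asks callers who pass psx themselves to compute it out-of-sample. A minimal sketch of doing so with scikit-learn (an assumption on the caller's side, not part of this class):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Held-out P(s=k|x) of shape (N, K): each row comes from a model that never
# saw that example during training.
psx = cross_val_predict(LogisticRegression(max_iter=1000), X, s,
                        cv=5, method='predict_proba')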
Example 15
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        """This method implements the confident learning. It counts examples
        that are likely labeled correctly and incorrectly and uses their ratio
        to create a predicted confusion matrix.
        This function fits the classifier (self.clf) to (X, s) accounting for
        the noise in both the positive and negative sets.

        Parameters
        ----------
        X : :obj:`np.array`
          Input feature matrix (N, D), 2D numpy array

        s : :obj:`np.array`
          A binary vector of labels, s, which may contain mislabeling.

        psx : :obj:`np.array` (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
          examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx
          should have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : :obj:`iterable` (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). List of probabilities used to determine the cutoff
          predicted probability necessary to consider an example as a given
          class label.
          Default is ``None``. These are computed for you automatically.
          If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. Values in list should be between 0 and 1.

        noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1.

        inverse_noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s). Contains
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Returns
        -------
        clf
          The fitted classifier (self.clf), trained on the pruned data."""

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and (np.trace(inverse_noise_matrix)
                                                 <= 1):
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}. Must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = (
                    compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps,
                    self.inverse_noise_matrix,
                )
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                        X=X,
                        s=s,
                        clf=self.clf,
                        cv_n_folds=self.cv_n_folds,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                        seed=self.seed,
                    )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                        s=s,
                        psx=psx,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                    )

        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # if pulearning == the integer specifying the class without noise.
        if self.K == 2 and self.pulearning is not None:  # pragma: no cover
            # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
            self.noise_matrix[self.pulearning][1 - self.pulearning] = 0
            self.noise_matrix[1 - self.pulearning][1 - self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
            self.inverse_noise_matrix[1 - self.pulearning][self.pulearning] = 0
            self.inverse_noise_matrix[self.pulearning][self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
            self.confident_joint[self.pulearning][1 - self.pulearning] = 0
            self.confident_joint[1 - self.pulearning][1 - self.pulearning] = 1

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            n_jobs=self.n_jobs,
        )

        x_mask = ~self.noise_mask
        x_pruned = X[x_mask]
        s_pruned = s[x_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        if hasattr(inspect, 'getfullargspec') and \
                'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
                or hasattr(inspect, 'getargspec') and \
                'sample_weight' in inspect.getargspec(self.clf.fit).args:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                sample_weight_k = 1.0 / self.noise_matrix[k][k]
                self.sample_weight[s_pruned == k] = sample_weight_k

            self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but best we can do if no sample_weight.
            self.clf.fit(x_pruned, s_pruned)

        return self.clf
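A hedged numeric illustration of the reweighting step above: surviving examples of class k get weight 1 / P(s=k|y=k) (the diagonal of the noise matrix), so each class keeps its apparent size in the loss after pruning:

import numpy as np

noise_matrix = np.array([[0.8, 0.1],    # columns sum to 1; the diagonal
                         [0.2, 0.9]])   # entries are P(s=k|y=k) for k = 0, 1
for k in range(2):
    print(k, 1.0 / noise_matrix[k][k])  # weights 1.25 and ~1.11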
Example 16
#!/usr/bin/env python
# -*- coding:gb18030 -*-
"""
File  :   mark_data_clean.py
Author:   [email protected]
Date  :   20/09/24 15:02:29
Desc  :   
"""

import sys
from cleanlab.pruning import get_noise_indices


def wrong_label_detect(pred_prob, given_label):
    wrong_label_indexs = get_noise_indices(
            s=given_label,
            psx=pred_prob,
            sorted_index_method='normalized_margin', # Orders label errors
            )
    return wrong_label_indexs


if __name__ == "__main__":
    pass
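
# A hypothetical smoke test for wrong_label_detect (cleanlab skips pruning for
# classes with fewer than about five examples, as the small-data test in
# Example 5 shows, so these toy arrays use ten per class):

import numpy as np

given_label = np.repeat([0, 1], 10)         # ten examples per class
pred_prob = np.tile([0.9, 0.1], (20, 1))    # class-0-looking rows everywhere
pred_prob[10:] = [0.1, 0.9]                 # class-1 rows look like class 1
pred_prob[3] = [0.05, 0.95]                 # a class-0 label that looks wrong
print(wrong_label_detect(pred_prob, given_label))  # likely includes index 3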


Example 17
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        '''This method implements confident learning. It counts examples that are likely
        labeled correctly and incorrectly and uses their ratio to create a predicted
        confusion matrix.
        This function fits the classifier (self.clf) to (X, s) accounting for the noise in
        both the positive and negative sets.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A binary vector of labels, s, which may contain mislabeling. 

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx should
          have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. This value should be between 0 and 1. Default is None.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1.

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s) representing
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Output
        ------
          Returns self.clf, the classifier fit to the pruned data.'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and np.trace(
                inverse_noise_matrix) <= 1:
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(
                    t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (fraction of mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'"
                w += "\nwhen .fit(noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the noise matrix you provide."
                warnings.warn(w)
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                    self.ps, self.noise_matrix)
        if inverse_noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                w += "\nwhen .fit(inverse_noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' inv noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the inverse noise matrix you provde."
                warnings.warn(w)
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps, self.inverse_noise_matrix)
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None:  # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix, class_without_noise=self.pulearning)
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            prune_count_method=self.prune_count_method,
            converge_latent_estimates=self.converge_latent_estimates,
        )

        X_mask = ~self.noise_mask
        X_pruned = X[X_mask]
        s_pruned = s[X_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        if hasattr(inspect, 'getfullargspec') and \
                'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
                or hasattr(inspect, 'getargspec') and \
                'sample_weight' in inspect.getargspec(self.clf.fit).args:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                self.sample_weight[s_pruned == k] = 1.0 / self.noise_matrix[k][k]

            self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but it's all we can do if sample_weight isn't available.
            self.clf.fit(X_pruned, s_pruned)

        return self.clf
Example 18
        label_id_map[l]: v
        for l, v in label_map.items() if l in label_id_map
    }

    # Be sure you compute probs in a holdout/out-of-sample manner (e.g. cross-validation)
    # Now getting label errors is trivial with cleanlab... it's one line of code.
    # Label errors are ordered by likelihood of being an error. First index is most likely error.

    if preds.shape[0] > 100000:
        print('Large predictions take a long time. Only using top 100,000.')
        preds = preds[:100000, :]
        labels = labels[:100000]

    ordered_label_errors = pruning.get_noise_indices(
        s=labels,
        psx=preds,
        sorted_index_method='normalized_margin',  # Orders label errors
    )

    data_path = Path(args.data_dir)
    text_path = data_path / 'txt'

    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'
Example 19
noise_matrix = generate_noise_matrix_from_trace(
    K,
    trace=1.5,
    py=py,
    valid_noise_matrix=True,
)

# Generate our noisy labels using the noise_matrix.
s = generate_noisy_labels(y_train, noise_matrix)
ps = np.bincount(s) / float(len(s))

confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(X_train,
                                                                  s,
                                                                  seed=seed)
est_py, est_noise_matrix, est_inverse_noise_matrix = estimate_latent(
    confident_joint, s)
idx_errors = get_noise_indices(s, psx)

# #### To show off the power of **cleanlab**, we've chosen an example of multiclass learning with noisy labels in which over 50% of the training labels are wrong.
# Toggle the ```trace``` parameter in ```generate_noise_matrix_from_trace``` above to try out different amounts of noise. Note, as we prove in our paper, learning becomes impossible if the ```trace <= 1```, so choose a value greater than 1, but less than, or equal to, the number of classes (3).
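
# A hedged sketch (not from the original notebook): vary the trace to see how
# much diagonal mass the generated noise matrix keeps; columns always sum to 1.
# Assumes cleanlab 1.x, where this helper lives in cleanlab.noise_generation.
from cleanlab.noise_generation import generate_noise_matrix_from_trace
for t in (1.2, 1.5, 2.5):
    nm = generate_noise_matrix_from_trace(K, t, py=py, valid_noise_matrix=True)
    print(t, np.round(np.trace(nm), 2))  # the realized trace tracks t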

# In[4]:

est_joint = cleanlab.latent_estimation.estimate_joint(
    s=s,
    psx=psx,
    confident_joint=confident_joint,
)
true_joint_distribution_of_label_errors = (noise_matrix * py)
percent_error_str = 'Percent of training examples that have wrong labels: {}%'.format(
    int(round(100 - 100 * true_joint_distribution_of_label_errors.trace())))
Example 20
# Cell
learn_5.save('learn_5')

# Cell
train_preds = learn_5.get_preds(ds_idx=0, with_decoded=True)

# Cell
val_preds = learn_5.get_preds(ds_idx=1, with_decoded=True)

# Cell
from cleanlab.pruning import get_noise_indices

# Cell
train_ordered_label_errors = get_noise_indices(
    s=train_preds[1].numpy(),  #targets
    psx=train_preds[0].numpy(),  #predictions_prob
    sorted_index_method='normalized_margin')

# Internal Cell
print("We found {} label errors.".format(len(train_ordered_label_errors)))

# Cell
# get_noise_indices returns positional indices, so use .iloc rather than .loc.
noisy_train = train_df.iloc[train_ordered_label_errors]

# Cell
train_preds = learn_50.get_preds(ds_idx=0, with_decoded=True)
from cleanlab.pruning import get_noise_indices
# Cell
# Get the noisy indices
train_ordered_label_errors = get_noise_indices(
    s=train_preds[1].numpy(),  #targets
    psx=train_preds[0].numpy(),  #predictions_prob
    sorted_index_method='normalized_margin')
Example 21
# That's all we need for confident learning.

# STEP 1 - Compute confident joint

# Verify inputs
s = np.asarray(s)
psx = np.asarray(psx)

# Find the number of unique classes if K is not given
K = len(np.unique(s))

from cleanlab.pruning import get_noise_indices

ordered_label_errors = get_noise_indices(
    s=s,
    psx=psx,
    sorted_index_method='normalized_margin',  # Orders label errors
)

print('ordered_label_errors:', ordered_label_errors)

print(np.array(sorted(ordered_label_errors)))
idx_errors = ordered_label_errors

label_errors_idx = np.array(sorted(ordered_label_errors))
score = sum([e in label_errors_idx
             for e in actual_label_errors]) / actual_num_errors
print('% actual errors that confident learning found: {:.0%}'.format(score))
score = sum([e in actual_label_errors
             for e in label_errors_idx]) / len(label_errors_idx)
print('% confident learning errors that are actual errors: {:.0%}'.format(score))
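
For reference, a hedged sketch of the quantity that sorted_index_method='normalized_margin' orders by (our reading of cleanlab's ranking, not code from this snippet): the given label's probability minus the best other-class probability, lowest (most suspicious) first:

self_confidence = psx[np.arange(len(s)), s]  # P(s = given label | x)
psx_other = psx.copy()
psx_other[np.arange(len(s)), s] = -np.inf    # mask out the given label
margin = self_confidence - psx_other.max(axis=1)
ranked = np.argsort(margin)                  # most suspicious examples first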
Example 22
    kv = line.split("\t")
    label = 0
    if float(kv[3]) > 3:
        #if float(kv[2]) > 3:
        label = 1
    y_true.append(label)
    y_scores.append([float(kv[5].split('|')[0]), float(kv[5].split('|')[1])])
    #y_scores.append([float(kv[4].split('|')[0]), float(kv[4].split('|')[1])])
    lines.append(line)
numpy_array_of_noisy_labels = np.array(y_true)
numpy_array_of_predicted_probabilities = np.array(y_scores)
#print (numpy_array_of_noisy_labels)
#print (numpy_array_of_predicted_probabilities)
ordered_label_errors = get_noise_indices(
    s=numpy_array_of_noisy_labels,
    psx=numpy_array_of_predicted_probabilities,
    sorted_index_method='normalized_margin',  # Orders label errors
)

print(ordered_label_errors[:10])
#index_dict = {}
#for i in np.arange(len(lines)):
#    index_dict[i] = lines[i].strip("\n")
#for  i in ordered_label_errors:
#    line = index_dict[i]
#    kv = line.split("\t")
#    #line = "%s\t%s\t%s\t%s"%(kv[0], kv[1], kv[2], float(kv[4].split('|')[1]))
#    print (line)
#exit()

#for i in np.arange(len(lines)):