def test_clf_fit_nm_inm(sparse):
    data = SPARSE_DATA if sparse else DATA
    lnl = LearningWithNoisyLabels(seed=SEED)
    nm = data['noise_matrix']
    inm = compute_inv_noise_matrix(
        data["py"],
        nm,
        data["ps"],
    )
    lnl.fit(
        X=data['X_train'],
        s=data['s'],
        noise_matrix=nm,
        inverse_noise_matrix=inm,
    )
    score_nm_inm = lnl.score(data['X_test'], data['y_test'])

    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=SEED)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    assert (score < score_nm_inm + 1e-4)
Example #2
def __model_build_noisy(X_train, y_train, X_test, alg, seed):
    model = GaussianNB()
    if alg == 'Logistic':
        model = LogisticRegression(multi_class='auto')
    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    return clf.predict(X_test)
def test_fit_with_inm(
    prune_count_method='inverse_nm_dot_s',
    seed=0,
    used_by_another_test=False,
):
    lnl = LearningWithNoisyLabels(
        seed=seed,
        prune_count_method=prune_count_method,
    )
    inm = compute_inv_noise_matrix(
        data["py"],
        data["noise_matrix"],
        data["ps"],
    )
    # Learn with noisy labels with inverse noise matrix given
    lnl.fit(data['X_train'], data['s'], inverse_noise_matrix=inm)
    score_inm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(
        seed=seed,
        prune_count_method=prune_count_method,
    )
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_inm
    else:
        assert (score < score_inm + 1e-4)
def test_fit_with_inm(
    sparse,
    seed=SEED,
    used_by_another_test=False,
):
    data = SPARSE_DATA if sparse else DATA
    lnl = LearningWithNoisyLabels(seed=seed)
    inm = compute_inv_noise_matrix(
        data["py"],
        data["noise_matrix"],
        data["ps"],
    )
    # Learn with noisy labels with inverse noise matrix given
    lnl.fit(data['X_train'], data['s'], inverse_noise_matrix=inm)
    score_inm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=seed)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_inm
    else:
        assert (score < score_inm + 1e-4)
def test_rp():
    rp = LearningWithNoisyLabels(clf=LogisticRegression(
        multi_class='auto', solver='lbfgs', random_state=seed))
    rp.fit(data["X_train"], data["s"])
    score = rp.score(data["X_test"], data["y_test"])
    print(score)
    # Check that this runs without error.
    assert (True)
def test_raise_error_no_clf_predict():
    class struct(object):
        def fit(self):
            pass

        def predict_proba(self):
            pass

    try:
        LearningWithNoisyLabels(clf=struct())
    except Exception as e:
        assert ('predict' in str(e))
        with pytest.raises(ValueError) as e:
            LearningWithNoisyLabels(clf=struct())
def test_no_score():
    class Struct():
        def fit(self):
            pass

        def predict_proba(self):
            pass

        def predict(self, X):
            return data['y_test']

    lnl = LearningWithNoisyLabels(clf=Struct())
    score = lnl.score(data['X_test'], data['y_test'])
    assert (abs(score - 1) < 1e-6)
Example #8
def train_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)

    # Pseudo-label only the corrupted training samples.
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    # Retrain on the cleaned training data.
    model.fit(X_train, y_train_pseudo)

    return model.score(X_test, y_test)
def test_fit_psx():
    from cleanlab.latent_estimation import estimate_cv_predicted_probabilities
    lnl = LearningWithNoisyLabels()
    psx = estimate_cv_predicted_probabilities(
        X=data['X_train'],
        labels=data['y_train'],
    )
    lnl.fit(X=data['X_train'], s=data['y_train'], psx=psx)
    score_with_psx = lnl.score(data['X_test'], data['y_test'])
    lnl = LearningWithNoisyLabels()
    lnl.fit(
        X=data['X_train'],
        s=data['y_train'],
    )
    score_no_psx = lnl.score(data['X_test'], data['y_test'])
    assert (abs(score_with_psx - score_no_psx) < 1e-6)
def test_score():
    phrase = 'cleanlab is dope'

    class Struct():
        def fit(self):
            pass

        def predict_proba(self):
            pass

        def predict(self):
            pass

        def score(self, X, y):
            return phrase

    lnl = LearningWithNoisyLabels(clf=Struct())
    score = lnl.score(data['X_test'], data['y_test'])
    assert (score == phrase)
def test_fit_with_nm(
    seed=0,
    used_by_another_test=False,
):
    lnl = LearningWithNoisyLabels(seed=seed)
    nm = data['noise_matrix']
    # Learn with noisy labels with noise matrix given
    lnl.fit(data['X_train'], data['s'], noise_matrix=nm)
    score_nm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=seed)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_nm
    else:
        assert (score < score_nm + 1e-4)
def test_no_fit_sample_weight():
    class Struct():
        def fit(self, X, y):
            pass

        def predict_proba(self):
            pass

        def predict(self, X):
            return data['y_test']

    n = np.shape(data['y_test'])[0]
    m = len(np.unique(data['y_test']))
    psx = np.zeros(shape=(n, m))
    lnl = LearningWithNoisyLabels(clf=Struct())
    lnl.fit(data['X_train'],
            data['y_train'],
            psx=psx,
            noise_matrix=data['noise_matrix'])
    # If we make it here, without any error:
    assert (True)
def test_clf_fit_nm():
    lnl = LearningWithNoisyLabels()
    # Example of a bad noise matrix (impossible to learn from)
    nm = np.array([[0, 1], [1, 0]])
    try:
        lnl.fit(X=np.arange(3), s=np.array([0, 0, 1]), noise_matrix=nm)
    except Exception as e:
        assert ('Trace(noise_matrix)' in str(e))
        with pytest.raises(ValueError) as e:
            lnl.fit(X=np.arange(3), s=np.array([0, 0, 1]), noise_matrix=nm)
def test_pred_and_pred_proba():
    lnl = LearningWithNoisyLabels()
    lnl.fit(data['X_train'], data['s'])
    n = np.shape(data['y_test'])[0]
    m = len(np.unique(data['y_test']))
    pred = lnl.predict(data['X_test'])
    probs = lnl.predict_proba(data['X_test'])
    # Just check that these functions return what we expect
    assert (np.shape(pred)[0] == n)
    assert (np.shape(probs) == (n, m))
Example #15
def __model_build_noisy_pseudo(X_train, y_train, X_test, alg, seed):
    model = GaussianNB()
    if alg == 'Logistic':
        model = LogisticRegression(multi_class='auto')

    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)

    # Pseudo-labelling
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    y_test_pseudo = clf.predict(X_test)
    y_pseudo = np.hstack([y_train_pseudo, y_test_pseudo])
    X_for_pseudo = np.vstack([X_train, X_test])
    model.fit(X_for_pseudo, y_pseudo)
    return model.predict(X_test)
Example #16
def train_test_and_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)

    # Pseudo-label both the corrupted training samples and the test set.
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    y_test_pseudo = clf.predict(X_test)
    y_pseudo = np.hstack([y_train_pseudo, y_test_pseudo])
    X_for_pseudo = sp.vstack([X_train, X_test])

    # Train on all data, pseudo-labels included.
    model.fit(X_for_pseudo, y_pseudo)

    return model.score(X_test, y_test)
Example #17
np.random.seed(seed=0)
clf_copy = copy.deepcopy(clf)
# Compute p(y=k), the ground truth class prior on the labels.
py = np.bincount(y_train) / float(len(y_train))
# Generate the noisy channel to characterize the label errors.
noise_matrix = generate_noise_matrix_from_trace(
    K=num_classes,
    trace=num_classes * avg_trace,
    py=py,
    frac_zero_noise_rates=frac_zero_noise_rates,
)
print_noise_matrix(noise_matrix)
# Create the noisy labels. This method is exact w.r.t. the noise_matrix.
y_train_with_errors = generate_noisy_labels(y_train, noise_matrix)
lnl_cv = GridSearch(
    model=LearningWithNoisyLabels(clf),
    param_grid=param_grid,
    num_threads=4,
    seed=0,
)
lnl_cv.fit(
    X_train=X_train,
    y_train=y_train_with_errors,
    X_val=X_val,
    y_val=y_val,
    verbose=False,
)
# Also compute the test score with default parameters
clf_copy.fit(X_train, y_train_with_errors)
score_opt = lnl_cv.model.score(X_test, y_test)
score_default = clf_copy.score(X_test, y_test)
Example #18
def clean_labels(X: pd.DataFrame,
                 y,
                 count_start,
                 pulearning=None,
                 strategy="cut",
                 round=0,
                 early_stop=False):
    count = count_start
    from preprocess import sample
    cols = [
        c for c in X if len(c.split("_")) == 2 and (
            c.startswith("c_") or c.startswith("n_"))
    ]
    print(cols)

    while count <= count_start + round:
        try:
            params = {
                "objective": "binary",
                "metric": "auc",
                "verbosity": -1,
                "seed": count,
                "num_threads": 4,
                "num_boost_round": 50
            }

            X_sample, y_sample = sample(X[cols], y, 30000, random_state=count)
            hyperparams = _hyperopt(X_sample,
                                    y_sample,
                                    params,
                                    random_state=count)
            # confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
            #     X=X.values,
            #     s=1 * (y.values == 1),
            #     clf=lgb.LGBMClassifier(**hyperparams, **params),  # default, you can use any classifier
            #     seed=count,
            # )
            # est_py, est_nm, est_inv = estimate_latent(confident_joint, s=1 * (y.values == 1))

            model = LearningWithNoisyLabels(
                lgb.LGBMClassifier(**hyperparams, **params),
                seed=count,
                cv_n_folds=5,
                prune_method="both",  # 'prune_by_noise_rate',
                converge_latent_estimates=True,
                pulearning=pulearning)
            print(X.shape, len(y))
            # import pdb;pdb.set_trace()
            noisy, noise_matrix, inverse_noise_matrix, confident_joint, psx = model.fit(
                X[cols].values, 1 * (y.values == 1), thresholds=None)
            # noise_matrix=est_nm,
            # inverse_noise_matrix=est_inv, )
            if count == count_start:

                rou_0 = noise_matrix[1, 0]
                rou_1 = noise_matrix[0, 1]

                print(rou_0, rou_1)
                if early_stop and rou_0 + rou_1 <= 0.9:
                    break
            if len(noisy) <= 0:
                break
            # Number of samples flagged as label noise.
            print(noisy.sum())

            # Both strategies currently drop the flagged rows.
            X = X[~noisy]
            y = y[~noisy]

        except Exception as exp:
            print("error:", exp)
        finally:
            count += 1

    return X, y, rou_0 + rou_1, rou_0, rou_1
Example #19
    def fit(self, X_full, y_full, time_remain):

        start_fit = time.time()
        # SEED = 2019

        # for SEED in range(2019, self.iter + 2019):
        SEED = 2019
        budget = time_remain - (time.time() - start_fit)

        best_iter = []
        while True:
            try:
                print(SEED, budget)
                round_start = time.time()

                self.hyper_seed = SEED
                params = {
                    "objective": "binary",
                    "metric": "auc",
                    "verbosity": -1,
                    "seed": self.hyper_seed,
                    "num_threads": 4,
                    "num_boost_round": 500
                }

                X, y = downsampling(X_full,
                                    y_full,
                                    sum(y_full) * self.sample_rto,
                                    seed=self.hyper_seed)
                # X_sample, y_sample = sample(X, y, 30000, random_state=self.hyper_seed)
                hyperparams = self._hyperopt(X,
                                             y,
                                             params,
                                             random_state=self.hyper_seed)

                X_train, X_val, y_train, y_val = train_test_split(
                    X, y, test_size=0.1, random_state=self.hyper_seed)

                watchlist = [(X_train, y_train), (X_val, y_val)]

                _model = lgb.LGBMClassifier(**hyperparams, **params)
                _model.fit(X_train,
                           y_train,
                           early_stopping_rounds=30,
                           eval_set=watchlist,
                           verbose=100)

                params["num_boost_round"] = _model.best_iteration_
                best_iter.append(_model.best_iteration_)

                confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
                    X=X.values,
                    s=1 * (y.values == 1),
                    clf=lgb.LGBMClassifier(**hyperparams, **params),
                    seed=SEED,
                )

                est_py, est_nm, est_inv = estimate_latent(confident_joint,
                                                          s=1 *
                                                          (y.values == 1))

                self.model = LearningWithNoisyLabels(
                    lgb.LGBMClassifier(**hyperparams, **params),
                    seed=1,
                    cv_n_folds=5,
                    prune_method="both",  # 'prune_by_noise_rate',
                    converge_latent_estimates=True,
                    pulearning=1)

                self.model.fit(
                    X.values,
                    1 * (y.values == 1),
                    psx=psx,
                    thresholds=None,
                    noise_matrix=est_nm,
                    inverse_noise_matrix=est_inv,
                )

                self.models.append(self.model)

                if SEED == 2019:
                    single_round = time.time() - round_start

                budget -= (time.time() - round_start)
                if budget <= single_round * 5:
                    break

                SEED += 1
            except Exception:
                if SEED == 2019:
                    single_round = time.time() - round_start

                budget -= (time.time() - round_start)
                if budget <= single_round * 5:
                    break

                SEED += 1

        print(best_iter)
        return self
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import value_counts
from cleanlab.latent_algebra import compute_inv_noise_matrix


# ## **rankpruning** is the first practical *(works for any classifier, runs fast, robust to poor probability estimation)* algorithm for multiclass learning with noisy labels. It is composed of components from the theory and algorithms of **confident learning**. It's a Python class that wraps around any classifier as long as .fit(X, y, sample_weight), .predict(X), and .predict_proba(X) are defined. Inspect the **cleanlab** package for documentation.
# 
# ## Here we show the performance of multiclass rankpruning wrapped around a sklearn LogisticRegression classifier versus LogisticRegression without any help from confident learning on the Iris dataset.
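#
# To make the wrapper contract above concrete, here is a minimal sketch of a
# custom classifier that LearningWithNoisyLabels can wrap. The class name and
# its constant-probability behavior are illustrative assumptions, not part of
# cleanlab; any estimator exposing these three methods works (sample_weight
# support in .fit is optional, as the tests above demonstrate).

import numpy as np


class MinimalClassifier(object):
    # Toy estimator satisfying the .fit/.predict/.predict_proba contract.

    def fit(self, X, y, sample_weight=None):
        # Remember the label set; a real model would learn parameters here.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        # Placeholder logic: always predict the first class seen during fit.
        return np.full(len(X), self.classes_[0])

    def predict_proba(self, X):
        # Uniform probabilities over the known classes.
        K = len(self.classes_)
        return np.full((len(X), K), 1.0 / K)


# Passes the interface check that raises ValueError for incomplete clfs:
# lnl = LearningWithNoisyLabels(clf=MinimalClassifier())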

# In[16]:


# Seed for reproducibility
seed = 2
rp = LearningWithNoisyLabels(clf=logreg(), seed=seed)
np.random.seed(seed=seed)

# Get iris dataset
iris = datasets.load_iris()
X = iris.data  # we use all four features.
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt
    _ = plt.figure(figsize=(12,8))
    color_list = plt.cm.tab10(np.linspace(0, 1, 6))
    _ = plt.scatter(X_train[:, 1], X_train[:, 3], color=[color_list[z] for z in y_train], s=50)
Example #21
    # clf = LearningWithNoisyLabels(clf=GaussianProcessClassifier(kernel= kernel, max_iter_predict=1000, multi_class='one_vs_rest'))

    # clf = GaussianProcessClassifier(kernel= kernel, multi_class='one_vs_rest')
    # clf = LearningWithNoisyLabels(GaussianProcessClassifier(kernel= kernel, multi_class='one_vs_rest'))
    # clf = LearningWithNoisyLabels(clf = RandomForestClassifier())
    # clf =  LogisticRegression(penalty="l1", solver="liblinear")
    # clf = GradientBoostingClassifier()
    # clf = SVC(probability=True)
    # clf = GaussianProcessClassifier(kernel=kernel, multi_class='one_vs_rest')

    ratioList = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
    scoreList = []
    noiseScoreList = []
    for ratio in ratioList:
        clf = RandomForestClassifier()
        clfNoise = LearningWithNoisyLabels(clf=RandomForestClassifier())
        newTrainX = trainX
        newTrainY = copy.deepcopy(trainY)
        for i in range(len(newTrainX)):
            if (random.random() < ratio):
                while True:
                    noiseLabel = random.randint(1, 4) - 1
                    # print('trainY[i] : ', newTrainY[i], 'noiseLabel :',noiseLabel)
                    if newTrainY[i] != noiseLabel:
                        newTrainY[i] = noiseLabel
                        break

        clf.fit(newTrainX, newTrainY)
        clfNoise.fit(newTrainX, newTrainY)
        # importances = clf.feature_importances_
        # indices = np.argsort(importances)[::-1]
Example #22
    '% confident learning errors that are actual errors: {:.0%}'.format(score))

# original lr f1

print('WITHOUT confident learning,', end=" ")

clf.fit(X_train, s)
pred = clf.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print("\nNow we show improvement using cleanlab to characterize the noise")
print(
    "and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (psx not given),', end=" ")
rp = LearningWithNoisyLabels(clf=clf)
rp.fit(X_train, s)
pred = rp.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print('WITH confident learning (psx given),', end=" ")
rp.fit(X=X_train, s=s, psx=psx)
pred = rp.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print('WITH all labels right,', end=" ")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print("-------------------")
Example #23
def denoiseA(data_cor, rho, mode):
    '''
    Denoise the corrupted sensitive attribute using RankPrune.
    '''

    rho_a_plus, rho_a_minus = rho

    dataX = data_cor[0]
    cor_dataA = data_cor[2]
    # dataA = data_cor[5]
    #
    # auc3, auc4 = None, None

    noise_matrix = np.array([[1 - rho_a_minus, rho_a_plus],
                             [rho_a_minus, 1 - rho_a_plus]])
    # noise_matrix = None

    lnl = LearningWithNoisyLabels(clf=LogisticRegression(
        random_state=0, solver='lbfgs', multi_class='auto'))
    lnl.fit(X=dataX.values, s=cor_dataA.values, noise_matrix=noise_matrix)

    # Logistic Regression Baseline
    # lnl = clf=LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
    # lnl.fit(X = dataX.values, y = cor_dataA.values)

    denoised_dataA = pd.Series(lnl.predict(dataX.values))
    data_denoised = copy.deepcopy(data_cor)
    data_denoised[2] = denoised_dataA

    # print(lnl.noise_matrix, rho_a_plus, rho_a_minus)

    # Check recovery accuracy
    # auc1 = np.mean(dataA.values==cor_dataA.values)
    # auc2 = np.mean(dataA.values==denoised_dataA.values)

    # The following is under development.
    rho_est = None
    data_denoised_est = None

    if mode == 'six':

        lnl2 = LearningWithNoisyLabels(
            LogisticRegression(random_state=0,
                               solver='lbfgs',
                               multi_class='auto'))
        lnl2.fit(X=dataX.values, s=cor_dataA.values)

        denoised_dataA_est = pd.Series(lnl2.predict(dataX.values))
        data_denoised_est = copy.deepcopy(data_cor)
        data_denoised_est[2] = denoised_dataA_est

        rho_a_plus_est = lnl2.noise_matrix[0][1]
        rho_a_minus_est = lnl2.noise_matrix[1][0]
        rho_est = [rho_a_plus_est, rho_a_minus_est]

        # print(lnl2.noise_matrix, rho_a_plus_est, rho_a_minus_est)

        # lnl3 = LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
        # lnl3.fit(dataX.values, cor_dataA.values)

        # pred_dataA = pd.Series(lnl3.predict(dataX.values))
        # auc3 = np.mean(dataA.values==denoised_dataA_est.values)
        # auc4 = np.mean(dataA.values==pred_dataA.values)

    # print('auc:', auc1, auc2, auc3, auc4)

    return data_denoised, data_denoised_est, rho_est
    for name, clf in zip(names, classifiers):
        # Create three copies of the classifier, one per experiment.
        # perf_label_clf - Will be trained on the hidden, noise-free labels
        # noisy_clf - Will be trained on the noisy labels
        # noisy_clf_w_rp - Will be trained on the noisy labels using LearningWithNoisyLabels

        clfs = [copy.deepcopy(clf) for i in range(len(experiments))]
        perf_label_clf, noisy_clf, noisy_clf_w_rp = clfs
        # Classifier (trained without label errors)
        perf_label_clf.fit(X_train, y_train)
        perf_label_score = perf_label_clf.score(X_test, y_test)
        # Classifier (trained with label errors)
        noisy_clf.fit(X_train, y_train_w_errors)
        noisy_score = noisy_clf.score(X_test, y_test)
        # Classifier + RP (trained with label errors)
        rp = LearningWithNoisyLabels(noisy_clf_w_rp)
        rp.fit(X_train, y_train_w_errors)
        noisy_score_w_rp = rp.clf.score(X_test, y_test)

        # Store results for each classifier in a dict with key = clf_name.
        clf_results[name] = {
            'clfs': clfs,
            "perf_label_score": perf_label_score,
            "noisy_score": noisy_score,
            "noisy_score_w_rp": noisy_score_w_rp,
        }

    results.append({
        "X": X,
        "X_train": X_train,
        "y_train": y_train,
Example #25
print('The actual, latent, underlying noise matrix.')
print_noise_matrix(noise_matrix)
print('Our estimate of the noise matrix.')
print_noise_matrix(est_noise_matrix)
print()
print('The actual, latent, underlying joint distribution matrix.')
cleanlab.util.print_joint_matrix(true_joint_distribution_of_label_errors)
print('Our estimate of the joint distribution matrix.')
cleanlab.util.print_joint_matrix(est_joint)
print("Accuracy Comparison")
print("-------------------")
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
baseline_score = accuracy_score(y_test, clf.fit(X_train, s).predict(X_test))
print("Logistic regression:", baseline_score)
rp = LearningWithNoisyLabels(seed=seed)
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
print("Logistic regression (+rankpruning):", rp_score)
diff = rp_score - baseline_score
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
print(
    'Fit on denoised data without re-weighting:',
    accuracy_score(
        y_test,
        clf.fit(X_train[~idx_errors], s[~idx_errors]).predict(X_test)))

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt

    print("\n\n\n\n\n\n")
def test_default_clf():
    lnl = LearningWithNoisyLabels()
    assert (lnl.clf is not None and hasattr(lnl.clf, 'fit') and hasattr(
        lnl.clf, 'predict') and hasattr(lnl.clf, 'predict_proba'))
def test_seed():
    lnl = LearningWithNoisyLabels(seed=0)
    assert (lnl.seed is not None)
Example #28
def test_default_clf():
    lnl = LearningWithNoisyLabels()
    check1 = lnl.clf is not None and hasattr(lnl.clf, 'fit')
    check2 = hasattr(lnl.clf, 'predict') and hasattr(lnl.clf, 'predict_proba')
    assert (check1 and check2)
Example #29
# Work around indexing bug
X_train = X_train.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)
# A_test = A_test.map({ 0:"female", 1:"male"})

# flip across different groups
Y_noised = flip(Y_train, A_train, error_rate=error_rate)
noise_matrix = generate_noise_matrix(Y_noised, Y_train)
est_error_rate = estimation(X_train.values, Y_noised, A_train.values, ngroups=2**args.ngroups)
print(f"True error rate is {error_rate}.\nEstimated error rate is {est_error_rate}.")

# Learning with Noisy Labels
lnl = LearningWithNoisyLabels(clf=LogisticRegression())
lnl.fit(X=X_train.values, s=Y_noised, noise_matrix=noise_matrix)
Y_lnlt = lnl.predict(X_train.values).astype(int)
lnl.fit(X=X_train.values, s=Y_noised)
Y_lnle = lnl.predict(X_train.values).astype(int)


def run_corrupt(fairness_constraints):
    all_results = {}
    all_results['eps'] = fairness_constraints
    all_results['accuracy'] = {
        'train': [],
        'test': []
    }

    all_results['violation'] = {
Example #30
# learning with noisy labels. It is composed of components from the theory and
# algorithms of **confident learning**. It's a Python class that wraps around
# any classifier as long as .fit(X, y, sample_weight),
# .predict(X), .predict_proba(X) are defined.
# See https://l7.curtisnorthcutt.com/cleanlab-python-package for docs.
#
#
# ## Here we show the performance with a LogisticRegression classifier
# ## versus LogisticRegression \*without\* cleanlab on the Iris dataset.
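#
# Since this excerpt cuts off before the noise-injection cell, here is a
# minimal self-contained sketch of the comparison the demo builds toward.
# The trace value (3 * 0.6, i.e. 60% of labels correct on average) is an
# illustrative assumption, not the notebook's exact setting.

import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.noise_generation import generate_noise_matrix_from_trace
from cleanlab.noise_generation import generate_noisy_labels

np.random.seed(seed=2)
X, y = datasets.load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)

# Build a noise matrix with a 60% average diagonal and flip labels with it.
py = np.bincount(y_tr) / float(len(y_tr))
nm = generate_noise_matrix_from_trace(K=3, trace=3 * 0.6, py=py)
s = generate_noisy_labels(y_tr, nm)  # noisy training labels

base = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
print('Without cleanlab:', base.fit(X_tr, s).score(X_te, y_te))
rp = LearningWithNoisyLabels(clf=LogisticRegression(
    solver='lbfgs', multi_class='auto', max_iter=1000), seed=2)
rp.fit(X_tr, s)
print('With cleanlab:   ', rp.score(X_te, y_te))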

# In[2]:

# Seed for reproducibility
seed = 2
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf=clf, seed=seed)
np.random.seed(seed=seed)

# Get iris dataset
iris = datasets.load_iris()
X = iris.data  # we use all four features.
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt

    _ = plt.figure(figsize=(12, 8))
    color_list = plt.cm.tab10(np.linspace(0, 1, 6))
    _ = plt.scatter(X_train[:, 1],