def test_fit_with_inm(
    sparse,
    seed=SEED,
    used_by_another_test=False,
):
    data = SPARSE_DATA if sparse else DATA
    lnl = LearningWithNoisyLabels(seed=seed)
    inm = compute_inv_noise_matrix(
        data["py"],
        data["noise_matrix"],
        data["ps"],
    )
    # Learn with noisy labels with inverse noise matrix given
    lnl.fit(data['X_train'], data['s'], inverse_noise_matrix=inm)
    score_inm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=seed)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_inm
    else:
        assert (score < score_inm + 1e-4)
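# The tests on this page reference module-level fixtures SEED, DATA and
# SPARSE_DATA that the listing omits. A minimal sketch, not from the original
# test module, of how such a fixture could be built with cleanlab's own
# noise-generation helpers; sizes, the trace value, and key names are
# assumptions.
import numpy as np
import scipy.sparse as sp
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from cleanlab.noise_generation import (generate_noise_matrix_from_trace,
                                       generate_noisy_labels)

SEED = 0

def make_data(seed=SEED):
    X, y = make_classification(n_samples=400, n_classes=3, n_informative=6,
                               random_state=seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed)
    py = np.bincount(y_train) / float(len(y_train))    # latent prior P(y)
    noise_matrix = generate_noise_matrix_from_trace(   # column-stochastic P(s|y)
        3, 2.4, py=py, seed=seed)
    s = generate_noisy_labels(y_train, noise_matrix)   # observed noisy labels
    ps = np.bincount(s, minlength=3) / float(len(s))   # observed prior P(s)
    return {"X_train": X_train, "X_test": X_test, "y_train": y_train,
            "y_test": y_test, "s": s, "py": py, "ps": ps,
            "noise_matrix": noise_matrix}

DATA = make_data()
SPARSE_DATA = dict(DATA, X_train=sp.csr_matrix(DATA["X_train"]),
                   X_test=sp.csr_matrix(DATA["X_test"]))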
Example #2
def __model_build_noisy(X_train, y_train, X_test, alg, seed):
    model = GaussianNB()
    if alg == 'Logistic':
        model = LogisticRegression(multi_class='auto')
    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    return clf.predict(X_test)
def test_clf_fit_nm_inm(sparse):
    data = SPARSE_DATA if sparse else DATA
    lnl = LearningWithNoisyLabels(seed=SEED)
    nm = data['noise_matrix']
    inm = compute_inv_noise_matrix(
        data["py"],
        nm,
        data["ps"],
    )
    lnl.fit(
        X=data['X_train'],
        s=data['s'],
        noise_matrix=nm,
        inverse_noise_matrix=inm,
    )
    score_nm_inm = lnl.score(data['X_test'], data['y_test'])

    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=SEED)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    assert (score < score_nm_inm + 1e-4)
def test_fit_with_inm(
    prune_count_method='inverse_nm_dot_s',
    seed=0,
    used_by_another_test=False,
):
    lnl = LearningWithNoisyLabels(
        seed=seed,
        prune_count_method=prune_count_method,
    )
    inm = compute_inv_noise_matrix(
        data["py"],
        data["noise_matrix"],
        data["ps"],
    )
    # Learn with noisy labels with inverse noise matrix given
    lnl.fit(data['X_train'], data['s'], inverse_noise_matrix=inm)
    score_inm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(
        seed=seed,
        prune_count_method=prune_count_method,
    )
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_inm
    else:
        assert (score < score_inm + 1e-4)
def test_rp():
    rp = LearningWithNoisyLabels(clf=LogisticRegression(
        multi_class='auto', solver='lbfgs', random_state=seed))
    rp.fit(data["X_train"], data["s"])
    score = rp.score(data["X_test"], data["y_test"])
    print(score)
    # Check that this runs without error.
    assert (True)
def test_clf_fit_nm():
    lnl = LearningWithNoisyLabels()
    # Example of a bad noise matrix (impossible to learn from)
    nm = np.array([[0, 1], [1, 0]])
    try:
        lnl.fit(X=np.arange(3), s=np.array([0, 0, 1]), noise_matrix=nm)
    except Exception as e:
        assert ('Trace(noise_matrix)' in str(e))
        with pytest.raises(ValueError) as e:
            lnl.fit(X=np.arange(3), s=np.array([0, 0, 1]), noise_matrix=nm)
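# Why that noise matrix is impossible to learn from: its trace is 0, i.e. no
# class ever keeps its own label, and cleanlab requires trace(noise_matrix) > 1
# for the latent estimates to be identifiable (hence 'Trace(noise_matrix)' in
# the raised error). A quick numeric illustration (not part of the test suite):
def _trace_illustration():
    nm_bad = np.array([[0, 1], [1, 0]])
    nm_ok = np.array([[0.9, 0.2],
                      [0.1, 0.8]])      # columns still sum to 1
    assert np.trace(nm_bad) == 0        # pure label flipping -> rejected
    assert np.trace(nm_ok) > 1          # learnable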
def test_pred_and_pred_proba():
    lnl = LearningWithNoisyLabels()
    lnl.fit(data['X_train'], data['s'])
    n = np.shape(data['y_test'])[0]
    m = len(np.unique(data['y_test']))
    pred = lnl.predict(data['X_test'])
    probs = lnl.predict_proba(data['X_test'])
    # Just check that these functions return what we expect
    assert (np.shape(pred)[0] == n)
    assert (np.shape(probs) == (n, m))
Example #8
def train_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)

    # Pseudo-label only the corrupted training points flagged by cleanlab
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    # Retrain the base model on the cleaned training data
    model.fit(X_train, y_train_pseudo)

    return model.score(X_test, y_test)
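# train_noisy_to_pseudo relies on module-level globals (baseclf, params, seed)
# that this listing never shows. A hedged guess at the minimal setup it
# assumes; the concrete classifier and kwargs here are placeholders:
from multiprocessing import cpu_count                # used by the clf=None branch
from sklearn.linear_model import LogisticRegression

baseclf = LogisticRegression                         # assumed base classifier class
params = {"solver": "lbfgs", "multi_class": "auto"}  # assumed constructor kwargs
seed = 0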
def test_fit_psx():
    from cleanlab.latent_estimation import estimate_cv_predicted_probabilities
    lnl = LearningWithNoisyLabels()
    psx = estimate_cv_predicted_probabilities(
        X=data['X_train'],
        labels=data['y_train'],
    )
    lnl.fit(X=data['X_train'], s=data['y_train'], psx=psx)
    score_with_psx = lnl.score(data['X_test'], data['y_test'])
    lnl = LearningWithNoisyLabels()
    lnl.fit(
        X=data['X_train'],
        s=data['y_train'],
    )
    score_no_psx = lnl.score(data['X_test'], data['y_test'])
    assert (abs(score_with_psx - score_no_psx) < 1e-6)
Example #10
def __model_build_noisy_pseudo(X_train, y_train, X_test, alg, seed):
    model = GaussianNB()
    if alg == 'Logistic':
        model = LogisticRegression(multi_class='auto')

    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)

    # Pseudo-labelling
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    y_test_pseudo = clf.predict(X_test)
    y_pseudo = np.hstack([y_train_pseudo, y_test_pseudo])
    X_for_pseudo = np.vstack([X_train, X_test])
    model.fit(X_for_pseudo, y_pseudo)
    return model.predict(X_test)
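# A self-contained smoke test for __model_build_noisy_pseudo above, supplying
# the imports its body needs. This driver is an illustration, not part of the
# original example; the clean y_train merely stands in for noisy labels.
from multiprocessing import cpu_count

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from cleanlab.classification import LearningWithNoisyLabels

X, y = make_classification(n_samples=400, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
pred = __model_build_noisy_pseudo(X_train, y_train, X_test, 'Logistic', seed=1)
print("held-out accuracy:", (pred == y_test).mean())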
Example #11
def train_test_and_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)

    # Pseudo-label both the corrupted training points and the test set
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    y_test_pseudo = clf.predict(X_test)
    y_pseudo = np.hstack([y_train_pseudo, y_test_pseudo])
    X_for_pseudo = sp.vstack([X_train, X_test])

    # Train on all data, pseudo-labels included
    model.fit(X_for_pseudo, y_pseudo)

    return model.score(X_test, y_test)
def test_fit_with_nm(
    seed=0,
    used_by_another_test=False,
):
    lnl = LearningWithNoisyLabels(seed=seed)
    nm = data['noise_matrix']
    # Learn with noisy labels with noise matrix given
    lnl.fit(data['X_train'], data['s'], noise_matrix=nm)
    score_nm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=seed)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_nm
    else:
        assert (score < score_nm + 1e-4)
def test_no_fit_sample_weight():
    class Struct():
        def fit(self, X, y):
            pass

        def predict_proba(self, X):
            pass

        def predict(self, X):
            return data['y_test']

    n = np.shape(data['y_test'])[0]
    m = len(np.unique(data['y_test']))
    psx = np.zeros(shape=(n, m))
    lnl = LearningWithNoisyLabels(clf=Struct())
    lnl.fit(data['X_train'],
            data['y_train'],
            psx=psx,
            noise_matrix=data['noise_matrix'])
    # If we make it here, without any error:
    assert (True)
Example #14
# original lr f1

print('WITHOUT confident learning,', end=" ")

clf.fit(X_train, s)
pred = clf.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print("\nNow we show improvement using cleanlab to characterize the noise")
print(
    "and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (psx not given),', end=" ")
rp = LearningWithNoisyLabels(clf=clf)
rp.fit(X_train, s)
pred = rp.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print('WITH confident learning (psx given),', end=" ")
rp.fit(X=X_train, s=s, psx=psx)
pred = rp.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print('WITH all label right,', end=" ")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print("-------------------")
rp_score = f1_score(y_test,
                    rp.fit(X_train, s, psx=psx).predict(X_test),
                    average='micro')
Example #15
def train_without_noisy_labels(X_train, y_train, X_test, y_test, clf=None):
    if clf is None:
        model = baseclf(**params)
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
Example #16
def ret_trainedCLclass(X_train, y_train, X_test, y_test):
    model = baseclf(**params)
    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    return clf
Example #17
# Work around indexing bug
X_train = X_train.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)
# A_test = A_test.map({ 0:"female", 1:"male"})

# flip across different groups
Y_noised = flip(Y_train, A_train, error_rate=error_rate)
noise_matrix = generate_noise_matrix(Y_noised, Y_train)
est_error_rate = estimation(X_train.values, Y_noised, A_train.values, ngroups=2**args.ngroups)
print(f"True error rate is {error_rate}.\nEstimated error rate is {est_error_rate}.")

# Learning with Noisy Labels
lnl = LearningWithNoisyLabels(clf=LogisticRegression())
lnl.fit(X=X_train.values, s=Y_noised, noise_matrix=noise_matrix)
Y_lnlt = lnl.predict(X_train.values).astype(int)
lnl.fit(X=X_train.values, s=Y_noised)
Y_lnle = lnl.predict(X_train.values).astype(int)
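# flip, generate_noise_matrix and estimation above are this project's own
# helpers, not cleanlab APIs. As a hypothetical reconstruction, the call
# generate_noise_matrix(Y_noised, Y_train) reads naturally as the empirical,
# column-stochastic P(observed | true):
def generate_noise_matrix_sketch(s, y, K=2):
    """Empirical P(s = i | y = j): tally (observed, true) label pairs,
    then normalize each column of counts to sum to 1."""
    counts = np.zeros((K, K))
    for si, yi in zip(s, y):
        counts[int(si), int(yi)] += 1
    return counts / counts.sum(axis=0, keepdims=True)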


def run_corrupt(fairness_constraints):
    all_results = {}
    all_results['eps'] = fairness_constraints
    all_results['accuracy'] = {
        'train': [],
        'test': []
    }

    all_results['violation'] = {
        'train': [],
Example #18
# original lr f1

print('WITHOUT confident learning,', end=" ")

clf.fit(X_train, s)
pred = clf.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print("\nNow we show improvement using cleanlab to characterize the noise")
print(
    "and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (psx not given),', end=" ")
rp = LearningWithNoisyLabels(clf=clf)
rp.fit(X_train, s)
pred = rp.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print('WITH confident learning (psx given),', end=" ")
rp.fit(X=X_train, s=s, psx=psx)
pred = rp.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print('WITH all label right,', end=" ")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print("-------------------")
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
Example #19
class AutoPUClassifier_cleanlab:
    def __init__(self):
        self.models = []
        self.sample_rto = 4

    def fit(self, X_full, y_full, time_remain):

        start_fit = time.time()
        # SEED = 2019

        # for SEED in range(2019, self.iter + 2019):
        SEED = 2019
        budget = time_remain - (time.time() - start_fit)

        best_iter = []
        while True:
            try:
                print(SEED, budget)
                round_start = time.time()

                self.hyper_seed = SEED
                params = {
                    "objective": "binary",
                    "metric": "auc",
                    "verbosity": -1,
                    "seed": self.hyper_seed,
                    "num_threads": 4,
                    "num_boost_round": 500
                }

                X, y = downsampling(X_full,
                                    y_full,
                                    sum(y_full) * self.sample_rto,
                                    seed=self.hyper_seed)
                # X_sample, y_sample = sample(X, y, 30000, random_state=self.hyper_seed)
                hyperparams = self._hyperopt(X,
                                             y,
                                             params,
                                             random_state=self.hyper_seed)

                X_train, X_val, y_train, y_val = train_test_split(
                    X, y, test_size=0.1, random_state=self.hyper_seed)

                watchlist = [(X_train, y_train), (X_val, y_val)]

                _model = lgb.LGBMClassifier(**hyperparams, **params)
                _model.fit(X_train,
                           y_train,
                           early_stopping_rounds=30,
                           eval_set=watchlist,
                           verbose=100)

                params["num_boost_round"] = _model.best_iteration_
                best_iter.append(_model.best_iteration_)

                confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
                    X=X.values,
                    s=1 * (y.values == 1),
                    clf=lgb.LGBMClassifier(**hyperparams, **params),
                    seed=SEED,
                )

                est_py, est_nm, est_inv = estimate_latent(
                    confident_joint, s=1 * (y.values == 1))

                self.model = LearningWithNoisyLabels(
                    lgb.LGBMClassifier(**hyperparams, **params),
                    seed=1,
                    cv_n_folds=5,
                    prune_method="both",  # 'prune_by_noise_rate',
                    converge_latent_estimates=True,
                    pulearning=1)

                self.model.fit(
                    X.values,
                    1 * (y.values == 1),
                    psx=psx,
                    thresholds=None,
                    noise_matrix=est_nm,
                    inverse_noise_matrix=est_inv,
                )

                self.models.append(self.model)

                if SEED == 2019:
                    single_round = time.time() - round_start

                budget -= (time.time() - round_start)
                if budget <= single_round * 5:
                    break

                SEED += 1
            except Exception:
                if SEED == 2019:
                    single_round = time.time() - round_start

                budget -= (time.time() - round_start)
                if budget <= single_round * 5:
                    break

                SEED += 1

        print(best_iter)
        return self

    def predict(self, X, time_remain):
        budget = copy.deepcopy(time_remain)
        round_start = time.time()
        tick = 0

        for idx, model in enumerate(self.models):
            p = model.predict_proba(X)[:, 1]
            if idx == 0:
                prediction = p
            else:
                prediction = np.vstack((prediction, p))

            single_round = (time.time() - round_start) / (idx + 1)
            budget -= (time.time() - round_start)

            if budget <= single_round * 5:
                break
            tick += 1
        # import pdb;pdb.set_trace()
        if tick > 0 and len(self.models) > 1:
            return np.mean(prediction, axis=0)
        else:
            return prediction

    def _hyperopt(self, X, y, params, random_state=1):
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.5, random_state=random_state)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        space = {
            "learning_rate":
            hp.loguniform("learning_rate", np.log(0.01), np.log(0.5)),
            "max_depth":
            hp.choice("max_depth", [-1, 2, 3, 4, 5, 6]),
            "num_leaves":
            hp.choice("num_leaves", np.linspace(10, 200, 50, dtype=int)),
            "feature_fraction":
            hp.quniform("feature_fraction", 0.5, 1.0, 0.1),
            "bagging_fraction":
            hp.quniform("bagging_fraction", 0.5, 1.0, 0.1),
            "bagging_freq":
            hp.choice("bagging_freq", np.linspace(0, 50, 10, dtype=int)),
            "reg_alpha":
            hp.uniform("reg_alpha", 0, 2),
            "reg_lambda":
            hp.uniform("reg_lambda", 0, 2),
            "min_child_weight":
            hp.uniform('min_child_weight', 0.5, 10),
        }

        def objective(hyperparams):
            model = lgb.train({
                **params,
                **hyperparams
            },
                              train_data,
                              50,
                              valid_data,
                              early_stopping_rounds=30,
                              verbose_eval=0)
            score = model.best_score["valid_0"][params["metric"]]

            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    trials=trials,
                    algo=tpe.suggest,
                    max_evals=10,
                    verbose=1,
                    rstate=np.random.RandomState(1))

        hyperparams = space_eval(space, best)
        log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}")
        return hyperparams
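# The estimate -> fit sequence above spells out what LearningWithNoisyLabels
# otherwise does internally: cross-validated out-of-sample probabilities and
# the confident joint, then the latent estimates, then prune-and-refit. A
# compact sketch of the same decomposition on a plain sklearn model (the
# generated data below is an illustration, not code from the original class):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.latent_estimation import (
    estimate_confident_joint_and_cv_pred_proba, estimate_latent)

X_train, s = make_classification(n_samples=300, random_state=0)  # s: noisy-label stand-in
base = LogisticRegression(solver='lbfgs', multi_class='auto')
cj, psx = estimate_confident_joint_and_cv_pred_proba(X=X_train, s=s, clf=base)
est_py, est_nm, est_inv = estimate_latent(cj, s=s)   # P(y), P(s|y), P(y|s)
lnl = LearningWithNoisyLabels(clf=base)
lnl.fit(X_train, s, psx=psx, noise_matrix=est_nm, inverse_noise_matrix=est_inv)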
    print("Plotting is only supported in an iPython interface.")

# In[4]:

print('WITHOUT confident learning,', end=" ")
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
_ = clf.fit(X_train, s)
pred = clf.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print("\nNow we show improvement using cleanlab to characterize the noise")
print(
    "and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (noise matrix given),', end=" ")
_ = rp.fit(X_train, s, noise_matrix=noise_matrix)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (noise / inverse noise matrix given),', end=" ")
inv = compute_inv_noise_matrix(py, noise_matrix)
_ = rp.fit(X_train, s, noise_matrix=noise_matrix, inverse_noise_matrix=inv)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (noise not given),', end=" ")
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf=clf, seed=seed)
_ = rp.fit(X_train, s)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))
Example #21
def clean_labels(X: pd.DataFrame,
                 y,
                 count_start,
                 pulearning=None,
                 strategy="cut",
                 round=0,
                 early_stop=False):
    count = count_start
    from preprocess import sample
    cols = [
        c for c in X if len(c.split("_")) == 2 and (
            c.startswith("c_") or c.startswith("n_"))
    ]
    print(cols)

    while count <= count_start + round:
        try:
            params = {
                "objective": "binary",
                "metric": "auc",
                "verbosity": -1,
                "seed": count,
                "num_threads": 4,
                "num_boost_round": 50
            }

            X_sample, y_sample = sample(X[cols], y, 30000, random_state=count)
            hyperparams = _hyperopt(X_sample,
                                    y_sample,
                                    params,
                                    random_state=count)
            # confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
            #     X=X.values,
            #     s=1 * (y.values == 1),
            #     clf=lgb.LGBMClassifier(**hyperparams, **params),  # default, you can use any classifier
            #     seed=count,
            # )
            # est_py, est_nm, est_inv = estimate_latent(confident_joint, s=1 * (y.values == 1))

            model = LearningWithNoisyLabels(
                lgb.LGBMClassifier(**hyperparams, **params),
                seed=count,
                cv_n_folds=5,
                prune_method="both",  # 'prune_by_noise_rate',
                converge_latent_estimates=True,
                pulearning=pulearning)
            print(X.shape, len(y))
            # import pdb;pdb.set_trace()
            noisy, noise_matrix, inverse_noise_matrix, confident_joint, psx = model.fit(
                X[cols].values, 1 * (y.values == 1), thresholds=None)
            # noise_matrix=est_nm,
            # inverse_noise_matrix=est_inv, )
            if count == count_start:
                rou_0 = noise_matrix[1, 0]
                rou_1 = noise_matrix[0, 1]

                print(rou_0, rou_1)
                if early_stop and rou_0 + rou_1 <= 0.9:
                    break
            # Stop when cleanlab flags no noisy labels
            if np.sum(noisy) <= 0:
                break
            print(np.sum(noisy))

            # Both strategies currently drop the rows flagged as noisy
            X = X[~noisy]
            y = y[~noisy]

        except Exception as exp:
            print("error:", exp)
        finally:
            count += 1

    return X, y, rou_0 + rou_1, rou_0, rou_1
Example #22
    for ratio in ratioList:
        clf = RandomForestClassifier()
        clfNoise = LearningWithNoisyLabels(clf=RandomForestClassifier())
        newTrainX = trainX
        newTrainY = copy.deepcopy(trainY)
        for i in range(len(newTrainX)):
            if (random.random() < ratio):
                while True:
                    noiseLabel = random.randint(1, 4) - 1
                    # print('trainY[i] : ', newTrainY[i], 'noiseLabel :',noiseLabel)
                    if newTrainY[i] != noiseLabel:
                        newTrainY[i] = noiseLabel
                        break

        clf.fit(newTrainX, newTrainY)
        clfNoise.fit(newTrainX, newTrainY)
        # importances = clf.feature_importances_
        # indices = np.argsort(importances)[::-1]
        # for f in range(trainX.shape[1]):
        #     print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

        scores = getResult(
            testX, testY, clf,
            'plain-' + str(ratio))  # accscore, aucscore, recallscore, f1score
        noiseScores = getResult(testX, testY, clfNoise,
                                'confident-learning-' + str(ratio))
        scoreList.append(scores)
        noiseScoreList.append(noiseScores)
    scoreList = np.array(scoreList)
    noiseScoreList = np.array(noiseScoreList)
    titleList = ['accscore', 'aucscore', 'recallscore', 'f1score']
    for i in range(len(scoreList[0])):
Example #23
# In[19]:

print('WITHOUT confident learning,', end=" ")
clf = logreg()
_ = clf.fit(X_train, s)
pred = clf.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print("\nNow we show the improvement using confident learning to characterize the noise")
print("and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (noise matrix given),', end=" ")
_ = rp.fit(X_train, s, noise_matrix=noise_matrix)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (noise / inverse noise matrix given),', end=" ")
_ = rp.fit(X_train, s, noise_matrix=noise_matrix,
           inverse_noise_matrix=compute_inv_noise_matrix(py, noise_matrix))
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (using latent noise matrix estimation),', end=" ")
rp = LearningWithNoisyLabels(clf=logreg(), seed=seed,
                             prune_count_method='inverse_nm_dot_s')
_ = rp.fit(X_train, s)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (using calibrated confident joint),', end=" ")
Example #24
        # Create three copies of the classifier.
        # perf_label_clf - will be trained on the hidden, noise-free labels
        # noisy_clf - will be trained on the noisy labels
        # noisy_clf_w_rp - will be trained on the noisy labels using LearningWithNoisyLabels

        clfs = [copy.deepcopy(clf) for i in range(len(experiments))]
        perf_label_clf, noisy_clf, noisy_clf_w_rp = clfs
        # Classifier (trained without label errors)
        perf_label_clf.fit(X_train, y_train)
        perf_label_score = perf_label_clf.score(X_test, y_test)
        # Classifier (trained with label errors)
        noisy_clf.fit(X_train, y_train_w_errors)
        noisy_score = noisy_clf.score(X_test, y_test)
        # Classifier + RP (trained with label errors)
        rp = LearningWithNoisyLabels(noisy_clf_w_rp)
        rp.fit(X_train, y_train_w_errors)
        noisy_score_w_rp = rp.clf.score(X_test, y_test)

        # Store results for each classifier in a dict with key = clf_name.
        clf_results[name] = {
            'clfs': clfs,
            "perf_label_score": perf_label_score,
            "noisy_score": noisy_score,
            "noisy_score_w_rp": noisy_score_w_rp,
        }

    results.append({
        "X": X,
        "X_train": X_train,
        "y_train": y_train,
        "y_train_w_errors": y_train_w_errors,
Example #25
def denoiseA(data_cor, rho, mode):
    '''
    Denoise the corrupted sensitive attribute using RankPrune.
    '''

    rho_a_plus, rho_a_minus = rho

    dataX = data_cor[0]
    cor_dataA = data_cor[2]
    # dataA = data_cor[5]
    #
    # auc3, auc4 = None, None

    noise_matrix = np.array([[1 - rho_a_minus, rho_a_plus],
                             [rho_a_minus, 1 - rho_a_plus]])
    # noise_matrix = None

    lnl = LearningWithNoisyLabels(clf=LogisticRegression(
        random_state=0, solver='lbfgs', multi_class='auto'))
    lnl.fit(X=dataX.values, s=cor_dataA.values, noise_matrix=noise_matrix)

    # Logistic Regression Baseline
    # lnl = clf=LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
    # lnl.fit(X = dataX.values, y = cor_dataA.values)

    denoised_dataA = pd.Series(lnl.predict(dataX.values))
    data_denoised = copy.deepcopy(data_cor)
    data_denoised[2] = denoised_dataA

    # print(lnl.noise_matrix, rho_a_plus, rho_a_minus)

    # Check recovery accuracy
    # auc1 = np.mean(dataA.values==cor_dataA.values)
    # auc2 = np.mean(dataA.values==denoised_dataA.values)

    # The following is under development.
    rho_est = None
    data_denoised_est = None

    if mode == 'six':

        lnl2 = LearningWithNoisyLabels(
            LogisticRegression(random_state=0,
                               solver='lbfgs',
                               multi_class='auto'))
        lnl2.fit(X=dataX.values, s=cor_dataA.values)

        denoised_dataA_est = pd.Series(lnl2.predict(dataX.values))
        data_denoised_est = copy.deepcopy(data_cor)
        data_denoised_est[2] = denoised_dataA_est

        rho_a_plus_est = lnl2.noise_matrix[0][1]
        rho_a_minus_est = lnl2.noise_matrix[1][0]
        rho_est = [rho_a_plus_est, rho_a_minus_est]

        # print(lnl2.noise_matrix, rho_a_plus_est, rho_a_minus_est)

        # lnl3 = LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
        # lnl3.fit(dataX.values, cor_dataA.values)

        # pred_dataA = pd.Series(lnl3.predict(dataX.values))
        # auc3 = np.mean(dataA.values==denoised_dataA_est.values)
        # auc4 = np.mean(dataA.values==pred_dataA.values)

    # print('auc:', auc1, auc2, auc3, auc4)

    return data_denoised, data_denoised_est, rho_est
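# The 2x2 noise_matrix built in denoiseA follows cleanlab's column-stochastic
# convention: column j holds P(observed = i | true = j), so each column sums
# to 1. rho_a_minus = P(s=1 | a=0) flips negatives; rho_a_plus = P(s=0 | a=1)
# flips positives. A quick sanity check with illustrative rates:
import numpy as np

rho_plus_demo, rho_minus_demo = 0.2, 0.1
nm_demo = np.array([[1 - rho_minus_demo, rho_plus_demo],
                    [rho_minus_demo, 1 - rho_plus_demo]])
assert np.allclose(nm_demo.sum(axis=0), 1.0)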
Example #26
print('The actual, latent, underlying noise matrix.')
print_noise_matrix(noise_matrix)
print('Our estimate of the noise matrix.')
print_noise_matrix(est_noise_matrix)
print()
print('The actual, latent, underlying joint distribution matrix.')
cleanlab.util.print_joint_matrix(true_joint_distribution_of_label_errors)
print('Our estimate of the joint distribution matrix.')
cleanlab.util.print_joint_matrix(est_joint)
print("Accuracy Comparison")
print("-------------------")
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
baseline_score = accuracy_score(y_test, clf.fit(X_train, s).predict(X_test))
print("Logistic regression:", baseline_score)
rp = LearningWithNoisyLabels(seed=seed)
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
print("Logistic regression (+rankpruning):", rp_score)
diff = rp_score - baseline_score
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
print(
    'Fit on denoised data without re-weighting:',
    accuracy_score(
        y_test,
        clf.fit(X_train[~idx_errors], s[~idx_errors]).predict(X_test)))

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt

    print("\n\n\n\n\n\n")
Example #27
        parameter.return_path('GodClass', p[-1]))
    import random
    from multiprocessing import cpu_count

    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
    from cleanlab.classification import LearningWithNoisyLabels
    index = pd.read_csv(parameter.return_path(
        'GodClass', p[-1])).columns.values[0].split('.')
    df = pd.read_csv(parameter.return_path('GodClass', p[-1]),
                     sep=',',
                     skiprows=1,
                     header=None,
                     names=index,
                     quoting=3).drop(columns='name').replace('(.*)"(.*)',
                                                             r'\1\2',
                                                             regex=True)
    model = GaussianNB()
    clf = LearningWithNoisyLabels(clf=model, seed=1, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    # X_with_noise = df[:][clf.noise_mask]
    print(type(clf.noise_mask))
    print(np.where(clf.noise_mask))
    print('\n\n')
    model = LogisticRegression(multi_class='auto')
    clf = LearningWithNoisyLabels(clf=model, seed=1, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    # X_with_noise2 = df[:][clf.noise_mask]
    print(np.where(clf.noise_mask))