def test_fit_with_inm(
    sparse,
    seed=SEED,
    used_by_another_test=False,
):
    data = SPARSE_DATA if sparse else DATA
    lnl = LearningWithNoisyLabels(seed=seed)
    inm = compute_inv_noise_matrix(
        data["py"],
        data["noise_matrix"],
        data["ps"],
    )
    # Learn with noisy labels with inverse noise matrix given
    lnl.fit(data['X_train'], data['s'], inverse_noise_matrix=inm)
    score_inm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=seed)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_inm
    else:
        assert (score < score_inm + 1e-4)
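# The tests on this page reference module-level fixtures SEED, DATA and
# SPARSE_DATA that the listing omits. A minimal sketch, not from the original
# test module, of how such a fixture could be built with cleanlab's own
# noise-generation helpers; sizes, the trace value, and key names are
# assumptions.
import numpy as np
import scipy.sparse as sp
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from cleanlab.noise_generation import (generate_noise_matrix_from_trace,
                                       generate_noisy_labels)

SEED = 0

def make_data(seed=SEED):
    X, y = make_classification(n_samples=400, n_classes=3, n_informative=6,
                               random_state=seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed)
    py = np.bincount(y_train) / float(len(y_train))    # latent prior P(y)
    noise_matrix = generate_noise_matrix_from_trace(   # column-stochastic P(s|y)
        3, 2.4, py=py, seed=seed)
    s = generate_noisy_labels(y_train, noise_matrix)   # observed noisy labels
    ps = np.bincount(s, minlength=3) / float(len(s))   # observed prior P(s)
    return {"X_train": X_train, "X_test": X_test, "y_train": y_train,
            "y_test": y_test, "s": s, "py": py, "ps": ps,
            "noise_matrix": noise_matrix}

DATA = make_data()
SPARSE_DATA = dict(DATA, X_train=sp.csr_matrix(DATA["X_train"]),
                   X_test=sp.csr_matrix(DATA["X_test"]))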
Example #2
def __model_build_noisy(X_train, y_train, X_test, alg, seed):
    model = GaussianNB()
    if alg == 'Logistic':
        model = LogisticRegression(multi_class='auto')
    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    return clf.predict(X_test)
def test_clf_fit_nm_inm(sparse):
    data = SPARSE_DATA if sparse else DATA
    lnl = LearningWithNoisyLabels(seed=SEED)
    nm = data['noise_matrix']
    inm = compute_inv_noise_matrix(
        data["py"],
        nm,
        data["ps"],
    )
    lnl.fit(
        X=data['X_train'],
        s=data['s'],
        noise_matrix=nm,
        inverse_noise_matrix=inm,
    )
    score_nm_inm = lnl.score(data['X_test'], data['y_test'])

    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=SEED)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    assert (score < score_nm_inm + 1e-4)
def test_fit_with_inm(
    prune_count_method='inverse_nm_dot_s',
    seed=0,
    used_by_another_test=False,
):
    lnl = LearningWithNoisyLabels(
        seed=seed,
        prune_count_method=prune_count_method,
    )
    inm = compute_inv_noise_matrix(
        data["py"],
        data["noise_matrix"],
        data["ps"],
    )
    # Learn with noisy labels with inverse noise matrix given
    lnl.fit(data['X_train'], data['s'], inverse_noise_matrix=inm)
    score_inm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the inv noise matrix.
    lnl2 = LearningWithNoisyLabels(
        seed=seed,
        prune_count_method=prune_count_method,
    )
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_inm
    else:
        assert (score < score_inm + 1e-4)
def test_rp():
    rp = LearningWithNoisyLabels(clf=LogisticRegression(
        multi_class='auto', solver='lbfgs', random_state=seed))
    rp.fit(data["X_train"], data["s"])
    score = rp.score(data["X_test"], data["y_test"])
    print(score)
    # Check that this runs without error.
    assert (True)
def test_clf_fit_nm():
    lnl = LearningWithNoisyLabels()
    # Example of a bad noise matrix (impossible to learn from)
    nm = np.array([[0, 1], [1, 0]])
    try:
        lnl.fit(X=np.arange(3), s=np.array([0, 0, 1]), noise_matrix=nm)
    except Exception as e:
        assert ('Trace(noise_matrix)' in str(e))
        with pytest.raises(ValueError) as e:
            lnl.fit(X=np.arange(3), s=np.array([0, 0, 1]), noise_matrix=nm)
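# Why that noise matrix is impossible to learn from: its trace is 0, i.e. no
# class ever keeps its own label, and cleanlab requires trace(noise_matrix) > 1
# for the latent estimates to be identifiable (hence 'Trace(noise_matrix)' in
# the raised error). A quick numeric illustration (not part of the test suite):
def _trace_illustration():
    nm_bad = np.array([[0, 1], [1, 0]])
    nm_ok = np.array([[0.9, 0.2],
                      [0.1, 0.8]])      # columns still sum to 1
    assert np.trace(nm_bad) == 0        # pure label flipping -> rejected
    assert np.trace(nm_ok) > 1          # learnable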
def test_pred_and_pred_proba():
    lnl = LearningWithNoisyLabels()
    lnl.fit(data['X_train'], data['s'])
    n = np.shape(data['y_test'])[0]
    m = len(np.unique(data['y_test']))
    pred = lnl.predict(data['X_test'])
    probs = lnl.predict_proba(data['X_test'])
    # Just check that these functions return what we expect
    assert (np.shape(pred)[0] == n)
    assert (np.shape(probs) == (n, m))
Example #8
def train_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)

    # Pseudo-label only the corrupted training points flagged by cleanlab
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    # Retrain the base model on the cleaned training data
    model.fit(X_train, y_train_pseudo)

    return model.score(X_test, y_test)
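# train_noisy_to_pseudo relies on module-level globals (baseclf, params, seed)
# that this listing never shows. A hedged guess at the minimal setup it
# assumes; the concrete classifier and kwargs here are placeholders:
from multiprocessing import cpu_count                # used by the clf=None branch
from sklearn.linear_model import LogisticRegression

baseclf = LogisticRegression                         # assumed base classifier class
params = {"solver": "lbfgs", "multi_class": "auto"}  # assumed constructor kwargs
seed = 0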
def test_fit_psx():
    from cleanlab.latent_estimation import estimate_cv_predicted_probabilities
    lnl = LearningWithNoisyLabels()
    psx = estimate_cv_predicted_probabilities(
        X=data['X_train'],
        labels=data['y_train'],
    )
    lnl.fit(X=data['X_train'], s=data['y_train'], psx=psx)
    score_with_psx = lnl.score(data['X_test'], data['y_test'])
    lnl = LearningWithNoisyLabels()
    lnl.fit(
        X=data['X_train'],
        s=data['y_train'],
    )
    score_no_psx = lnl.score(data['X_test'], data['y_test'])
    assert (abs(score_with_psx - score_no_psx) < 1e-6)
Example #10
def __model_build_noisy_pseudo(X_train, y_train, X_test, alg, seed):
    model = GaussianNB()
    if alg == 'Logistic':
        model = LogisticRegression(multi_class='auto')

    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)

    # Pseudo-labelling
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    y_test_pseudo = clf.predict(X_test)
    y_pseudo = np.hstack([y_train_pseudo, y_test_pseudo])
    X_for_pseudo = np.vstack([X_train, X_test])
    model.fit(X_for_pseudo, y_pseudo)
    return model.predict(X_test)
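# A self-contained smoke test for __model_build_noisy_pseudo above, supplying
# the imports its body needs. This driver is an illustration, not part of the
# original example; the clean y_train merely stands in for noisy labels.
from multiprocessing import cpu_count

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from cleanlab.classification import LearningWithNoisyLabels

X, y = make_classification(n_samples=400, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
pred = __model_build_noisy_pseudo(X_train, y_train, X_test, 'Logistic', seed=1)
print("held-out accuracy:", (pred == y_test).mean())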
Example #11
def train_test_and_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)

    # Pseudo-label both the corrupted training points and the test set
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    y_test_pseudo = clf.predict(X_test)
    y_pseudo = np.hstack([y_train_pseudo, y_test_pseudo])
    X_for_pseudo = sp.vstack([X_train, X_test])

    # Train on all data, pseudo-labels included
    model.fit(X_for_pseudo, y_pseudo)

    return model.score(X_test, y_test)
def test_fit_with_nm(
    seed=0,
    used_by_another_test=False,
):
    lnl = LearningWithNoisyLabels(seed=seed)
    nm = data['noise_matrix']
    # Learn with noisy labels with noise matrix given
    lnl.fit(data['X_train'], data['s'], noise_matrix=nm)
    score_nm = lnl.score(data['X_test'], data['y_test'])
    # Learn with noisy labels and estimate the noise matrix.
    lnl2 = LearningWithNoisyLabels(seed=seed)
    lnl2.fit(
        data['X_train'],
        data['s'],
    )
    score = lnl2.score(data['X_test'], data['y_test'])
    if used_by_another_test:
        return score, score_nm
    else:
        assert (score < score_nm + 1e-4)
def test_no_fit_sample_weight():
    class Struct():
        def fit(self, X, y):
            pass

        def predict_proba(self, X):
            pass

        def predict(self, X):
            return data['y_test']

    n = np.shape(data['y_test'])[0]
    m = len(np.unique(data['y_test']))
    psx = np.zeros(shape=(n, m))
    lnl = LearningWithNoisyLabels(clf=Struct())
    lnl.fit(data['X_train'],
            data['y_train'],
            psx=psx,
            noise_matrix=data['noise_matrix'])
    # If we make it here, without any error:
    assert (True)
Example #14
# original lr f1

print('WITHOUT confident learning,', end=" ")

clf.fit(X_train, s)
pred = clf.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print("\nNow we show improvement using cleanlab to characterize the noise")
print(
    "and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (psx not given),', end=" ")
rp = LearningWithNoisyLabels(clf=clf)
rp.fit(X_train, s)
pred = rp.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print('WITH confident learning (psx given),', end=" ")
rp.fit(X=X_train, s=s, psx=psx)
pred = rp.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print('WITH all label right,', end=" ")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))

print("-------------------")
rp_score = f1_score(y_test,
                    rp.fit(X_train, s, psx=psx).predict(X_test),
                    average='micro')
Example #15
def train_without_noisy_labels(X_train, y_train, X_test, y_test, clf=None):
    if clf is None:
        model = baseclf(**params)
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
        clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
Example #16
def ret_trainedCLclass(X_train, y_train, X_test, y_test):
    model = baseclf(**params)
    clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    return clf
Example #17
# Work around indexing bug
X_train = X_train.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)
# A_test = A_test.map({ 0:"female", 1:"male"})

# flip across different groups
Y_noised = flip(Y_train, A_train, error_rate=error_rate)
noise_matrix = generate_noise_matrix(Y_noised, Y_train)
est_error_rate = estimation(X_train.values, Y_noised, A_train.values, ngroups=2**args.ngroups)
print(f"True error rate is {error_rate}.\nEstimated error rate is {est_error_rate}.")

# Learning with Noisy Labels
lnl = LearningWithNoisyLabels(clf=LogisticRegression())
lnl.fit(X=X_train.values, s=Y_noised, noise_matrix=noise_matrix)
Y_lnlt = lnl.predict(X_train.values).astype(int)
lnl.fit(X=X_train.values, s=Y_noised)
Y_lnle = lnl.predict(X_train.values).astype(int)
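# flip, generate_noise_matrix and estimation above are this project's own
# helpers, not cleanlab APIs. As a hypothetical reconstruction, the call
# generate_noise_matrix(Y_noised, Y_train) reads naturally as the empirical,
# column-stochastic P(observed | true):
def generate_noise_matrix_sketch(s, y, K=2):
    """Empirical P(s = i | y = j): tally (observed, true) label pairs,
    then normalize each column of counts to sum to 1."""
    counts = np.zeros((K, K))
    for si, yi in zip(s, y):
        counts[int(si), int(yi)] += 1
    return counts / counts.sum(axis=0, keepdims=True)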


def run_corrupt(fairness_constraints):
    all_results = {}
    all_results['eps'] = fairness_constraints
    all_results['accuracy'] = {
        'train': [],
        'test': []
    }

    all_results['violation'] = {
        'train': [],
Example #18
# original lr f1

print('WITHOUT confident learning,', end=" ")

clf.fit(X_train, s)
pred = clf.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print("\nNow we show improvement using cleanlab to characterize the noise")
print(
    "and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (psx not given),', end=" ")
rp = LearningWithNoisyLabels(clf=clf)
rp.fit(X_train, s)
pred = rp.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print('WITH confident learning (psx given),', end=" ")
rp.fit(X=X_train, s=s, psx=psx)
pred = rp.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print('WITH all label right,', end=" ")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("dataset test f1:", round(accuracy_score(pred, y_test), 4))

print("-------------------")
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
Example #19
class AutoPUClassifier_cleanlab:
    def __init__(self):
        self.models = []
        self.sample_rto = 4

    def fit(self, X_full, y_full, time_remain):

        start_fit = time.time()
        # SEED = 2019

        # for SEED in range(2019, self.iter + 2019):
        SEED = 2019
        budget = time_remain - (time.time() - start_fit)

        best_iter = []
        while True:
            try:
                print(SEED, budget)
                round_start = time.time()

                self.hyper_seed = SEED
                params = {
                    "objective": "binary",
                    "metric": "auc",
                    "verbosity": -1,
                    "seed": self.hyper_seed,
                    "num_threads": 4,
                    "num_boost_round": 500
                }

                X, y = downsampling(X_full,
                                    y_full,
                                    sum(y_full) * self.sample_rto,
                                    seed=self.hyper_seed)
                # X_sample, y_sample = sample(X, y, 30000, random_state=self.hyper_seed)
                hyperparams = self._hyperopt(X,
                                             y,
                                             params,
                                             random_state=self.hyper_seed)

                X_train, X_val, y_train, y_val = train_test_split(
                    X, y, test_size=0.1, random_state=self.hyper_seed)

                watchlist = [(X_train, y_train), (X_val, y_val)]

                _model = lgb.LGBMClassifier(**hyperparams, **params)
                _model.fit(X_train,
                           y_train,
                           early_stopping_rounds=30,
                           eval_set=watchlist,
                           verbose=100)

                params["num_boost_round"] = _model.best_iteration_
                best_iter.append(_model.best_iteration_)

                confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
                    X=X.values,
                    s=1 * (y.values == 1),
                    clf=lgb.LGBMClassifier(**hyperparams, **params),
                    seed=SEED,
                )

                est_py, est_nm, est_inv = estimate_latent(
                    confident_joint, s=1 * (y.values == 1))

                self.model = LearningWithNoisyLabels(
                    lgb.LGBMClassifier(**hyperparams, **params),
                    seed=1,
                    cv_n_folds=5,
                    prune_method="both",  # 'prune_by_noise_rate',
                    converge_latent_estimates=True,
                    pulearning=1)

                self.model.fit(
                    X.values,
                    1 * (y.values == 1),
                    psx=psx,
                    thresholds=None,
                    noise_matrix=est_nm,
                    inverse_noise_matrix=est_inv,
                )

                self.models.append(self.model)

                if SEED == 2019:
                    single_round = time.time() - round_start

                budget -= (time.time() - round_start)
                if budget <= single_round * 5:
                    break

                SEED += 1
            except Exception:
                if SEED == 2019:
                    single_round = time.time() - round_start

                budget -= (time.time() - round_start)
                if budget <= single_round * 5:
                    break

                SEED += 1

        print(best_iter)
        return self

    def predict(self, X, time_remain):
        budget = copy.deepcopy(time_remain)
        round_start = time.time()
        tick = 0

        for idx, model in enumerate(self.models):
            p = model.predict_proba(X)[:, 1]
            if idx == 0:
                prediction = p
            else:
                prediction = np.vstack((prediction, p))

            single_round = (time.time() - round_start) / (idx + 1)
            budget -= (time.time() - round_start)

            if budget <= single_round * 5:
                break
            tick += 1
        # import pdb;pdb.set_trace()
        if tick > 0 and len(self.models) > 1:
            return np.mean(prediction, axis=0)
        else:
            return prediction

    def _hyperopt(self, X, y, params, random_state=1):
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.5, random_state=random_state)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        space = {
            "learning_rate":
            hp.loguniform("learning_rate", np.log(0.01), np.log(0.5)),
            "max_depth":
            hp.choice("max_depth", [-1, 2, 3, 4, 5, 6]),
            "num_leaves":
            hp.choice("num_leaves", np.linspace(10, 200, 50, dtype=int)),
            "feature_fraction":
            hp.quniform("feature_fraction", 0.5, 1.0, 0.1),
            "bagging_fraction":
            hp.quniform("bagging_fraction", 0.5, 1.0, 0.1),
            "bagging_freq":
            hp.choice("bagging_freq", np.linspace(0, 50, 10, dtype=int)),
            "reg_alpha":
            hp.uniform("reg_alpha", 0, 2),
            "reg_lambda":
            hp.uniform("reg_lambda", 0, 2),
            "min_child_weight":
            hp.uniform('min_child_weight', 0.5, 10),
        }

        def objective(hyperparams):
            model = lgb.train({
                **params,
                **hyperparams
            },
                              train_data,
                              50,
                              valid_data,
                              early_stopping_rounds=30,
                              verbose_eval=0)
            score = model.best_score["valid_0"][params["metric"]]

            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    trials=trials,
                    algo=tpe.suggest,
                    max_evals=10,
                    verbose=1,
                    rstate=np.random.RandomState(1))

        hyperparams = space_eval(space, best)
        log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}")
        return hyperparams
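# The estimate -> fit sequence above spells out what LearningWithNoisyLabels
# otherwise does internally: cross-validated out-of-sample probabilities and
# the confident joint, then the latent estimates, then prune-and-refit. A
# compact sketch of the same decomposition on a plain sklearn model (the
# generated data below is an illustration, not code from the original class):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.latent_estimation import (
    estimate_confident_joint_and_cv_pred_proba, estimate_latent)

X_train, s = make_classification(n_samples=300, random_state=0)  # s: noisy-label stand-in
base = LogisticRegression(solver='lbfgs', multi_class='auto')
cj, psx = estimate_confident_joint_and_cv_pred_proba(X=X_train, s=s, clf=base)
est_py, est_nm, est_inv = estimate_latent(cj, s=s)   # P(y), P(s|y), P(y|s)
lnl = LearningWithNoisyLabels(clf=base)
lnl.fit(X_train, s, psx=psx, noise_matrix=est_nm, inverse_noise_matrix=est_inv)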
    print("Plotting is only supported in an iPython interface.")

# In[4]:

print('WITHOUT confident learning,', end=" ")
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
_ = clf.fit(X_train, s)
pred = clf.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print("\nNow we show improvement using cleanlab to characterize the noise")
print(
    "and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (noise matrix given),', end=" ")
_ = rp.fit(X_train, s, noise_matrix=noise_matrix)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (noise / inverse noise matrix given),', end=" ")
inv = compute_inv_noise_matrix(py, noise_matrix)
_ = rp.fit(X_train, s, noise_matrix=noise_matrix, inverse_noise_matrix=inv)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (noise not given),', end=" ")
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf=clf, seed=seed)
_ = rp.fit(X_train, s)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))
Example #21
def clean_labels(X: pd.DataFrame,
                 y,
                 count_start,
                 pulearning=None,
                 strategy="cut",
                 round=0,
                 early_stop=False):
    count = count_start
    from preprocess import sample
    cols = [
        c for c in X if len(c.split("_")) == 2 and (
            c.startswith("c_") or c.startswith("n_"))
    ]
    print(cols)

    while count <= count_start + round:
        try:
            params = {
                "objective": "binary",
                "metric": "auc",
                "verbosity": -1,
                "seed": count,
                "num_threads": 4,
                "num_boost_round": 50
            }

            X_sample, y_sample = sample(X[cols], y, 30000, random_state=count)
            hyperparams = _hyperopt(X_sample,
                                    y_sample,
                                    params,
                                    random_state=count)
            # confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
            #     X=X.values,
            #     s=1 * (y.values == 1),
            #     clf=lgb.LGBMClassifier(**hyperparams, **params),  # default, you can use any classifier
            #     seed=count,
            # )
            # est_py, est_nm, est_inv = estimate_latent(confident_joint, s=1 * (y.values == 1))

            model = LearningWithNoisyLabels(
                lgb.LGBMClassifier(**hyperparams, **params),
                seed=count,
                cv_n_folds=5,
                prune_method="both",  # 'prune_by_noise_rate',
                converge_latent_estimates=True,
                pulearning=pulearning)
            print(X.shape, len(y))
            # import pdb;pdb.set_trace()
            noisy, noise_matrix, inverse_noise_matrix, confident_joint, psx = model.fit(
                X[cols].values, 1 * (y.values == 1), thresholds=None)
            # noise_matrix=est_nm,
            # inverse_noise_matrix=est_inv, )
            if count == count_start:
                rou_0 = noise_matrix[1, 0]
                rou_1 = noise_matrix[0, 1]

                print(rou_0, rou_1)
                if early_stop and rou_0 + rou_1 <= 0.9:
                    break
            # Stop when cleanlab flags no noisy labels
            if np.sum(noisy) <= 0:
                break
            print(np.sum(noisy))

            # Both strategies currently drop the rows flagged as noisy
            X = X[~noisy]
            y = y[~noisy]

        except Exception as exp:
            print("error:", exp)
        finally:
            count += 1

    return X, y, rou_0 + rou_1, rou_0, rou_1
Example #22
    for ratio in ratioList:
        clf = RandomForestClassifier()
        clfNoise = LearningWithNoisyLabels(clf=RandomForestClassifier())
        newTrainX = trainX
        newTrainY = copy.deepcopy(trainY)
        for i in range(len(newTrainX)):
            if (random.random() < ratio):
                while True:
                    noiseLabel = random.randint(1, 4) - 1
                    # print('trainY[i] : ', newTrainY[i], 'noiseLabel :',noiseLabel)
                    if newTrainY[i] != noiseLabel:
                        newTrainY[i] = noiseLabel
                        break

        clf.fit(newTrainX, newTrainY)
        clfNoise.fit(newTrainX, newTrainY)
        # importances = clf.feature_importances_
        # indices = np.argsort(importances)[::-1]
        # for f in range(trainX.shape[1]):
        #     print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

        scores = getResult(
            testX, testY, clf,
            'plain-' + str(ratio))  # accscore, aucscore, recallscore, f1score
        noiseScores = getResult(testX, testY, clfNoise,
                                'confident-learning-' + str(ratio))
        scoreList.append(scores)
        noiseScoreList.append(noiseScores)
    scoreList = np.array(scoreList)
    noiseScoreList = np.array(noiseScoreList)
    titleList = ['accscore', 'aucscore', 'recallscore', 'f1score']
    for i in range(len(scoreList[0])):
Example #23
# In[19]:

print('WITHOUT confident learning,', end=" ")
clf = logreg()
_ = clf.fit(X_train, s)
pred = clf.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print("\nNow we show the improvement using confident learning to characterize the noise")
print("and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (noise matrix given),', end=" ")
_ = rp.fit(X_train, s, noise_matrix=noise_matrix)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (noise / inverse noise matrix given),', end=" ")
_ = rp.fit(X_train, s, noise_matrix=noise_matrix,
           inverse_noise_matrix=compute_inv_noise_matrix(py, noise_matrix))
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (using latent noise matrix estimation),', end=" ")
rp = LearningWithNoisyLabels(clf=logreg(), seed=seed,
                             prune_count_method='inverse_nm_dot_s')
_ = rp.fit(X_train, s)
pred = rp.predict(X_test)
print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))

print('WITH confident learning (using calibrated confident joint),', end=" ")
Example #24
        # Create three copies of the classifier.
        # perf_label_clf - will be trained on the hidden, noise-free labels
        # noisy_clf - will be trained on the noisy labels
        # noisy_clf_w_rp - will be trained on the noisy labels using LearningWithNoisyLabels

        clfs = [copy.deepcopy(clf) for i in range(len(experiments))]
        perf_label_clf, noisy_clf, noisy_clf_w_rp = clfs
        # Classifier (trained without label errors)
        perf_label_clf.fit(X_train, y_train)
        perf_label_score = perf_label_clf.score(X_test, y_test)
        # Classifier (trained with label errors)
        noisy_clf.fit(X_train, y_train_w_errors)
        noisy_score = noisy_clf.score(X_test, y_test)
        # Classifier + RP (trained with label errors)
        rp = LearningWithNoisyLabels(noisy_clf_w_rp)
        rp.fit(X_train, y_train_w_errors)
        noisy_score_w_rp = rp.clf.score(X_test, y_test)

        # Store results for each classifier in a dict with key = clf_name.
        clf_results[name] = {
            'clfs': clfs,
            "perf_label_score": perf_label_score,
            "noisy_score": noisy_score,
            "noisy_score_w_rp": noisy_score_w_rp,
        }

    results.append({
        "X": X,
        "X_train": X_train,
        "y_train": y_train,
        "y_train_w_errors": y_train_w_errors,
Example #25
def denoiseA(data_cor, rho, mode):
    '''
    Denoise the corrupted sensitive attribute using RankPrune.
    '''

    rho_a_plus, rho_a_minus = rho

    dataX = data_cor[0]
    cor_dataA = data_cor[2]
    # dataA = data_cor[5]
    #
    # auc3, auc4 = None, None

    noise_matrix = np.array([[1 - rho_a_minus, rho_a_plus],
                             [rho_a_minus, 1 - rho_a_plus]])
    # noise_matrix = None

    lnl = LearningWithNoisyLabels(clf=LogisticRegression(
        random_state=0, solver='lbfgs', multi_class='auto'))
    lnl.fit(X=dataX.values, s=cor_dataA.values, noise_matrix=noise_matrix)

    # Logistic Regression Baseline
    # lnl = clf=LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
    # lnl.fit(X = dataX.values, y = cor_dataA.values)

    denoised_dataA = pd.Series(lnl.predict(dataX.values))
    data_denoised = copy.deepcopy(data_cor)
    data_denoised[2] = denoised_dataA

    # print(lnl.noise_matrix, rho_a_plus, rho_a_minus)

    # Check recovery accuracy
    # auc1 = np.mean(dataA.values==cor_dataA.values)
    # auc2 = np.mean(dataA.values==denoised_dataA.values)

    # The following is under development.
    rho_est = None
    data_denoised_est = None

    if mode == 'six':

        lnl2 = LearningWithNoisyLabels(
            LogisticRegression(random_state=0,
                               solver='lbfgs',
                               multi_class='auto'))
        lnl2.fit(X=dataX.values, s=cor_dataA.values)

        denoised_dataA_est = pd.Series(lnl2.predict(dataX.values))
        data_denoised_est = copy.deepcopy(data_cor)
        data_denoised_est[2] = denoised_dataA_est

        rho_a_plus_est = lnl2.noise_matrix[0][1]
        rho_a_minus_est = lnl2.noise_matrix[1][0]
        rho_est = [rho_a_plus_est, rho_a_minus_est]

        # print(lnl2.noise_matrix, rho_a_plus_est, rho_a_minus_est)

        # lnl3 = LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
        # lnl3.fit(dataX.values, cor_dataA.values)

        # pred_dataA = pd.Series(lnl3.predict(dataX.values))
        # auc3 = np.mean(dataA.values==denoised_dataA_est.values)
        # auc4 = np.mean(dataA.values==pred_dataA.values)

    # print('auc:', auc1, auc2, auc3, auc4)

    return data_denoised, data_denoised_est, rho_est
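# The 2x2 noise_matrix built in denoiseA follows cleanlab's column-stochastic
# convention: column j holds P(observed = i | true = j), so each column sums
# to 1. rho_a_minus = P(s=1 | a=0) flips negatives; rho_a_plus = P(s=0 | a=1)
# flips positives. A quick sanity check with illustrative rates:
import numpy as np

rho_plus_demo, rho_minus_demo = 0.2, 0.1
nm_demo = np.array([[1 - rho_minus_demo, rho_plus_demo],
                    [rho_minus_demo, 1 - rho_plus_demo]])
assert np.allclose(nm_demo.sum(axis=0), 1.0)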
Example #26
print('The actual, latent, underlying noise matrix.')
print_noise_matrix(noise_matrix)
print('Our estimate of the noise matrix.')
print_noise_matrix(est_noise_matrix)
print()
print('The actual, latent, underlying joint distribution matrix.')
cleanlab.util.print_joint_matrix(true_joint_distribution_of_label_errors)
print('Our estimate of the joint distribution matrix.')
cleanlab.util.print_joint_matrix(est_joint)
print("Accuracy Comparison")
print("-------------------")
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
baseline_score = accuracy_score(y_test, clf.fit(X_train, s).predict(X_test))
print("Logistic regression:", baseline_score)
rp = LearningWithNoisyLabels(seed=seed)
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
print("Logistic regression (+rankpruning):", rp_score)
diff = rp_score - baseline_score
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
print(
    'Fit on denoised data without re-weighting:',
    accuracy_score(
        y_test,
        clf.fit(X_train[~idx_errors], s[~idx_errors]).predict(X_test)))

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt

    print("\n\n\n\n\n\n")
Example #27
        parameter.return_path('GodClass', p[-1]))
    import random
    from multiprocessing import cpu_count

    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
    from cleanlab.classification import LearningWithNoisyLabels
    index = pd.read_csv(parameter.return_path(
        'GodClass', p[-1])).columns.values[0].split('.')
    df = pd.read_csv(parameter.return_path('GodClass', p[-1]),
                     sep=',',
                     skiprows=1,
                     header=None,
                     names=index,
                     quoting=3).drop(columns='name').replace('(.*)"(.*)',
                                                             r'\1\2',
                                                             regex=True)
    model = GaussianNB()
    clf = LearningWithNoisyLabels(clf=model, seed=1, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    # X_with_noise = df[:][clf.noise_mask]
    print(type(clf.noise_mask))
    print(np.where(clf.noise_mask))
    print('\n\n')
    model = LogisticRegression(multi_class='auto')
    clf = LearningWithNoisyLabels(clf=model, seed=1, n_jobs=cpu_count())
    clf.fit(X_train, y_train)
    # X_with_noise2 = df[:][clf.noise_mask]
    print(np.where(clf.noise_mask))