コード例 #1
0
def test_trace_less_than_1_error(trace=0.5):
    """Check that requesting a noise matrix with ``trace`` < 1 is rejected.

    Calls ``noise_generation.generate_noise_matrix_from_trace(3, trace)`` and
    requires it to raise ``ValueError`` whose message contains ``'trace > 1'``.

    Parameters
    ----------
    trace : float
        Trace passed to the generator; defaults to 0.5.
    """
    # pytest.raises fails the test when nothing is raised. The earlier
    # try/except form only ran its assertions inside the except branch, so
    # the test passed silently if the call stopped raising.
    with pytest.raises(ValueError) as excinfo:
        noise_generation.generate_noise_matrix_from_trace(3, trace)
    assert 'trace > 1' in str(excinfo.value)
コード例 #2
0
def test_one_class_error():
    """Check that asking for a single-class (K=1) noise matrix is rejected.

    With ``K=1, trace=2`` the call must raise ``ValueError`` whose message
    contains ``'must be >= 2'``. With ``K=1, trace=1`` the call must also
    raise ``ValueError`` (the message is not checked for that case).
    """
    # pytest.raises fails the test when nothing is raised. The earlier
    # try/except form skipped every assertion if the first call succeeded.
    with pytest.raises(ValueError) as excinfo:
        noise_generation.generate_noise_matrix_from_trace(
            K=1,
            trace=2,
        )
    assert 'must be >= 2' in str(excinfo.value)

    with pytest.raises(ValueError):
        noise_generation.generate_noise_matrix_from_trace(
            K=1,
            trace=1,
        )
コード例 #3
0
def test_valid_no_py_error():
    """Check that a valid noise matrix cannot be requested without ``py``.

    Calls the generator with ``K=3, trace=2, valid_noise_matrix=True`` and no
    ``py`` argument, and requires a ``ValueError`` whose message contains
    ``'py must be'``.
    """
    # pytest.raises fails the test when nothing is raised. The earlier
    # try/except form passed silently in that case, and its second call was
    # an exact duplicate of the first.
    with pytest.raises(ValueError) as excinfo:
        noise_generation.generate_noise_matrix_from_trace(
            K=3,
            trace=2,
            valid_noise_matrix=True,
        )
    assert 'py must be' in str(excinfo.value)
コード例 #4
0
def test_main_pipeline(
    verbose=False,
    n=10,
    valid_noise_matrix=True,
    frac_zero_noise_rates=0,
):
    """Generate a 4-class noise matrix and check its basic invariants.

    Parameters
    ----------
    verbose : bool
        Forwarded to ``generate_noisy_labels`` and ``noise_matrix_is_valid``.
    n : int
        Scale used to build the label list: class ``i`` appears
        ``int(py[i] * n)`` times.
    valid_noise_matrix : bool
        Forwarded to ``generate_noise_matrix_from_trace``.
    frac_zero_noise_rates : float
        Forwarded to ``generate_noise_matrix_from_trace``.
    """
    trace = 1.5
    py = [0.1, 0.1, 0.2, 0.6]
    K = len(py)
    # Repeat each class index int(p * n) times to form the true labels.
    y = [z for i, p in enumerate(py) for z in [i] * int(p * n)]
    nm = noise_generation.generate_noise_matrix_from_trace(
        K=K,
        trace=trace,
        py=py,
        seed=0,
        valid_noise_matrix=valid_noise_matrix,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    # Check that the trace is what it is supposed to be. The comparison must
    # sit outside abs(); abs(a - b < tol) only takes the absolute value of a
    # boolean and lets an overshoot of any size pass.
    assert abs(trace - np.trace(nm)) < 1e-2
    # Check that sum of probabilities is K
    assert abs(nm.sum() - K) < 1e-4
    # Check that sum of each column is 1
    assert all(abs(nm.sum(axis=0) - 1) < 1e-4)
    # Check that joint sums to 1 (comparison outside abs(), as above).
    assert abs(np.sum(nm * py) - 1) < 1e-4
    # Exercise label generation; the result itself is not inspected.
    noise_generation.generate_noisy_labels(y, nm, verbose)
    assert noise_generation.noise_matrix_is_valid(nm, py, verbose)
コード例 #5
0
def make_data(
        means=[[3, 2], [7, 7], [0, 8]],
        covs=[[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]],
        sizes=[800, 400, 400],
        avg_trace=0.8,
        seed=0,  # set to None for non-reproducible randomness
):
    """Build a Gaussian-mixture dataset with noisy training labels.

    One multivariate normal is sampled per class, once for the training
    split and once for the test split. A noise matrix with trace
    ``avg_trace * K`` is then generated and used to corrupt the training
    labels.

    Parameters
    ----------
    means, covs, sizes : list
        Per-class mean vector, covariance matrix and sample count; the
        number of classes is ``len(means)``.
    avg_trace : float
        Multiplied by the number of classes to get the noise-matrix trace.
    seed : int or None
        Seeds ``np.random`` and is forwarded to the noise-matrix generator.

    Returns
    -------
    dict
        Keys ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``s`` (noisy
        training labels), ``ps`` (noisy label frequencies), ``py`` (true
        label frequencies) and ``noise_matrix``.
    """
    np.random.seed(seed=seed)

    num_classes = len(means)
    train_chunks, test_chunks = [], []
    train_label_chunks, test_label_chunks = [], []

    for k in range(num_classes):
        # Train is drawn before test for every class so the random stream is
        # consumed in a fixed order.
        train_chunks.append(
            np.random.multivariate_normal(
                mean=means[k], cov=covs[k], size=sizes[k]))
        test_chunks.append(
            np.random.multivariate_normal(
                mean=means[k], cov=covs[k], size=sizes[k]))
        train_label_chunks.append(np.array([k] * sizes[k]))
        test_label_chunks.append(np.array([k] * sizes[k]))

    X_train, X_test = np.vstack(train_chunks), np.vstack(test_chunks)
    y_train = np.hstack(train_label_chunks)
    y_test = np.hstack(test_label_chunks)

    # Empirical class prior p(y=k) of the clean training labels.
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        num_classes,
        trace=avg_trace * num_classes,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Corrupt the training labels according to the noise matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    result = {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
    }
    return result
コード例 #6
0
def test_two_class_nofraczero():
    """Check a 2-class noise matrix generated with default zero-rate settings.

    The matrix must contain no exact zeros and its trace must be within
    1e-2 of the requested value.
    """
    trace = 1.1
    nm = noise_generation.generate_noise_matrix_from_trace(
        K=2,
        trace=trace,
        valid_noise_matrix=True,
    )
    assert not np.any(nm == 0)  # Make sure there is not a zero noise rate.
    # The comparison must sit outside abs(); abs(a - b < tol) only takes the
    # absolute value of a boolean and lets an overshoot of any size pass.
    assert abs(trace - np.trace(nm)) < 1e-2
コード例 #7
0
def test_two_class_fraczero_high(valid=False):
    """Check a 2-class noise matrix generated with a high zero-rate fraction.

    With ``frac_zero_noise_rates=0.75`` the matrix must contain at least one
    exact zero and its trace must be within 1e-2 of the requested value.

    Parameters
    ----------
    valid : bool
        Forwarded as ``valid_noise_matrix`` to the generator.
    """
    trace = 1.8
    frac_zero_noise_rates = 0.75
    nm = noise_generation.generate_noise_matrix_from_trace(
        K=2,
        trace=trace,
        valid_noise_matrix=valid,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    assert np.any(nm == 0)  # Make sure there is a zero noise rate.
    # The comparison must sit outside abs(); abs(a - b < tol) only takes the
    # absolute value of a boolean and lets an overshoot of any size pass.
    assert abs(trace - np.trace(nm)) < 1e-2
コード例 #8
0
def test_max_iter():
    """Exercise the ``max_iter`` argument of the noise-matrix generator.

    With ``max_iter=1`` and a uniform prior the result must have the
    requested trace and ``sum(nm . py)`` must equal 1. With ``max_iter=0``
    the call is expected to compare equal to ``False``.
    """
    target_trace = 2
    num_classes = 3
    uniform_prior = [1 / float(num_classes)] * num_classes

    single_try = noise_generation.generate_noise_matrix_from_trace(
        K=num_classes,
        trace=target_trace,
        valid_noise_matrix=True,
        max_iter=1,
        py=uniform_prior,
        seed=1,
    )
    trace_error = abs(np.trace(single_try) - target_trace)
    assert trace_error < 1e-6
    marginal_error = abs(sum(np.dot(single_try, uniform_prior)) - 1)
    assert marginal_error < 1e-6

    no_tries = noise_generation.generate_noise_matrix_from_trace(
        K=3,
        trace=target_trace,
        valid_noise_matrix=True,
        py=[0.1, 0.1, 0.8],
        max_iter=0,
    )
    # Equality (not identity) comparison kept exactly as the test defines it.
    assert no_tries == False
コード例 #9
0
def add_outlier(y_train):
    """Corrupt ``y_train`` with a randomly seeded 2-class noise matrix.

    A seed is drawn from ``random.randint(0, 1000)``, used to generate a
    noise matrix with trace 1.95, and then used to seed ``np.random`` before
    the noisy labels are generated.

    Returns
    -------
    tuple
        ``(corrupted_labels, error_percent)`` where ``error_percent`` is the
        share of changed labels as a percentage, truncated to ``int``.
    """
    seed = random.randint(0, 1000)
    # NOTE(review): these are raw per-class counts, not normalised
    # probabilities, yet they are passed as `py` -- confirm this is intended.
    _, class_counts = np.unique(y_train, return_counts=True)
    noise_matrix = generate_noise_matrix_from_trace(
        2,
        1.95,
        min_trace_prob=0.15,
        py=class_counts,
        seed=seed,
    )
    np.random.seed(seed)
    y_train_corrupted = generate_noisy_labels(y_train, noise_matrix)
    num_flipped = (y_train_corrupted != y_train).sum()
    error_percent = int(num_flipped / len(y_train) * 100)
    return y_train_corrupted, error_percent
コード例 #10
0
        GaussianNB(),
    ),
    (
        "Logistic Regression",
        LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto'),
    ),
]:
    print("\n", "=" * len(name), "\n", name, '\n', "=" * len(name))
    np.random.seed(seed=0)
    clf_copy = copy.deepcopy(clf)
    # Compute p(y=k), the ground truth class prior on the labels.
    py = np.bincount(y_train) / float(len(y_train))
    # Generate the noisy channel to characterize the label errors.
    noise_matrix = generate_noise_matrix_from_trace(
        K=num_classes,
        trace=num_classes * avg_trace,
        py=py,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    print_noise_matrix(noise_matrix)
    # Create the noisy labels. This method is exact w.r.t. the noise_matrix.
    y_train_with_errors = generate_noisy_labels(y_train, noise_matrix)
    lnl_cv = GridSearch(
        model=LearningWithNoisyLabels(clf),
        param_grid=param_grid,
        num_threads=4,
        seed=0,
    )
    lnl_cv.fit(
        X_train=X_train,
        y_train=y_train_with_errors,
        X_val=X_val,
コード例 #11
0
def make_data(
        sparse=False,
        means=[[3, 2], [7, 7], [0, 8]],
        covs=[[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]],
        sizes=[80, 40, 40],
        avg_trace=0.8,
        seed=1,  # set to None for non-reproducible randomness
):
    """Build a small Gaussian-mixture dataset plus latent-estimation outputs.

    One multivariate normal is sampled per class for both the training and
    test split. The training labels are corrupted with a generated noise
    matrix, and the inverse noise matrix and the latent estimates are then
    computed from the noisy data.

    Parameters
    ----------
    sparse : bool
        When true, ``X_train`` and ``X_test`` are converted to
        ``scipy.sparse.csr_matrix``.
    means, covs, sizes : list
        Per-class mean vector, covariance matrix and sample count; the
        number of classes is ``len(means)``.
    avg_trace : float
        Multiplied by the number of classes to get the noise-matrix trace.
    seed : int or None
        Seeds ``np.random`` and is forwarded to the noise-matrix generator.

    Returns
    -------
    dict
        The data splits, noisy labels ``s``, frequencies ``ps`` / ``py``,
        ``noise_matrix``, ``inverse_noise_matrix``, the first five items
        returned by ``estimate_py_noise_matrices_and_cv_pred_proba`` (stored
        as ``est_py``, ``est_nm``, ``est_inv``, ``cj``, ``psx``), and the
        class count ``m`` and total sample count ``n``.
    """
    np.random.seed(seed=seed)

    m = len(means)  # number of classes
    n = sum(sizes)

    def _sample(k):
        # One draw from class k's Gaussian; advances the global random state.
        return np.random.multivariate_normal(
            mean=means[k], cov=covs[k], size=sizes[k])

    train_chunks, test_chunks = [], []
    train_label_chunks, test_label_chunks = [], []
    for k in range(m):
        # Train is drawn before test for every class so the random stream is
        # consumed in a fixed order.
        train_chunks.append(_sample(k))
        test_chunks.append(_sample(k))
        train_label_chunks.append(np.array([k] * sizes[k]))
        test_label_chunks.append(np.array([k] * sizes[k]))

    X_train, X_test = np.vstack(train_chunks), np.vstack(test_chunks)
    y_train = np.hstack(train_label_chunks)
    y_test = np.hstack(test_label_chunks)

    if sparse:
        X_train, X_test = (
            scipy.sparse.csr_matrix(X_train),
            scipy.sparse.csr_matrix(X_test),
        )

    # Empirical class prior p(y=k) of the clean training labels.
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        m,
        trace=avg_trace * m,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Corrupt the training labels according to the noise matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    # Inverse noise matrix from the prior, the noise matrix and ps.
    inv = compute_inv_noise_matrix(py, noise_matrix, ps)

    # Latent estimates computed from the features and noisy labels.
    latent = latent_estimation.estimate_py_noise_matrices_and_cv_pred_proba(
        X=X_train,
        s=s,
        cv_n_folds=3,
    )

    result = {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
        "inverse_noise_matrix": inv,
    }
    result["est_py"] = latent[0]
    result["est_nm"] = latent[1]
    result["est_inv"] = latent[2]
    result["cj"] = latent[3]
    result["psx"] = latent[4]
    result["m"] = m
    result["n"] = n
    return result
コード例 #12
0
# Candidate class priors p(y): a hand-picked set followed by linearly
# increasing priors for K = 2..13 classes.
pys = [[.01, .39, .6], [0.2] * 5, [.1, .3, .6], [.3, .3, .4], [.2, .6, .2],
       [.1, .2, .7], [.1, .15, .15, .6], [.1, .1, .2, .6],
       [1. / 3, 1. / 3, 1. / 3], [.02, .05, .08, .15, .15, .25, .3],
       [.05, .1, .15, .35, 0.35]]
pys = pys + [
    np.arange(1, k + 1, dtype=float) / sum(range(k + 1)) for k in range(2, 14)
]
# `pys` is ragged (priors of different lengths), and np.random.choice needs a
# 1-D array; recent NumPy refuses to build one from ragged input. Draw
# indices into `pys` instead, which selects priors the same way.
for py_index in np.random.choice(len(pys), number_of_polyplices_to_display):
    py = pys[py_index]
    ax = draw_polyplex(py)

    K = len(py)
    # Sweep the requested trace over (0, K) in steps of K / 33.
    for trace in np.arange(K / 33., K, K / 33.):
        #         py = np.arange(1,K+1, dtype=float) / sum(range(K+1))
        nm = generate_noise_matrix_from_trace(K,
                                              trace,
                                              valid_noise_matrix=False,
                                              py=py)
        valid = noise_matrix_is_valid(nm, py)
        joint_trace = np.trace(nm * py)
        #         print('ps is', (nm*py).sum(axis=1))
        # Mark each point 'v' (valid) or 'n' (not valid) at
        # (trace, joint trace).
        _ = ax.text(trace,
                    joint_trace,
                    s='v' if valid else 'n',
                    color='red',
                    size=30)

#     for z in np.arange(100):
# #         py = np.arange(1,K+1, dtype=float) / sum(range(K+1))
#         nm = generate_noise_matrix_from_trace(K, nm_avg_trace, valid_noise_matrix=True, py=py)
#         joint_trace = np.trace(nm*py)
# #         print('ps is', (nm*py).sum(axis=1))
コード例 #13
0
                                                      y_train,
                                                      test_size=.25,
                                                      random_state=1)
    num_classes = len(np.unique(y_train))
    print('Running dataset', ds_cnt + 1, 'with m =', num_classes,
          'classes and n =', len(X_train), 'training examples.')

    # CONFIDENT LEARNING COMPONENT

    np.random.seed(seed=0)

    py = np.bincount(y_train) / float(len(y_train))
    # Generate the noisy channel to characterize the label errors.
    noise_matrix = generate_noise_matrix_from_trace(
        K=num_classes,
        trace=num_classes * avg_trace,
        py=py,
        frac_zero_noise_rates=FRAC_ZERO_NOISE_RATES,
    )
    print_noise_matrix(noise_matrix)
    np.random.seed(seed=1)
    # Create the noisy labels. This method is exact w.r.t. the noise_matrix.
    y_train_w_errors = generate_noisy_labels(y_train, noise_matrix)

    clf_results = {}
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        # Create four copies of the classifier.
        # perf_label_clf - Will be trained on the hidden, noise-free labels
        # noisy_clf - Will be trained on the noisy labels
        # noisy_clf_w_rp - Will be trained on the noisy labels using LearningWithNoisyLabels
コード例 #14
0
        multivariate_normal(mean=means[idx], cov=covs[idx], size=sizes[idx]))
    test_data.append(
        multivariate_normal(mean=means[idx], cov=covs[idx], size=sizes[idx]))
    labels.append(np.array([idx for i in range(sizes[idx])]))
    test_labels.append(np.array([idx for i in range(sizes[idx])]))
X_train = np.vstack(data)
y_train = np.hstack(labels)
X_test = np.vstack(test_data)
y_test = np.hstack(test_labels)

# Compute p(y=k)
py = np.bincount(y_train) / float(len(y_train))

noise_matrix = generate_noise_matrix_from_trace(
    K,
    trace=1.5,
    py=py,
    valid_noise_matrix=True,
)

# Generate our noisy labels using the noise_matrix.
s = generate_noisy_labels(y_train, noise_matrix)
ps = np.bincount(s) / float(len(s))

confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(X_train,
                                                                  s,
                                                                  seed=seed)
est_py, est_noise_matrix, est_inverse_noise_matrix = estimate_latent(
    confident_joint, s)
idx_errors = get_noise_indices(s, psx)

# #### To show off the power of **cleanlab**, we've chosen an example of multiclass learning with noisy labels in which over 50% of the training labels are wrong.
コード例 #15
0
                                 (0.2023, 0.1994, 0.2010)),
        ]),
    )
    y = train_dataset.targets
    K = int(cifar_dataset[5:])
    print(cifar_dataset, np.bincount(y))
    for frac_zero_noise_rates in np.arange(0, 0.8, 0.2):
        for noise_amount in np.arange(0, 1, 0.2):
            print('noise_amount', round(noise_amount, 1),
                  '| frac_zero_noise_rates', round(frac_zero_noise_rates, 1))

            # Generate class-conditional noise
            nm = noise_generation.generate_noise_matrix_from_trace(
                K=K,
                trace=int(K * (1 - noise_amount)),
                valid_noise_matrix=False,
                frac_zero_noise_rates=frac_zero_noise_rates,
                seed=0,
            )

            # noise matrix is valid if diagonal maximizes row and column
            valid = all((nm.argmax(axis=0) == range(K))
                        & (nm.argmax(axis=1) == range(K)))
            print('valid:', valid)

            # Create noisy labels
            np.random.seed(seed=0)
            s = noise_generation.generate_noisy_labels(y, nm)

            # Check accuracy of s and y
            print('Accuracy of s and y:', sum(s == y) / len(s))
コード例 #16
0
ファイル: experiments.py プロジェクト: masakiaota/blog
# Select single-label samples, compute per-class label counts, and generate a
# 4-class noise matrix from them.
print(
    "target names", target_names
)  # C→corporate industrial, E→economics, G→government, M→not sure... Market?
mask_row = (
    data.target[:, mask_col].toarray().sum(axis=1) == 1
)  # drop samples that have multiple classes assigned
y = data.target[mask_row][:, mask_col]
X = data.data[mask_row]
# NOTE(review): these are raw per-class counts, not normalised probabilities,
# yet they are passed as `py` below -- confirm this is intended.
py = y.toarray().sum(axis=0).reshape(-1)  # number of given labels per class
print("samples", X.shape[0], "category value counts", py)
y = np.array(y.argmax(axis=1)).reshape(-1)  # one-hot to num


# generate noise matrix
noise_matrix = generate_noise_matrix_from_trace(
    4, 3, min_trace_prob=0.6, frac_zero_noise_rates=0.5, py=py, seed=seed,
)
print("p(given=i|true=j) =")
print(noise_matrix)
"""
p(given=i|true=j) =
[[0.68936167 0.         0.         0.        ]
 [0.2387445  0.85410683 0.21184431 0.05112328]
 [0.         0.14589317 0.78815569 0.28050091]
 [0.07189383 0.         0.         0.66837581]]
"""
# define base Classifier
baseclf = LogisticRegression
params = {
    "solver": "liblinear",
    "multi_class": "auto",