Esempio n. 1
0
def test_main_pipeline(
    verbose = False,
    n = 10,
    valid_noise_matrix = True,
    frac_zero_noise_rates = 0,
):
    """End-to-end check of noise_generation: build a noise matrix from a
    target trace, verify its stochastic properties, and draw noisy labels.

    Parameters
    ----------
    verbose : bool - passed through to the noise_generation helpers.
    n : int - number of labels synthesized from the class prior ``py``.
    valid_noise_matrix : bool - request a matrix whose diagonal dominates.
    frac_zero_noise_rates : float - fraction of off-diagonal rates forced to 0.
    """
    trace = 1.5
    py = [0.1, 0.1, 0.2, 0.6]  # ground-truth class prior p(y=k)
    K = len(py)
    # Exact label counts per class: int(p * n) copies of each class index.
    y = [z for i, p in enumerate(py) for z in [i] * int(p * n)]
    nm = noise_generation.generate_noise_matrix_from_trace(
        K = K,
        trace = trace,
        py = py,
        seed = 0,
        valid_noise_matrix = valid_noise_matrix,
        frac_zero_noise_rates = frac_zero_noise_rates,
    )
    # Check that trace is what it's supposed to be.
    # BUGFIX: the closing paren was misplaced so abs() wrapped a boolean
    # comparison, making the assertion vacuous (abs(True/False) is 0 or 1).
    assert(abs(trace - np.trace(nm)) < 1e-2)
    # Check that sum of probabilities is K
    assert(abs(nm.sum() - K) < 1e-4)
    # Check that sum of each column is 1
    assert(all(abs(nm.sum(axis = 0) - 1) < 1e-4))
    # Check that joint sums to 1. (same misplaced-paren fix as above)
    assert(abs(np.sum(nm * py) - 1) < 1e-4)
    s = noise_generation.generate_noisy_labels(y, nm, verbose)
    assert(noise_generation.noise_matrix_is_valid(nm, py, verbose))
Esempio n. 2
0
def make_data(
        means=None,
        covs=None,
        sizes=None,
        avg_trace=0.8,
        seed=0,  # set to None for non-reproducible randomness
):
    """Generate a multi-class 2-D Gaussian dataset with noisy labels.

    Draws one train and one test sample per class from multivariate
    normals, builds a noise matrix whose trace is ``avg_trace * K``, and
    corrupts the training labels through it.

    Parameters
    ----------
    means : list of per-class mean vectors (default: 3 classes in 2-D).
    covs : list of per-class covariance matrices.
    sizes : list of per-class sample counts (default: [800, 400, 400]).
    avg_trace : float - average of the noise matrix diagonal
        (higher means cleaner labels).
    seed : int or None - numpy RNG seed; None for non-reproducible runs.

    Returns
    -------
    dict with train/test splits, noisy labels ``s``, the empirical priors
    ``ps`` (noisy) and ``py`` (clean), and the generating ``noise_matrix``.
    """
    # BUGFIX: mutable list defaults replaced with None sentinels so the
    # default objects are not shared (and mutable) across calls.
    if means is None:
        means = [[3, 2], [7, 7], [0, 8]]
    if covs is None:
        covs = [[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]]
    if sizes is None:
        sizes = [800, 400, 400]

    np.random.seed(seed=seed)

    K = len(means)  # number of classes
    data = []
    labels = []
    test_data = []
    test_labels = []

    for idx in range(K):
        data.append(
            np.random.multivariate_normal(mean=means[idx],
                                          cov=covs[idx],
                                          size=sizes[idx]))
        test_data.append(
            np.random.multivariate_normal(mean=means[idx],
                                          cov=covs[idx],
                                          size=sizes[idx]))
        labels.append(np.array([idx for i in range(sizes[idx])]))
        test_labels.append(np.array([idx for i in range(sizes[idx])]))
    X_train = np.vstack(data)
    y_train = np.hstack(labels)
    X_test = np.vstack(test_data)
    y_test = np.hstack(test_labels)

    # Compute p(y=k), the empirical class prior of the clean labels.
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        K,
        trace=avg_trace * K,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Generate our noisy labels using the noise_matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
    }
def add_outlier(y_train):
    """Corrupt binary labels ``y_train`` through a random near-identity
    noise matrix (trace 1.95 out of 2) and report the resulting error rate.

    Parameters
    ----------
    y_train : array-like of int - clean labels (assumes 2 classes, since
        K=2 is hard-coded below — TODO confirm against callers).

    Returns
    -------
    (y_train_corrupted, percent_errors) - the corrupted labels and the
    integer percentage of labels that were flipped.
    """
    seed = random.randint(0, 1000)
    classes, counts = np.unique(y_train, return_counts=True)
    # BUGFIX: np.unique(..., return_counts=True) yields raw class COUNTS,
    # but every other use of ``py`` in this file is a normalized prior
    # (np.bincount(y) / len(y)), so normalize before passing it along.
    py = counts / counts.sum()
    noise_matrix = generate_noise_matrix_from_trace(2,
                                                    1.95,
                                                    min_trace_prob=0.15,
                                                    py=py,
                                                    seed=seed)
    np.random.seed(seed)
    y_train_corrupted = generate_noisy_labels(y_train, noise_matrix)
    # Count how many labels were actually flipped by the noisy channel.
    y_train_is_error = y_train_corrupted != y_train
    n = y_train_is_error.sum()
    return y_train_corrupted, int(n / len(y_train) * 100)
Esempio n. 4
0
]:
    print("\n", "=" * len(name), "\n", name, '\n', "=" * len(name))
    np.random.seed(seed=0)
    clf_copy = copy.deepcopy(clf)
    # Compute p(y=k), the ground truth class prior on the labels.
    py = np.bincount(y_train) / float(len(y_train))
    # Generate the noisy channel to characterize the label errors.
    noise_matrix = generate_noise_matrix_from_trace(
        K=num_classes,
        trace=num_classes * avg_trace,
        py=py,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    print_noise_matrix(noise_matrix)
    # Create the noisy labels. This method is exact w.r.t. the noise_matrix.
    y_train_with_errors = generate_noisy_labels(y_train, noise_matrix)
    lnl_cv = GridSearch(
        model=LearningWithNoisyLabels(clf),
        param_grid=param_grid,
        num_threads=4,
        seed=0,
    )
    lnl_cv.fit(
        X_train=X_train,
        y_train=y_train_with_errors,
        X_val=X_val,
        y_val=y_val,
        verbose=False,
    )
    # Also compute the test score with default parameters
    clf_copy.fit(X_train, y_train_with_errors)
Esempio n. 5
0
def make_data(
        sparse=False,
        means=None,
        covs=None,
        sizes=None,
        avg_trace=0.8,
        seed=1,  # set to None for non-reproducible randomness
):
    """Generate a multi-class 2-D Gaussian dataset with noisy labels plus
    the latent quantities estimated from them (priors, noise matrices, psx).

    Parameters
    ----------
    sparse : bool - if True, convert X_train/X_test to CSR sparse matrices.
    means : list of per-class mean vectors (default: 3 classes in 2-D).
    covs : list of per-class covariance matrices.
    sizes : list of per-class sample counts (default: [80, 40, 40]).
    avg_trace : float - average diagonal of the generated noise matrix.
    seed : int or None - numpy RNG seed; None for non-reproducible runs.

    Returns
    -------
    dict with the data splits, true noise quantities (noise_matrix,
    inverse_noise_matrix, ps, py) and the cross-validated estimates from
    latent_estimation (est_py, est_nm, est_inv, cj, psx), plus m and n.
    """
    # BUGFIX: mutable list defaults replaced with None sentinels so the
    # default objects are not shared (and mutable) across calls.
    if means is None:
        means = [[3, 2], [7, 7], [0, 8]]
    if covs is None:
        covs = [[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]]
    if sizes is None:
        sizes = [80, 40, 40]

    np.random.seed(seed=seed)

    m = len(means)  # number of classes
    n = sum(sizes)
    data = []
    labels = []
    test_data = []
    test_labels = []

    for idx in range(m):
        data.append(
            np.random.multivariate_normal(mean=means[idx],
                                          cov=covs[idx],
                                          size=sizes[idx]))
        test_data.append(
            np.random.multivariate_normal(mean=means[idx],
                                          cov=covs[idx],
                                          size=sizes[idx]))
        labels.append(np.array([idx for i in range(sizes[idx])]))
        test_labels.append(np.array([idx for i in range(sizes[idx])]))
    X_train = np.vstack(data)
    y_train = np.hstack(labels)
    X_test = np.vstack(test_data)
    y_test = np.hstack(test_labels)

    if sparse:
        X_train = scipy.sparse.csr_matrix(X_train)
        X_test = scipy.sparse.csr_matrix(X_test)

    # Compute p(y=k), the empirical class prior of the clean labels.
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        m,
        trace=avg_trace * m,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Generate our noisy labels using the noise_matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    # Compute inverse noise matrix
    inv = compute_inv_noise_matrix(py, noise_matrix, ps)

    # Estimate psx (out-of-sample predicted probabilities) via 3-fold CV.
    latent = latent_estimation.estimate_py_noise_matrices_and_cv_pred_proba(
        X=X_train,
        s=s,
        cv_n_folds=3,
    )

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
        "inverse_noise_matrix": inv,
        "est_py": latent[0],
        "est_nm": latent[1],
        "est_inv": latent[2],
        "cj": latent[3],
        "psx": latent[4],
        "m": m,
        "n": n,
    }
Esempio n. 6
0
except Exception as e:
    print(e)
    print("Plotting is only supported in an iPython interface.")

# In[3]:

# Generate lots of noise.
# Hand-crafted noise channel p(s=i | y=j): each COLUMN sums to 1, so the
# matrix is column-stochastic. Here class 1 absorbs half of the mass of
# classes 0 and 2, while class 1 itself is never flipped.
noise_matrix = np.array([
    [0.5, 0.0, 0.0],
    [0.5, 1.0, 0.5],
    [0.0, 0.0, 0.5],
])

# Class distribution of the clean labels.
# NOTE(review): presumably value_counts returns per-class counts or
# fractions of y_train — confirm against its definition (not visible here).
py = value_counts(y_train)
# Create noisy labels
s = generate_noisy_labels(y_train, noise_matrix)

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt

    _ = plt.figure(figsize=(15, 8))
    color_list = plt.cm.tab10(np.linspace(0, 1, 6))
    for k in range(len(np.unique(y_train))):
        X_k = X_train[y_train == k]  # data for class k
        _ = plt.scatter(
            X_k[:, 1],
            X_k[:, 3],
            color=[color_list[noisy_label] for noisy_label in s[y_train == k]],
            s=200,
            marker=r"${a}$".format(a=str(k)),
            nm = noise_generation.generate_noise_matrix_from_trace(
                K=K,
                trace=int(K * (1 - noise_amount)),
                valid_noise_matrix=False,
                frac_zero_noise_rates=frac_zero_noise_rates,
                seed=0,
            )

            # noise matrix is valid if diagonal maximizes row and column
            valid = all((nm.argmax(axis=0) == range(K))
                        & (nm.argmax(axis=1) == range(K)))
            print('valid:', valid)

            # Create noisy labels
            np.random.seed(seed=0)
            s = noise_generation.generate_noisy_labels(y, nm)

            # Check accuracy of s and y
            print('Accuracy of s and y:', sum(s == y) / len(s))

            # Create map of filenames to noisy labels
            d = dict(
                zip([i for i, j in train_dataset.imgs], [int(i) for i in s]))

            # Store dictionary as json
            wfn_base = '{}_noisy_labels__frac_zero_noise_rates__{}__noise_amount__{}'.format(
                cifar_dataset,
                "0.0" if frac_zero_noise_rates < 1e-4 else round(
                    frac_zero_noise_rates, 1),
                "0.0" if noise_amount < 1e-4 else round(noise_amount, 1),
            )
Esempio n. 8
0
    y_pseudo = np.hstack([y_train_pseudo, y_test_psuedo])
    X_for_pseudo = sp.vstack([X_train, X_test])

    # pseudo込の全データでtrain
    model.fit(X_for_pseudo, y_pseudo)

    return model.score(X_test, y_test)


# cross validation
cv = KFold(4, shuffle=True, random_state=seed)
result = defaultdict(lambda: [])
for i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    y_train_corrupted = generate_noisy_labels(y_train, noise_matrix)

    result["ML:clean"].append(normal_learning(
        X_train, y_train, X_test, y_test))
    result["ML:noisy"].append(
        normal_learning(X_train, y_train_corrupted, X_test, y_test)
    )
    clclf_trained = ret_trainedCLclass(
        X_train, y_train_corrupted, X_test, y_test)
    result["CL:wituout noisy labels"].append(
        train_without_noisy_labels(
            X_train, y_train_corrupted, X_test, y_test, clf=clclf_trained)
    )
    result["CL:pseudo for noisy labels"].append(
        train_noisy_to_pseudo(X_train, y_train_corrupted,
                              X_test, y_test, clf=clclf_trained)