def test_trace_less_than_1_error(trace=0.5):
    """Check that the given trace is rejected with a descriptive ValueError.

    Parameters
    ----------
    trace : float
        Trace forwarded to ``generate_noise_matrix_from_trace`` together with
        ``K = 3``. The default (0.5) is expected to be refused.
    """
    # One call verifies both the exception type and its message, instead of
    # invoking the generator twice (once in try/except, once in pytest.raises).
    with pytest.raises(ValueError) as excinfo:
        noise_generation.generate_noise_matrix_from_trace(3, trace)
    assert 'trace > 1' in str(excinfo.value)
def test_one_class_error():
    """Check that asking for a noise matrix with ``K = 1`` raises ValueError.

    Two parameter combinations are exercised: ``trace = 2`` (the message must
    contain ``'must be >= 2'``) and ``trace = 1`` (only the exception type is
    checked).
    """
    # Previously this case lived in a bare try/except, so the test passed
    # silently when no exception was raised at all.
    with pytest.raises(ValueError) as excinfo:
        noise_generation.generate_noise_matrix_from_trace(
            K=1,
            trace=2,
        )
    assert 'must be >= 2' in str(excinfo.value)

    with pytest.raises(ValueError):
        noise_generation.generate_noise_matrix_from_trace(
            K=1,
            trace=1,
        )
def test_valid_no_py_error():
    """Check that requesting a valid 3-class noise matrix without ``py`` fails.

    The call must raise ValueError and the message must contain
    ``'py must be'``.
    """
    # One call verifies both the exception type and its message, instead of
    # invoking the generator twice with identical arguments.
    with pytest.raises(ValueError) as excinfo:
        noise_generation.generate_noise_matrix_from_trace(
            K=3,
            trace=2,
            valid_noise_matrix=True,
        )
    assert 'py must be' in str(excinfo.value)
def test_main_pipeline(
    verbose=False,
    n=10,
    valid_noise_matrix=True,
    frac_zero_noise_rates=0,
):
    """Generate a noise matrix for a fixed 4-class prior and check its invariants.

    Parameters
    ----------
    verbose : bool
        Forwarded to ``generate_noisy_labels`` and ``noise_matrix_is_valid``.
    n : int
        Scales the label list: class ``i`` appears ``int(py[i] * n)`` times.
    valid_noise_matrix : bool
        Forwarded to ``generate_noise_matrix_from_trace``.
    frac_zero_noise_rates : float
        Forwarded to ``generate_noise_matrix_from_trace``.
    """
    trace = 1.5
    py = [0.1, 0.1, 0.2, 0.6]
    K = len(py)
    # Class index i repeated int(p * n) times, in class order.
    y = [z for i, p in enumerate(py) for z in [i] * int(p * n)]
    nm = noise_generation.generate_noise_matrix_from_trace(
        K=K,
        trace=trace,
        py=py,
        seed=0,
        valid_noise_matrix=valid_noise_matrix,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    # Check that the trace is what it is supposed to be. The comparison must
    # sit outside abs(); abs(a - b < tol) only tests one direction.
    assert abs(trace - np.trace(nm)) < 1e-2
    # Check that the sum of probabilities is K.
    assert abs(nm.sum() - K) < 1e-4
    # Check that the sum of each column is 1.
    assert all(abs(nm.sum(axis=0) - 1) < 1e-4)
    # Check that the joint sums to 1 (same abs() placement fix as above).
    assert abs(np.sum(nm * py) - 1) < 1e-4
    # Label generation is exercised for errors only; its result is not inspected.
    noise_generation.generate_noisy_labels(y, nm, verbose)
    assert noise_generation.noise_matrix_is_valid(nm, py, verbose)
def make_data(
    means=[[3, 2], [7, 7], [0, 8]],
    covs=[[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]],
    sizes=[800, 400, 400],
    avg_trace=0.8,
    seed=0,  # set to None for non-reproducible randomness
):
    """Build a synthetic multi-class Gaussian dataset with noisy training labels.

    For every class a train and a test sample are drawn from
    ``np.random.multivariate_normal`` using that class's mean, covariance and
    size. Noisy labels are then generated for the training labels from a noise
    matrix whose trace is ``avg_trace * number_of_classes``.

    Returns
    -------
    dict
        Keys: ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``s`` (noisy
        labels), ``ps`` (noisy label frequencies), ``py`` (true label
        frequencies) and ``noise_matrix``.
    """
    np.random.seed(seed=seed)
    num_classes = len(means)

    train_chunks = []
    test_chunks = []
    for cls in range(num_classes):
        class_kwargs = dict(mean=means[cls], cov=covs[cls], size=sizes[cls])
        # Train is drawn before test for each class so the random stream is
        # consumed in a fixed order.
        train_chunks.append(np.random.multivariate_normal(**class_kwargs))
        test_chunks.append(np.random.multivariate_normal(**class_kwargs))

    def _class_labels():
        # One constant-valued label array per class, sized like its sample.
        return [
            np.array([cls for _ in range(sizes[cls])])
            for cls in range(num_classes)
        ]

    X_train = np.vstack(train_chunks)
    y_train = np.hstack(_class_labels())
    X_test = np.vstack(test_chunks)
    y_test = np.hstack(_class_labels())

    # Compute p(y=k)
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        num_classes,
        trace=avg_trace * num_classes,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Generate the noisy labels using the noise matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    result = {}
    result["X_train"] = X_train
    result["y_train"] = y_train
    result["X_test"] = X_test
    result["y_test"] = y_test
    result["s"] = s
    result["ps"] = ps
    result["py"] = py
    result["noise_matrix"] = noise_matrix
    return result
def test_two_class_nofraczero():
    """Check a valid 2-class noise matrix has no zero entries and the requested trace."""
    trace = 1.1
    nm = noise_generation.generate_noise_matrix_from_trace(
        K=2,
        trace=trace,
        valid_noise_matrix=True,
    )
    # Make sure there is not a zero noise rate.
    assert not np.any(nm == 0)
    # The comparison must sit outside abs(); abs(a - b < tol) only tests one
    # direction of the deviation.
    assert abs(trace - np.trace(nm)) < 1e-2
def test_two_class_fraczero_high(valid=False):
    """Check a 2-class noise matrix with a high zero-rate fraction contains a zero.

    Parameters
    ----------
    valid : bool
        Forwarded as ``valid_noise_matrix`` to
        ``generate_noise_matrix_from_trace``.
    """
    trace = 1.8
    frac_zero_noise_rates = 0.75
    nm = noise_generation.generate_noise_matrix_from_trace(
        K=2,
        trace=trace,
        valid_noise_matrix=valid,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    # Make sure there is a zero noise rate.
    assert np.any(nm == 0)
    # The comparison must sit outside abs(); abs(a - b < tol) only tests one
    # direction of the deviation.
    assert abs(trace - np.trace(nm)) < 1e-2
def test_max_iter():
    """Exercise the ``max_iter`` argument of ``generate_noise_matrix_from_trace``.

    With ``max_iter = 1`` and a uniform 3-class prior the returned matrix must
    have the requested trace and ``sum(nm @ py)`` must equal 1. With
    ``max_iter = 0`` the return value must compare equal to ``False``.
    """
    target_trace = 2
    num_classes = 3
    uniform_py = [1.0 / num_classes] * num_classes

    single_iter_nm = noise_generation.generate_noise_matrix_from_trace(
        K=num_classes,
        trace=target_trace,
        valid_noise_matrix=True,
        max_iter=1,
        py=uniform_py,
        seed=1,
    )
    trace_gap = abs(np.trace(single_iter_nm) - target_trace)
    assert trace_gap < 1e-6
    mass_gap = abs(sum(np.dot(single_iter_nm, uniform_py)) - 1)
    assert mass_gap < 1e-6

    zero_iter_result = noise_generation.generate_noise_matrix_from_trace(
        K=3,
        trace=target_trace,
        valid_noise_matrix=True,
        py=[0.1, 0.1, 0.8],
        max_iter=0,
    )
    assert zero_iter_result == False  # noqa: E712
def add_outlier(y_train):
    """Corrupt ``y_train`` with a randomly seeded 2-class noise matrix.

    Parameters
    ----------
    y_train : array-like
        Training labels to corrupt.
        NOTE(review): the noise matrix is requested for 2 classes, so this
        presumably expects binary labels -- confirm against callers.

    Returns
    -------
    tuple
        ``(y_train_corrupted, pct)`` where ``pct`` is the share of labels that
        differ from ``y_train``, as a percentage truncated to an int.
    """
    seed = random.randint(0, 1000)
    _, class_counts = np.unique(y_train, return_counts=True)
    # Pass class proportions rather than raw counts as ``py``, matching how
    # the other call sites compute p(y=k) as counts / total.
    py = class_counts / float(class_counts.sum())
    noise_matrix = generate_noise_matrix_from_trace(2, 1.95, min_trace_prob=0.15, py=py, seed=seed)
    # Seed numpy with the same value right before drawing the noisy labels.
    np.random.seed(seed)
    y_train_corrupted = generate_noisy_labels(y_train, noise_matrix)
    y_train_is_error = y_train_corrupted != y_train
    n = y_train_is_error.sum()
    return y_train_corrupted, int(n / len(y_train) * 100)
GaussianNB(), ), ( "Logistic Regression", LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto'), ), ]: print("\n", "=" * len(name), "\n", name, '\n', "=" * len(name)) np.random.seed(seed=0) clf_copy = copy.deepcopy(clf) # Compute p(y=k), the ground truth class prior on the labels. py = np.bincount(y_train) / float(len(y_train)) # Generate the noisy channel to characterize the label errors. noise_matrix = generate_noise_matrix_from_trace( K=num_classes, trace=num_classes * avg_trace, py=py, frac_zero_noise_rates=frac_zero_noise_rates, ) print_noise_matrix(noise_matrix) # Create the noisy labels. This method is exact w.r.t. the noise_matrix. y_train_with_errors = generate_noisy_labels(y_train, noise_matrix) lnl_cv = GridSearch( model=LearningWithNoisyLabels(clf), param_grid=param_grid, num_threads=4, seed=0, ) lnl_cv.fit( X_train=X_train, y_train=y_train_with_errors, X_val=X_val,
def make_data(
    sparse=False,
    means=[[3, 2], [7, 7], [0, 8]],
    covs=[[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]],
    sizes=[80, 40, 40],
    avg_trace=0.8,
    seed=1,  # set to None for non-reproducible randomness
):
    """Build a synthetic Gaussian dataset with noisy labels and latent estimates.

    For every class a train and a test sample are drawn from
    ``np.random.multivariate_normal``. When ``sparse`` is true both feature
    matrices are converted to ``scipy.sparse.csr_matrix``. Noisy labels are
    generated from a noise matrix with trace ``avg_trace * number_of_classes``,
    after which the inverse noise matrix is computed and
    ``estimate_py_noise_matrices_and_cv_pred_proba`` is run with 3 CV folds.

    Returns
    -------
    dict
        Keys: ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``s``, ``ps``,
        ``py``, ``noise_matrix``, ``inverse_noise_matrix``, ``est_py``,
        ``est_nm``, ``est_inv``, ``cj``, ``psx`` (items 0-4 of the latent
        estimation result, in that order), ``m`` (number of classes) and
        ``n`` (sum of ``sizes``).
    """
    np.random.seed(seed=seed)
    num_classes = len(means)
    total_examples = sum(sizes)

    train_chunks = []
    test_chunks = []
    for cls in range(num_classes):
        class_kwargs = dict(mean=means[cls], cov=covs[cls], size=sizes[cls])
        # Train is drawn before test for each class so the random stream is
        # consumed in a fixed order.
        train_chunks.append(np.random.multivariate_normal(**class_kwargs))
        test_chunks.append(np.random.multivariate_normal(**class_kwargs))

    def _class_labels():
        # One constant-valued label array per class, sized like its sample.
        return [
            np.array([cls for _ in range(sizes[cls])])
            for cls in range(num_classes)
        ]

    X_train = np.vstack(train_chunks)
    y_train = np.hstack(_class_labels())
    X_test = np.vstack(test_chunks)
    y_test = np.hstack(_class_labels())

    if sparse:
        X_train = scipy.sparse.csr_matrix(X_train)
        X_test = scipy.sparse.csr_matrix(X_test)

    # Compute p(y=k)
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        num_classes,
        trace=avg_trace * num_classes,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Generate the noisy labels using the noise matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    # Compute inverse noise matrix
    inverse_noise_matrix = compute_inv_noise_matrix(py, noise_matrix, ps)

    # Estimate psx
    latent = latent_estimation.estimate_py_noise_matrices_and_cv_pred_proba(
        X=X_train,
        s=s,
        cv_n_folds=3,
    )

    result = {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
        "inverse_noise_matrix": inverse_noise_matrix,
    }
    # Items 0-4 of the latent estimation result, stored under named keys.
    latent_keys = ("est_py", "est_nm", "est_inv", "cj", "psx")
    for position, key in enumerate(latent_keys):
        result[key] = latent[position]
    result["m"] = num_classes
    result["n"] = total_examples
    return result
# Candidate class priors p(y): hand-picked ones first, then linearly
# increasing priors for k = 2..13 classes.
pys = [
    [.01, .39, .6],
    [0.2] * 5,
    [.1, .3, .6],
    [.3, .3, .4],
    [.2, .6, .2],
    [.1, .2, .7],
    [.1, .15, .15, .6],
    [.1, .1, .2, .6],
    [1. / 3, 1. / 3, 1. / 3],
    [.02, .05, .08, .15, .15, .25, .3],
    [.05, .1, .15, .35, 0.35],
]
pys = pys + [
    np.arange(1, k + 1, dtype=float) / sum(range(k + 1)) for k in range(2, 14)
]

# Sample positions instead of the priors themselves: the entries of ``pys``
# have different lengths, so handing the ragged list to np.random.choice
# forces an inhomogeneous array conversion.
for py_index in np.random.choice(len(pys), number_of_polyplices_to_display):
    py = pys[py_index]
    ax = draw_polyplex(py)
    K = len(py)
    # Sweep the trace over (0, K) in steps of K / 33.
    for trace in np.arange(K / 33., K, K / 33.):
        # py = np.arange(1,K+1, dtype=float) / sum(range(K+1))
        nm = generate_noise_matrix_from_trace(K, trace, valid_noise_matrix=False, py=py)
        valid = noise_matrix_is_valid(nm, py)
        joint_trace = np.trace(nm * py)
        # print('ps is', (nm*py).sum(axis=1))
        # Mark the point with 'v' when the matrix is valid, 'n' otherwise.
        _ = ax.text(trace, joint_trace, s='v' if valid else 'n', color='red', size=30)
        # for z in np.arange(100):
        #     # py = np.arange(1,K+1, dtype=float) / sum(range(K+1))
        #     nm = generate_noise_matrix_from_trace(K, nm_avg_trace, valid_noise_matrix=True, py=py)
        #     joint_trace = np.trace(nm*py)
        #     # print('ps is', (nm*py).sum(axis=1))
y_train, test_size=.25, random_state=1) num_classes = len(np.unique(y_train)) print('Running dataset', ds_cnt + 1, 'with m =', num_classes, 'classes and n =', len(X_train), 'training examples.') # CONFIDENT LEARNING COMPONENT np.random.seed(seed=0) py = np.bincount(y_train) / float(len(y_train)) # Generate the noisy channel to characterize the label errors. noise_matrix = generate_noise_matrix_from_trace( K=num_classes, trace=num_classes * avg_trace, py=py, frac_zero_noise_rates=FRAC_ZERO_NOISE_RATES, ) print_noise_matrix(noise_matrix) np.random.seed(seed=1) # Create the noisy labels. This method is exact w.r.t. the noise_matrix. y_train_w_errors = generate_noisy_labels(y_train, noise_matrix) clf_results = {} # iterate over classifiers for name, clf in zip(names, classifiers): # Create four copies of the classifier. # perf_label_clf - Will be trained on the hidden, noise-free labels # noisy_clf - Will be trained on the noisy labels # noisy_clf_w_rp - Will be trained on the noisy labels using LearningWithNoisyLabels
multivariate_normal(mean=means[idx], cov=covs[idx], size=sizes[idx])) test_data.append( multivariate_normal(mean=means[idx], cov=covs[idx], size=sizes[idx])) labels.append(np.array([idx for i in range(sizes[idx])])) test_labels.append(np.array([idx for i in range(sizes[idx])])) X_train = np.vstack(data) y_train = np.hstack(labels) X_test = np.vstack(test_data) y_test = np.hstack(test_labels) # Compute p(y=k) py = np.bincount(y_train) / float(len(y_train)) noise_matrix = generate_noise_matrix_from_trace( K, trace=1.5, py=py, valid_noise_matrix=True, ) # Generate our noisy labels using the noise_marix. s = generate_noisy_labels(y_train, noise_matrix) ps = np.bincount(s) / float(len(s)) confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(X_train, s, seed=seed) est_py, est_noise_matrix, est_inverse_noise_matrix = estimate_latent( confident_joint, s) idx_errors = get_noise_indices(s, psx) # #### To show off the power of **cleanlab**, we've chosen an example of multiclass learning with noisy labels in which over 50% of the training labels are wrong.
(0.2023, 0.1994, 0.2010)), ]), ) y = train_dataset.targets K = int(cifar_dataset[5:]) print(cifar_dataset, np.bincount(y)) for frac_zero_noise_rates in np.arange(0, 0.8, 0.2): for noise_amount in np.arange(0, 1, 0.2): print('noise_amount', round(noise_amount, 1), '| frac_zero_noise_rates', round(frac_zero_noise_rates, 1)) # Generate class-conditional noise nm = noise_generation.generate_noise_matrix_from_trace( K=K, trace=int(K * (1 - noise_amount)), valid_noise_matrix=False, frac_zero_noise_rates=frac_zero_noise_rates, seed=0, ) # noise matrix is valid if diagonal maximizes row and column valid = all((nm.argmax(axis=0) == range(K)) & (nm.argmax(axis=1) == range(K))) print('valid:', valid) # Create noisy labels np.random.seed(seed=0) s = noise_generation.generate_noisy_labels(y, nm) # Check accuracy of s and y print('Accuracy of s and y:', sum(s == y) / len(s))
print( "target names", target_names ) # C→corporate industrial, E→economics, G→goverment, M→わからない...Market? mask_row = ( data.target[:, mask_col].toarray().sum(axis=1) == 1 ) # マルチクラスが割り当てられているサンプルは削除 y = data.target[mask_row][:, mask_col] X = data.data[mask_row] py = y.toarray().sum(axis=0).reshape(-1) # given labelの数 print("samples", X.shape[0], "category value counts", py) y = np.array(y.argmax(axis=1)).reshape(-1) # one-hot to num # generate noise matrix noise_matrix = generate_noise_matrix_from_trace( 4, 3, min_trace_prob=0.6, frac_zero_noise_rates=0.5, py=py, seed=seed, ) print("p(given=i|true=j) =") print(noise_matrix) """ p(given=i|true=j) = [[0.68936167 0. 0. 0. ] [0.2387445 0.85410683 0.21184431 0.05112328] [0. 0.14589317 0.78815569 0.28050091] [0.07189383 0. 0. 0.66837581]] """ # define base Classifier baseclf = LogisticRegression params = { "solver": "liblinear", "multi_class": "auto",