def test_main_pipeline(
        verbose=False,
        n=10,
        valid_noise_matrix=True,
        frac_zero_noise_rates=0,
):
    """Smoke-test the noise-generation pipeline end to end.

    Builds a noise matrix with a target trace, checks its basic
    column-stochastic invariants, then draws noisy labels from it.

    Parameters
    ----------
    verbose : bool
        Passed through to the ``noise_generation`` helpers.
    n : int
        Number of examples synthesized from the prior ``py``.
    valid_noise_matrix : bool
        Require a learnable (valid) noise matrix.
    frac_zero_noise_rates : float
        Fraction of off-diagonal entries forced to zero.
    """
    trace = 1.5
    py = [0.1, 0.1, 0.2, 0.6]
    K = len(py)
    # Materialize labels matching the prior: int(p * n) copies of class i.
    y = [i for i, p in enumerate(py) for _ in range(int(p * n))]
    nm = noise_generation.generate_noise_matrix_from_trace(
        K=K,
        trace=trace,
        py=py,
        seed=0,
        valid_noise_matrix=valid_noise_matrix,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    # Check that the trace is what it's supposed to be.
    # Bug fix: abs() must wrap the *difference*, not the comparison --
    # the original `abs(trace - np.trace(nm) < 1e-2)` evaluated
    # `abs(bool)` and thus only tested the one-sided inequality.
    assert abs(trace - np.trace(nm)) < 1e-2
    # Check that the sum of probabilities is K.
    assert abs(nm.sum() - K) < 1e-4
    # Check that the sum of each column is 1.
    assert all(abs(nm.sum(axis=0) - 1) < 1e-4)
    # Check that the joint distribution sums to 1.
    # Same misplaced-parenthesis fix as the trace check above.
    assert abs(np.sum(nm * py) - 1) < 1e-4
    s = noise_generation.generate_noisy_labels(y, nm, verbose)
    assert noise_generation.noise_matrix_is_valid(nm, py, verbose)
def make_data(
        means=[[3, 2], [7, 7], [0, 8]],
        covs=[[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]],
        sizes=[800, 400, 400],
        avg_trace=0.8,
        seed=0,  # set to None for non-reproducible randomness
):
    """Synthesize a Gaussian-mixture dataset with noisy training labels.

    Draws a train and a test split from per-class multivariate normals,
    then corrupts the training labels through a generated noise matrix.

    Returns a dict with the splits, the noisy labels ``s``, the priors
    ``py``/``ps``, and the ``noise_matrix`` used for corruption.
    """
    np.random.seed(seed=seed)
    K = len(means)  # number of classes

    data, labels = [], []
    test_data, test_labels = [], []
    for k in range(K):
        # Train and test samples come from the same class-k Gaussian.
        data.append(np.random.multivariate_normal(
            mean=means[k], cov=covs[k], size=sizes[k]))
        test_data.append(np.random.multivariate_normal(
            mean=means[k], cov=covs[k], size=sizes[k]))
        labels.append(np.full(sizes[k], k, dtype=int))
        test_labels.append(np.full(sizes[k], k, dtype=int))

    X_train, y_train = np.vstack(data), np.hstack(labels)
    X_test, y_test = np.vstack(test_data), np.hstack(test_labels)

    # Ground-truth class prior p(y=k).
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        K,
        trace=avg_trace * K,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Corrupt the clean training labels through the noise matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
    }
def add_outlier(y_train):
    """Corrupt binary labels with a near-clean (trace 1.95) noise matrix.

    A fresh random seed is drawn on every call, so repeated calls yield
    different corruptions.

    Returns
    -------
    tuple
        ``(corrupted_labels, error_percentage)`` where the percentage is
        the integer share of flipped labels.

    NOTE(review): ``py`` passed below is the raw class *counts* from
    ``np.unique``, not normalized probabilities -- confirm that
    ``generate_noise_matrix_from_trace`` accepts unnormalized priors.
    """
    seed = random.randint(0, 1000)
    _, py = np.unique(y_train, return_counts=True)
    noise_matrix = generate_noise_matrix_from_trace(
        2, 1.95, min_trace_prob=0.15, py=py, seed=seed)
    np.random.seed(seed)
    corrupted = generate_noisy_labels(y_train, noise_matrix)
    flipped = (corrupted != y_train).sum()
    return corrupted, int(flipped / len(y_train) * 100)
]: print("\n", "=" * len(name), "\n", name, '\n', "=" * len(name)) np.random.seed(seed=0) clf_copy = copy.deepcopy(clf) # Compute p(y=k), the ground truth class prior on the labels. py = np.bincount(y_train) / float(len(y_train)) # Generate the noisy channel to characterize the label errors. noise_matrix = generate_noise_matrix_from_trace( K=num_classes, trace=num_classes * avg_trace, py=py, frac_zero_noise_rates=frac_zero_noise_rates, ) print_noise_matrix(noise_matrix) # Create the noisy labels. This method is exact w.r.t. the noise_matrix. y_train_with_errors = generate_noisy_labels(y_train, noise_matrix) lnl_cv = GridSearch( model=LearningWithNoisyLabels(clf), param_grid=param_grid, num_threads=4, seed=0, ) lnl_cv.fit( X_train=X_train, y_train=y_train_with_errors, X_val=X_val, y_val=y_val, verbose=False, ) # Also compute the test score with default parameters clf_copy.fit(X_train, y_train_with_errors)
def make_data(
        sparse=False,
        means=[[3, 2], [7, 7], [0, 8]],
        covs=[[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]],
        sizes=[80, 40, 40],
        avg_trace=0.8,
        seed=1,  # set to None for non-reproducible randomness
):
    """Build a small Gaussian-mixture dataset plus estimated noise stats.

    Like the dense generator, but optionally returns sparse CSR feature
    matrices and additionally computes the analytic inverse noise matrix
    and cross-validated latent estimates.

    Returns a dict holding the splits, noisy labels, true and estimated
    noise quantities, and the class/example counts ``m`` and ``n``.
    """
    np.random.seed(seed=seed)
    m = len(means)  # number of classes
    n = sum(sizes)

    data, labels = [], []
    test_data, test_labels = [], []
    for k in range(m):
        # Train and test splits share each class's Gaussian parameters.
        data.append(np.random.multivariate_normal(
            mean=means[k], cov=covs[k], size=sizes[k]))
        test_data.append(np.random.multivariate_normal(
            mean=means[k], cov=covs[k], size=sizes[k]))
        labels.append(np.full(sizes[k], k, dtype=int))
        test_labels.append(np.full(sizes[k], k, dtype=int))

    X_train, y_train = np.vstack(data), np.hstack(labels)
    X_test, y_test = np.vstack(test_data), np.hstack(test_labels)
    if sparse:
        X_train = scipy.sparse.csr_matrix(X_train)
        X_test = scipy.sparse.csr_matrix(X_test)

    # Ground-truth class prior p(y=k).
    py = np.bincount(y_train) / float(len(y_train))
    noise_matrix = generate_noise_matrix_from_trace(
        m,
        trace=avg_trace * m,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Corrupt the clean training labels through the noise matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    # Analytic inverse noise matrix from the known quantities.
    inv = compute_inv_noise_matrix(py, noise_matrix, ps)

    # Cross-validated estimates of py, the noise matrices, and psx.
    latent = latent_estimation.estimate_py_noise_matrices_and_cv_pred_proba(
        X=X_train,
        s=s,
        cv_n_folds=3,
    )

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
        "inverse_noise_matrix": inv,
        "est_py": latent[0],
        "est_nm": latent[1],
        "est_inv": latent[2],
        "cj": latent[3],
        "psx": latent[4],
        "m": m,
        "n": n,
    }
except Exception as e: print(e) print("Plotting is only supported in an iPython interface.") # In[3]: # Generate lots of noise. noise_matrix = np.array([ [0.5, 0.0, 0.0], [0.5, 1.0, 0.5], [0.0, 0.0, 0.5], ]) py = value_counts(y_train) # Create noisy labels s = generate_noisy_labels(y_train, noise_matrix) try: get_ipython().run_line_magic('matplotlib', 'inline') from matplotlib import pyplot as plt _ = plt.figure(figsize=(15, 8)) color_list = plt.cm.tab10(np.linspace(0, 1, 6)) for k in range(len(np.unique(y_train))): X_k = X_train[y_train == k] # data for class k _ = plt.scatter( X_k[:, 1], X_k[:, 3], color=[color_list[noisy_label] for noisy_label in s[y_train == k]], s=200, marker=r"${a}$".format(a=str(k)),
nm = noise_generation.generate_noise_matrix_from_trace( K=K, trace=int(K * (1 - noise_amount)), valid_noise_matrix=False, frac_zero_noise_rates=frac_zero_noise_rates, seed=0, ) # noise matrix is valid if diagonal maximizes row and column valid = all((nm.argmax(axis=0) == range(K)) & (nm.argmax(axis=1) == range(K))) print('valid:', valid) # Create noisy labels np.random.seed(seed=0) s = noise_generation.generate_noisy_labels(y, nm) # Check accuracy of s and y print('Accuracy of s and y:', sum(s == y) / len(s)) # Create map of filenames to noisy labels d = dict( zip([i for i, j in train_dataset.imgs], [int(i) for i in s])) # Store dictionary as json wfn_base = '{}_noisy_labels__frac_zero_noise_rates__{}__noise_amount__{}'.format( cifar_dataset, "0.0" if frac_zero_noise_rates < 1e-4 else round( frac_zero_noise_rates, 1), "0.0" if noise_amount < 1e-4 else round(noise_amount, 1), )
y_pseudo = np.hstack([y_train_pseudo, y_test_psuedo]) X_for_pseudo = sp.vstack([X_train, X_test]) # pseudo込の全データでtrain model.fit(X_for_pseudo, y_pseudo) return model.score(X_test, y_test) # cross validation cv = KFold(4, shuffle=True, random_state=seed) result = defaultdict(lambda: []) for i, (train_idx, test_idx) in enumerate(cv.split(X, y)): X_train, y_train = X[train_idx], y[train_idx] X_test, y_test = X[test_idx], y[test_idx] y_train_corrupted = generate_noisy_labels(y_train, noise_matrix) result["ML:clean"].append(normal_learning( X_train, y_train, X_test, y_test)) result["ML:noisy"].append( normal_learning(X_train, y_train_corrupted, X_test, y_test) ) clclf_trained = ret_trainedCLclass( X_train, y_train_corrupted, X_test, y_test) result["CL:wituout noisy labels"].append( train_without_noisy_labels( X_train, y_train_corrupted, X_test, y_test, clf=clclf_trained) ) result["CL:pseudo for noisy labels"].append( train_noisy_to_pseudo(X_train, y_train_corrupted, X_test, y_test, clf=clclf_trained)