LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto'), ), ]: print("\n", "=" * len(name), "\n", name, '\n', "=" * len(name)) np.random.seed(seed=0) clf_copy = copy.deepcopy(clf) # Compute p(y=k), the ground truth class prior on the labels. py = np.bincount(y_train) / float(len(y_train)) # Generate the noisy channel to characterize the label errors. noise_matrix = generate_noise_matrix_from_trace( K=num_classes, trace=num_classes * avg_trace, py=py, frac_zero_noise_rates=frac_zero_noise_rates, ) print_noise_matrix(noise_matrix) # Create the noisy labels. This method is exact w.r.t. the noise_matrix. y_train_with_errors = generate_noisy_labels(y_train, noise_matrix) lnl_cv = GridSearch( model=LearningWithNoisyLabels(clf), param_grid=param_grid, num_threads=4, seed=0, ) lnl_cv.fit( X_train=X_train, y_train=y_train_with_errors, X_val=X_val, y_val=y_val, verbose=False, )
linewidth=1) _ = plt.scatter( X_train[~idx_errors][:, 0][s[~idx_errors] != y_train[~idx_errors]], X_train[~idx_errors][:, 1][s[~idx_errors] != y_train[~idx_errors]], s=400, facecolors='none', edgecolors='black', linewidth=2, alpha=0.5) _ = plt.title('Dataset after pruning detected label errors.', fontsize=30) plt.show() except: print("Plotting is only supported in an iPython interface.") print('The actual, latent, underlying noise matrix.') print_noise_matrix(noise_matrix) print('Our estimate of the noise matrix.') print_noise_matrix(est_noise_matrix) print() print('The actual, latent, underlying joint distribution matrix.') cleanlab.util.print_joint_matrix(true_joint_distribution_of_label_errors) print('Our estimate of the joint distribution matrix.') cleanlab.util.print_joint_matrix(est_joint) print("Accuracy Comparison") print("-------------------") clf = LogisticRegression(solver='lbfgs', multi_class='auto') baseline_score = accuracy_score(y_test, clf.fit(X_train, s).predict(X_test)) print("Logistic regression:", baseline_score) rp = LearningWithNoisyLabels(seed=seed) rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test)) print("Logistic regression (+rankpruning):", rp_score)
def test_print_noise_matrix():
    """Smoke-test ``util.print_noise_matrix`` on each fixture matrix.

    The printer only writes to stdout, so the test passes as long as no
    call raises.
    """
    for m in [noise_matrix, noise_matrix_2, single_element]:
        # Pass the loop variable: the previous version printed
        # ``noise_matrix`` on every iteration, so the other two fixtures
        # were never exercised.
        util.print_noise_matrix(m)
    assert (True)
# In[17]:

# Load the pickled noise matrix for every (noise amount, sparsity) setting
# of each CIFAR dataset and print it.
import numpy as np
import pickle
from cleanlab import util

for cifar_dataset in ["cifar10"]:  # "cifar100" is currently disabled.
    data_path = '/datasets/datasets/{}/{}/'.format(cifar_dataset, cifar_dataset)
    for noise_amount in np.arange(0.2, 0.61, 0.2):
        for frac_zero_noise_rates in np.arange(0, 0.61, 0.2):
            # File name of the stored noise matrix for this setting, e.g.
            # cifar10_noise_matrix__frac_zero_noise_rates__0.2__noise_amount__0.4
            # Values below 1e-4 are written as the literal string "0.0".
            rfn_base = '{}_noise_matrix__frac_zero_noise_rates__{}__noise_amount__{}'.format(
                cifar_dataset,
                "0.0" if frac_zero_noise_rates < 1e-4 else round(
                    frac_zero_noise_rates, 1),
                "0.0" if noise_amount < 1e-4 else round(noise_amount, 1),
            )
            rfn = data_path + "noisy_labels/" + rfn_base
            # NOTE: pickle.load can execute arbitrary code; only read files
            # that were produced by a trusted run of this project.
            with open(rfn + ".pickle", 'rb') as rf:
                nm = pickle.load(rf)
            # The 0.6 setting is reported as 0.7.
            # NOTE(review): presumably the files named 0.6 actually hold
            # 0.7 noise -- confirm against the code that generated them.
            actual_noise = 0.7 if abs(noise_amount - 0.6) < 1e-3 else noise_amount
            print('Noise amount:', round(actual_noise, 3),
                  "| Sparsity:", round(frac_zero_noise_rates, 3))
            util.print_noise_matrix(nm)