def test_loaders(
        seed=0,
        n=300,  # Number of training examples to use
        pretrain_epochs=2,  # Increase to at least 10 for good results
):
    """Smoke-test the CNN wrapper and cleanlab with each loader setting.

    This is going to OVERFIT - train and test on the SAME SET.
    The goal of this test is just to make sure the data loads correctly
    and that all the main functions work.

    Parameters
    ----------
    seed : int
        Seeds NumPy's global RNG and is passed to the CNN constructor.
    n : int
        Number of entries drawn (without replacement) from each of the
        module-level ``X_train`` and ``X_test``.
    pretrain_epochs : int
        Currently not read by the test body (the CNN is built with
        ``epochs=1``); kept so existing callers keep working.
        NOTE(review): presumably meant to set the pre-training epochs -
        confirm before wiring it in, since that changes the scores the
        assertions below compare.
    """

    from cleanlab.latent_estimation import (
        estimate_confident_joint_and_cv_pred_proba, estimate_latent)

    if python_version.is_compatible():
        np.random.seed(seed)
        # Grab n random examples from each split.
        idx = np.random.choice(X_train, n, replace=False)
        test_idx = np.random.choice(X_test, n, replace=False)
        prune_method = 'prune_by_noise_rate'
        # Pre-train. Only one CNN is built: an earlier throwaway instance
        # (hard-coded seed=0) was overwritten before use and is removed.
        cnn = CNN(epochs=1, log_interval=None, seed=seed)
        score = 0
        for loader in ['train', 'test', None]:
            print('loader:', loader)
            prev_score = score
            X = X_test[test_idx] if loader == 'test' else X_train[idx]
            y = y_test[test_idx] if loader == 'test' else y_train[idx]
            # Setting this overrides all future functions.
            cnn.loader = loader
            # pre-train (overfit, not out-of-sample) to entire dataset.
            cnn.fit(X, None, loader='train')
            # Out-of-sample cross-validated holdout predicted probabilities
            np.random.seed(seed)
            # Single epoch for cross-validation (already pre-trained)
            cnn.epochs = 1
            cj, psx = estimate_confident_joint_and_cv_pred_proba(
                X, y, cnn, cv_n_folds=2)
            est_py, est_nm, est_inv = estimate_latent(cj, y)
            # algorithmic identification of label errors
            noise_idx = cleanlab.pruning.get_noise_indices(
                y, psx, est_inv, prune_method=prune_method)
            # Make sure the pruning step actually produced a result.
            assert noise_idx is not None
            # Get prediction on loader set (in this case same as train set)
            pred = cnn.predict(X, loader='train')
            score = accuracy_score(y, pred)
            print(score)
            assert (score > prev_score)  # Scores should increase

    # No-op assertion: when the version check above is false there is
    # nothing to verify and the test passes trivially.
    assert True
def test_loaders(
        seed=0,
):
    """Check that the CNN and cleanlab run end to end for every loader.

    The model is fit and scored on the SAME SET, so it will OVERFIT on
    purpose: the point is only to confirm that data loading and the main
    functions work, not to measure generalization.

    Parameters
    ----------
    seed : int
        Seeds NumPy's global RNG and is passed to the CNN constructor.
    """

    from cleanlab.latent_estimation import (
        estimate_confident_joint_and_cv_pred_proba,
        estimate_latent,
    )

    if python_version.is_compatible():
        np.random.seed(seed)
        pruning_strategy = 'prune_by_noise_rate'
        # Pre-train for only 3 epochs (it maxes out around 8-12 epochs)
        model = CNN(
            epochs=3,
            log_interval=None,
            seed=seed,
            dataset='sklearn-digits',
        )
        score = 0
        for loader in ('train', 'test', None):
            print('loader:', loader)
            prev_score = score

            # Pick the split that matches the loader under test.
            if loader == 'test':
                X, y = X_test_idx, y_test
            else:
                X, y = X_train_idx, y_train

            # Setting this overrides all future functions.
            model.loader = loader
            # Pre-train (overfit, not out-of-sample) on the entire dataset.
            model.fit(X, None)

            # --- Check that cleanlab works with the CNN ---
            # Out-of-sample cross-validated holdout predicted probabilities.
            np.random.seed(seed)
            # Single epoch for cross-validation (already pre-trained).
            model.epochs = 1
            confident_joint, pred_probs = (
                estimate_confident_joint_and_cv_pred_proba(
                    X, y, model, cv_n_folds=2)
            )
            # Only the third estimate is needed for pruning.
            _, _, inverse_noise = estimate_latent(confident_joint, y)
            # Algorithmic identification of label errors.
            flagged = cleanlab.pruning.get_noise_indices(
                y, pred_probs, inverse_noise, prune_method=pruning_strategy)
            assert flagged is not None

            # Score the predictions on the loader set.
            predictions = model.predict(X)
            score = accuracy_score(y, predictions)
            print('Acc Before: {:.2f}, After: {:.2f}'.format(prev_score, score))
            assert (score > prev_score)  # Scores should increase

    assert True
# ## Show the result generalizes for different seed values. # In[ ]: # Initialize constants max_images = 24 np.random.seed(43) prune_method = 'prune_by_noise_rate' # Pre-train cnn = CNN(epochs=15, log_interval=None, loader='train') #pre-train cnn.fit(X_train, y_train, loader='train' ) # pre-train (overfit, not out-of-sample) to entire dataset. params = cnn.model.state_dict() # store CNN's weights after pretraining cnn.epochs = 1 # Single epoch for cross-validation (already pre-trained) for seed in range(21, 35): np.random.seed(seed) cnn.model.load_state_dict(params) cj, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba( X_train, y_train, clf=cnn, ) est_py, est_nm, est_inv = cleanlab.latent_estimation.estimate_latent( cj, y_train) noise_idx = cleanlab.pruning.get_noise_indices(y_train, psx, est_inv, prune_method=prune_method, confident_joint=cj)