Beispiel #1
0
def test_n_train_examples(n=500):
    """Fit the CNN on ``n`` random training examples and check its outputs.

    Three things are checked, in order:

    1. After fitting on a random subset, accuracy on the first ``n`` test
       examples is above 0.1.
    2. With an unrecognised loader name, ``predict`` is expected to return
       ``MNIST_TEST_SIZE`` predictions.
    3. ``predict_proba(idx=None)`` is expected to return one entry per test
       example, i.e. ``MNIST_TEST_SIZE`` entries.

    The body is skipped (the test passes trivially) when
    ``python_version.is_compatible()`` is false.

    n: number of training examples sampled without replacement, and the
       number of leading test examples that are scored.
    """
    if not python_version.is_compatible():
        return

    cnn = CNN(epochs=3, log_interval=1000, loader='train', seed=0)
    # Grab n random examples.
    idx = np.random.choice(X_train, n, replace=False)
    cnn.fit(train_idx=X_train[idx],
            train_labels=y_train[idx],
            loader='train')
    cnn.loader = 'test'
    pred = cnn.predict(X_test[:n])
    # Score once and reuse the value for both the log line and the check.
    accuracy = accuracy_score(y_test[:n], pred)
    print(accuracy)
    assert accuracy > 0.1

    # Check that dataset defaults to test set when an invalid name is given.
    cnn.loader = 'INVALID'
    pred = cnn.predict(X_test[:n])
    assert len(pred) == MNIST_TEST_SIZE

    # Check that pred_proba runs on all examples when None is passed in.
    cnn.loader = 'test'
    proba = cnn.predict_proba(idx=None, loader='test')
    assert proba is not None
    # Assert on `proba` itself; the earlier `pred` was already checked above.
    assert len(proba) == MNIST_TEST_SIZE
Beispiel #2
0
def test_loaders(
        seed=0,
        n=300,  # Number of training examples to use
        pretrain_epochs=2,  # Increase to at least 10 for good results
):
    """This is going to OVERFIT - train and test on the SAME SET.
    The goal of this test is just to make sure the data loads correctly.
    And all the main functions work.

    For each loader setting ('train', 'test', None) the CNN is fitted, the
    confident joint and cross-validated predicted probabilities are
    estimated, label-error indices are computed, and the accuracy on the
    same data is asserted to be strictly higher than in the previous
    iteration.

    The body is skipped (the test passes trivially) when
    ``python_version.is_compatible()`` is false.

    NOTE(review): ``pretrain_epochs`` is accepted but never read in the body;
    the CNN is always constructed with ``epochs=1``.
    """

    from cleanlab.latent_estimation import (
        estimate_confident_joint_and_cv_pred_proba, estimate_latent)

    if not python_version.is_compatible():
        return

    np.random.seed(seed)
    # Grab n random examples from each split.
    idx = np.random.choice(X_train, n, replace=False)
    test_idx = np.random.choice(X_test, n, replace=False)

    prune_method = 'prune_by_noise_rate'

    # Pre-train. A single model is built here; it is reused (and keeps
    # training) across all loader settings in the loop below.
    cnn = CNN(epochs=1, log_interval=None, seed=seed)
    score = 0
    for loader in ['train', 'test', None]:
        print('loader:', loader)
        prev_score = score
        X = X_test[test_idx] if loader == 'test' else X_train[idx]
        y = y_test[test_idx] if loader == 'test' else y_train[idx]
        # Setting this overrides all future functions.
        cnn.loader = loader
        # pre-train (overfit, not out-of-sample) to entire dataset.
        cnn.fit(X, None, loader='train')

        # Out-of-sample cross-validated holdout predicted probabilities
        np.random.seed(seed)
        # Single epoch for cross-validation (already pre-trained)
        cnn.epochs = 1
        cj, psx = estimate_confident_joint_and_cv_pred_proba(X,
                                                             y,
                                                             cnn,
                                                             cv_n_folds=2)
        est_py, est_nm, est_inv = estimate_latent(cj, y)
        # algorithmic identification of label errors
        noise_idx = cleanlab.pruning.get_noise_indices(
            y, psx, est_inv, prune_method=prune_method)
        # Make sure the pruning step actually produced a result.
        assert noise_idx is not None

        # Get prediction on loader set (in this case same as train set)
        pred = cnn.predict(X, loader='train')
        score = accuracy_score(y, pred)
        print(score)
        assert score > prev_score  # Scores should increase
Beispiel #3
0
def test_n_train_examples():
    """Fit the CNN on the sklearn-digits training split and check its outputs.

    Three things are checked, in order:

    1. After fitting, accuracy on the test indices is above 0.1.
    2. ``predict`` raises ``ValueError`` when the loader name is invalid.
    3. ``predict_proba(idx=None)`` is expected to return one entry per test
       example, i.e. ``SKLEARN_DIGITS_TEST_SIZE`` entries.

    The body is skipped (the test passes trivially) when
    ``python_version.is_compatible()`` is false.
    """
    if not python_version.is_compatible():
        return

    cnn = CNN(epochs=3, log_interval=1000, loader='train', seed=0,
              dataset='sklearn-digits', )
    cnn.fit(train_idx=X_train_idx, train_labels=y_train,
            loader='train', )
    cnn.loader = 'test'
    pred = cnn.predict(X_test_idx)
    # Score once and reuse the value for both the log line and the check.
    accuracy = accuracy_score(y_test, pred)
    print(accuracy)
    assert accuracy > 0.1

    # Check that exception is raised when invalid name is given.
    cnn.loader = 'INVALID'
    with pytest.raises(ValueError):
        cnn.predict(X_test_idx)

    # Check that pred_proba runs on all examples when None is passed in.
    cnn.loader = 'test'
    proba = cnn.predict_proba(idx=None, loader='test')
    assert proba is not None
    # Assert on `proba` itself; `pred` still holds the first predict() result.
    assert len(proba) == SKLEARN_DIGITS_TEST_SIZE
Beispiel #4
0
def test_throw_exception():
    """Check that ``CNN.fit`` rejects indices and labels of different lengths.

    ``fit`` is called with two indices but only one label. The call must
    raise ``ValueError`` and the message must mention 'same length'. The test
    fails if no exception is raised at all.

    The body is skipped (the test passes trivially) when
    ``python_version.is_compatible()`` is false.
    """
    if not python_version.is_compatible():
        return

    cnn = CNN(epochs=1, log_interval=1000, seed=0)
    with pytest.raises(ValueError) as excinfo:
        cnn.fit(train_idx=[0, 1], train_labels=[1])
    # `excinfo.value` is the raised exception instance.
    assert 'same length' in str(excinfo.value)
Beispiel #5
0
def test_loaders(
        seed=0,
):
    """Smoke-test the data loaders and the main cleanlab entry points.

    This deliberately OVERFITS: the model is trained and evaluated on the
    SAME SET, because the aim is only to confirm that data loads and that
    fit / cross-validation / pruning / predict all run together.

    Nothing is executed when ``python_version.is_compatible()`` is false.
    """

    from cleanlab.latent_estimation import (
        estimate_confident_joint_and_cv_pred_proba, estimate_latent)

    if not python_version.is_compatible():
        return

    np.random.seed(seed)
    pruning_strategy = 'prune_by_noise_rate'
    # Only 3 pre-training epochs (accuracy plateaus around 8-12 epochs).
    cnn = CNN(epochs=3, log_interval=None, seed=seed,
              dataset='sklearn-digits')
    latest_accuracy = 0
    for loader in ['train', 'test', None]:
        print('loader:', loader)
        earlier_accuracy = latest_accuracy
        if loader == 'test':
            features, labels = X_test_idx, y_test
        else:
            features, labels = X_train_idx, y_train
        # Once set on the instance, this takes precedence in later calls.
        cnn.loader = loader
        # Fit on the whole split (overfit on purpose, not out-of-sample).
        cnn.fit(features, None, )

        # Verify that cleanlab works with the CNN: compute out-of-sample,
        # cross-validated predicted probabilities.
        np.random.seed(seed)
        # One epoch is enough for cross-validation (already pre-trained).
        cnn.epochs = 1
        confident_joint, pred_probs = estimate_confident_joint_and_cv_pred_proba(
            features, labels, cnn, cv_n_folds=2)
        _prior, _noise_matrix, inverse_noise_matrix = estimate_latent(
            confident_joint, labels)
        # Algorithmic identification of label errors.
        flagged = cleanlab.pruning.get_noise_indices(
            labels, pred_probs, inverse_noise_matrix,
            prune_method=pruning_strategy)
        assert flagged is not None

        # Predict on the same split that was just used for fitting.
        predictions = cnn.predict(features)
        latest_accuracy = accuracy_score(labels, predictions)
        print('Acc Before: {:.2f}, After: {:.2f}'.format(
            earlier_accuracy, latest_accuracy))
        # Accuracy is expected to rise from one loader setting to the next.
        assert latest_accuracy > earlier_accuracy
        )
        plt.show()
    else:
        print('max_images is too large. Not enough images to display.')

# ## Show the result generalizes for different seed values.

# In[ ]:

# Notebook cell: repeat the confident-joint estimation over several NumPy
# seeds, starting each run from the same pre-trained CNN weights.

# Initialize constants
max_images = 24
np.random.seed(43)
prune_method = 'prune_by_noise_rate'

# Pre-train once for 15 epochs on the full training data.
cnn = CNN(epochs=15, log_interval=None, loader='train')  # pre-train
cnn.fit(X_train, y_train, loader='train'
        )  # pre-train (overfit, not out-of-sample) to entire dataset.
# NOTE(review): in PyTorch, state_dict() returns references to the live
# parameter tensors rather than copies. If cnn.model is trained in place
# later, `params` would change with it - confirm whether a deep copy is needed.
params = cnn.model.state_dict()  # store CNN's weights after pretraining

cnn.epochs = 1  # Single epoch for cross-validation (already pre-trained)
for seed in range(21, 35):
    # Re-seed NumPy so each iteration uses a different random state.
    np.random.seed(seed)
    # Reload the stored pre-trained weights before each run.
    cnn.model.load_state_dict(params)
    # Confident joint (cj) and cross-validated predicted probabilities (psx).
    cj, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(
        X_train,
        y_train,
        clf=cnn,
    )
    # Latent estimates derived from the confident joint and the given labels.
    est_py, est_nm, est_inv = cleanlab.latent_estimation.estimate_latent(
        cj, y_train)