@pytest.mark.parametrize("sparse", [True, False])
def test_fit_with_inm(
    sparse,
    seed=SEED,
    used_by_another_test=False,
):
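    # Providing the true inverse noise matrix should score at least as well
    # (within a small tolerance) as estimating it from the noisy labels.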
    data = SPARSE_DATA if sparse else DATA
    cl = CleanLearning(seed=seed)
    inm = compute_inv_noise_matrix(
        py=data["py"],
        noise_matrix=data["noise_matrix"],
        ps=data["ps"],
    )
    # Learn with noisy labels, with the inverse noise matrix given.
    cl.fit(data["X_train"], data["labels"], inverse_noise_matrix=inm)
    score_inm = cl.score(data["X_test"], data["true_labels_test"])
    # Learn with noisy labels and estimate the inv noise matrix.
    cl2 = CleanLearning(seed=seed)
    cl2.fit(
        data["X_train"],
        data["labels"],
    )
    score = cl2.score(data["X_test"], data["true_labels_test"])
    if used_by_another_test:
        return score, score_inm
    else:
        assert score < score_inm + 1e-4


def test_invalid_inputs():
    data = make_data(sparse=False, sizes=[1, 1, 1])
    # Too little data: fitting should complain about needing more examples.
    with pytest.raises(Exception, match="Need more data"):
        test_cl(data)
    with pytest.raises(Exception, match="not supported|Need more data from each class"):
        cl = CleanLearning(
            clf=LogisticRegression(multi_class="auto", solver="lbfgs", random_state=SEED),
            find_label_issues_kwargs={"return_indices_ranked_by": "self_confidence"},
        )
        cl.fit(
            data["X_train"],
            data["labels"],
        )


@pytest.mark.parametrize("sparse", [True, False])
def test_clf_fit_nm_inm(sparse):
    data = SPARSE_DATA if sparse else DATA
    cl = CleanLearning(seed=SEED)
    nm = data["noise_matrix"]
    inm = compute_inv_noise_matrix(
        py=data["py"],
        noise_matrix=nm,
        ps=data["ps"],
    )
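    # Fit with both the noise matrix and its inverse supplied up front.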
    cl.fit(
        X=data["X_train"],
        labels=data["labels"],
        noise_matrix=nm,
        inverse_noise_matrix=inm,
    )
    score_nm_inm = cl.score(data["X_test"], data["true_labels_test"])

    # Learn with noisy labels and estimate the inv noise matrix.
    cl2 = CleanLearning(seed=SEED)
    cl2.fit(
        data["X_train"],
        data["labels"],
    )
    score = cl2.score(data["X_test"], data["true_labels_test"])
    assert score < score_nm_inm + 1e-4


def test_raise_error_no_clf_predict():
    class Struct:
        def fit(self):
            pass

        def predict_proba(self):
            pass

    # The classifier lacks a predict() method, so constructing CleanLearning
    # with it should raise a ValueError that mentions the missing method.
    with pytest.raises(ValueError, match="predict"):
        CleanLearning(clf=Struct())


# Exercise inputs with N feature dimensions (values chosen arbitrarily).
@pytest.mark.parametrize("N", [2, 3, 4])
def test_dimN(N):
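    # X has N feature dimensions; ReshapingLogisticRegression is expected to
    # flatten it internally so the underlying sklearn model sees 2-D input.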
    cl = CleanLearning(clf=ReshapingLogisticRegression())
    size = [100] + [3 for _ in range(N - 1)]
    X = np.random.normal(size=size)
    labels = np.random.randint(0, 4, size=100)
    # ensure that every class is represented
    labels[0:10] = 0
    labels[10:20] = 1
    labels[20:30] = 2
    labels[30:40] = 3
    # just make sure we don't crash...
    cl.fit(X, labels)
    cl.predict(X)
    cl.predict_proba(X)
    cl.score(X, labels)


@pytest.mark.parametrize("sparse", [True, False])
def test_no_score(sparse):
    data = SPARSE_DATA if sparse else DATA

    class Struct:
        def fit(self):
            pass

        def predict_proba(self):
            pass

        def predict(self, X):
            return data["true_labels_test"]

    cl = CleanLearning(clf=Struct())
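    # The stub classifier defines no score() method, so CleanLearning falls
    # back to accuracy computed from predict(); predicting the true labels
    # yields a perfect score.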
    score = cl.score(data["X_test"], data["true_labels_test"])
    assert abs(score - 1) < 1e-6


@pytest.mark.parametrize("sparse", [True, False])
def test_fit_pred_probs(sparse):
    data = SPARSE_DATA if sparse else DATA

    cl = CleanLearning()
    pred_probs = estimate_cv_predicted_probabilities(
        X=data["X_train"],
        labels=data["true_labels_train"],
    )
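    # Fit with precomputed out-of-sample predicted probabilities instead of
    # letting CleanLearning compute them via cross-validation.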
    cl.fit(X=data["X_train"],
           labels=data["true_labels_train"],
           pred_probs=pred_probs)
    score_with_pred_probs = cl.score(data["X_test"], data["true_labels_test"])
    cl = CleanLearning()
    cl.fit(
        X=data["X_train"],
        labels=data["true_labels_train"],
    )
    score_no_pred_probs = cl.score(data["X_test"], data["true_labels_test"])
    assert abs(score_with_pred_probs - score_no_pred_probs) < 0.01


# Parameter values below are representative choices; the exact grid is not critical.
@pytest.mark.parametrize("filter_by", ["prune_by_noise_rate", "prune_by_class", "both"])
@pytest.mark.parametrize("seed", [0, 2])
def test_cj_in_find_label_issues_kwargs(filter_by, seed):
    labels = DATA["labels"]
    num_issues = []
    for provide_confident_joint in [True, False]:
        print(
            f"\nfilter_by: {filter_by} | seed: {seed} | cj_provided: {provide_confident_joint}"
        )
        np.random.seed(seed=seed)
        if provide_confident_joint:
            pred_probs = estimate_cv_predicted_probabilities(X=DATA["X_train"],
                                                             labels=labels,
                                                             seed=seed)
            confident_joint = compute_confident_joint(labels=labels,
                                                      pred_probs=pred_probs)
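            # Pass the precomputed confident joint via find_label_issues_kwargs.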
            cl = CleanLearning(
                find_label_issues_kwargs={
                    "confident_joint": confident_joint,
                    "filter_by": "both",
                    "min_examples_per_class": 1,
                },
                verbose=1,
            )
        else:
            cl = CleanLearning(
                clf=LogisticRegression(random_state=seed),
                find_label_issues_kwargs={
                    "filter_by": "both",
                    "min_examples_per_class": 1,
                },
                verbose=0,
            )
        label_issues_df = cl.find_label_issues(DATA["X_train"], labels=labels)
        label_issues_mask = label_issues_df["is_label_issue"].values
        # Check that the noise matrix was computed from the passed-in confident joint
        cj_reconstruct = (cl.inverse_noise_matrix *
                          np.bincount(DATA["labels"])).T.astype(int)
        assert np.all(cl.confident_joint == cj_reconstruct)
        num_issues.append(sum(label_issues_mask))

    # Check that exactly the same number of issues is found whether the
    # confident joint is computed inside find_label_issues() or precomputed
    # and provided via find_label_issues_kwargs.
    assert num_issues[0] == num_issues[1]


@pytest.mark.parametrize("sparse", [True, False])
def test_score(sparse):
    data = SPARSE_DATA if sparse else DATA
    phrase = "cleanlab is dope"

    class Struct:
        def fit(self):
            pass

        def predict_proba(self):
            pass

        def predict(self):
            pass

        def score(self, X, y):
            return phrase

    cl = CleanLearning(clf=Struct())
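    # CleanLearning.score() should delegate to the wrapped classifier's own
    # score() method when one is defined.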
    score = cl.score(data["X_test"], data["true_labels_test"])
    assert score == phrase


@pytest.mark.parametrize("sparse", [True, False])
def test_no_fit_sample_weight(sparse):
    data = SPARSE_DATA if sparse else DATA

    class Struct:
        def fit(self, X, y):
            pass

        def predict_proba(self):
            pass

        def predict(self, X):
            return data["true_labels_test"]

    n = np.shape(data["true_labels_test"])[0]
    m = len(np.unique(data["true_labels_test"]))
    pred_probs = np.zeros(shape=(n, m))
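    # Struct.fit() does not accept a sample_weight argument; CleanLearning
    # should detect this and fit without passing sample weights.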
    cl = CleanLearning(clf=Struct())
    cl.fit(
        data["X_train"],
        data["true_labels_train"],
        pred_probs=pred_probs,
        noise_matrix=data["noise_matrix"],
    )


def test_clf_fit_nm():
    cl = CleanLearning()
    # Example of a bad noise matrix (impossible to learn from)
    nm = np.array([[0, 1], [1, 0]])
    # A zero-trace noise matrix is invalid and should raise an error.
    with pytest.raises(Exception, match=r"Trace\(noise_matrix\)"):
        cl.fit(X=np.arange(3), labels=np.array([0, 0, 1]), noise_matrix=nm)


@pytest.mark.parametrize("sparse", [True, False])
def test_pred_and_pred_proba(sparse):
    data = SPARSE_DATA if sparse else DATA
    cl = CleanLearning()
    cl.fit(data["X_train"], data["labels"])
    n = np.shape(data["true_labels_test"])[0]
    m = len(np.unique(data["true_labels_test"]))
    pred = cl.predict(data["X_test"])
    probs = cl.predict_proba(data["X_test"])
    # Just check that these functions return outputs of the expected shape.
    assert np.shape(pred)[0] == n
    assert np.shape(probs) == (n, m)


def test_sklearn_gridsearchcv():
    # hyper-parameters for grid search
    param_grid = {
        "find_label_issues_kwargs": [
            {"filter_by": "prune_by_noise_rate"},
            {"filter_by": "prune_by_class"},
            {"filter_by": "both"},
            {"filter_by": "confident_learning"},
            {"filter_by": "predicted_neq_given"},
        ],
        "converge_latent_estimates": [True, False],
    }
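    # CleanLearning follows the sklearn estimator API, so it can be tuned
    # with GridSearchCV like any other estimator.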

    clf = LogisticRegression(random_state=0, solver="lbfgs", multi_class="auto")

    cv = GridSearchCV(
        estimator=CleanLearning(clf),
        param_grid=param_grid,
        cv=3,
    )

    # cv.fit() raises a warning if some fits fail (including raising
    # exceptions); we don't expect any fits to fail, so ensure that the code
    # doesn't raise any warnings
    with warnings.catch_warnings(record=True) as record:
        cv.fit(X=DATA["X_train"], y=DATA["labels"])
    assert len(record) == 0, "expected no warnings"
def test_cl(data):
    cl = CleanLearning(clf=LogisticRegression(
        multi_class="auto", solver="lbfgs", random_state=SEED))
    cl.fit(data["X_train"], data["labels"])
    score = cl.score(data["X_test"], data["true_labels_test"])
    print(score)


def test_aux_inputs():
    data = DATA
    K = len(np.unique(data["labels"]))
    confident_joint = np.ones(shape=(K, K))
    np.fill_diagonal(confident_joint, 10)
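    # A hand-crafted confident joint with a dominant diagonal (i.e. most
    # given labels assumed correct).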
    find_label_issues_kwargs = {
        "confident_joint": confident_joint,
        "min_examples_per_class": 2,
    }
    cl = CleanLearning(
        clf=LogisticRegression(multi_class="auto",
                               solver="lbfgs",
                               random_state=SEED),
        find_label_issues_kwargs=find_label_issues_kwargs,
        verbose=1,
    )
    label_issues_df = cl.find_label_issues(data["X_train"],
                                           data["labels"],
                                           clf_kwargs={})
    assert isinstance(label_issues_df, pd.DataFrame)
    FIND_OUTPUT_COLUMNS = [
        "is_label_issue", "label_quality", "given_label", "predicted_label"
    ]
    assert list(label_issues_df.columns) == FIND_OUTPUT_COLUMNS
    assert label_issues_df.equals(cl.get_label_issues())
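    # Refit, reusing the label issues already identified above.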
    cl.fit(
        data["X_train"],
        data["labels"],
        label_issues=label_issues_df,
        clf_kwargs={},
        clf_final_kwargs={},
    )
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    assert list(label_issues_df.columns) == (FIND_OUTPUT_COLUMNS +
                                             ["sample_weight"])
    score = cl.score(data["X_test"], data["true_labels_test"])

    # Test a second fit
    cl.fit(data["X_train"], data["labels"])

    # Test cl.find_label_issues with pred_probs input
    pred_probs_test = cl.predict_proba(data["X_test"])
    label_issues_df = cl.find_label_issues(X=None,
                                           labels=data["true_labels_test"],
                                           pred_probs=pred_probs_test)
    assert isinstance(label_issues_df, pd.DataFrame)
    assert list(label_issues_df.columns) == FIND_OUTPUT_COLUMNS
    assert label_issues_df.equals(cl.get_label_issues())
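    # save_space() should drop the cached label-issues DataFrame to free memory.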
    cl.save_space()
    assert cl.label_issues_df is None

    # Verbose off
    cl = CleanLearning(clf=LogisticRegression(multi_class="auto",
                                              solver="lbfgs",
                                              random_state=SEED),
                       verbose=0)
    cl.save_space()  # dummy call: nothing cached yet, should be a no-op

    cl = CleanLearning(clf=LogisticRegression(multi_class="auto",
                                              solver="lbfgs",
                                              random_state=SEED),
                       verbose=0)
    cl.find_label_issues(labels=data["true_labels_test"],
                         pred_probs=pred_probs_test,
                         save_space=True)

    cl = CleanLearning(clf=LogisticRegression(multi_class="auto",
                                              solver="lbfgs",
                                              random_state=SEED),
                       verbose=1)

    # Test with label_issues_mask input
    label_issues_mask = find_label_issues(
        labels=data["true_labels_test"],
        pred_probs=pred_probs_test,
    )
    cl.fit(data["X_test"],
           data["true_labels_test"],
           label_issues=label_issues_mask)
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    assert set(label_issues_df.columns).issubset(FIND_OUTPUT_COLUMNS)

    # Test with label_issues_indices input
    label_issues_indices = find_label_issues(
        labels=data["true_labels_test"],
        pred_probs=pred_probs_test,
        return_indices_ranked_by="confidence_weighted_entropy",
    )
    cl.fit(data["X_test"],
           data["true_labels_test"],
           label_issues=label_issues_indices)
    label_issues_df2 = cl.get_label_issues().copy()
    assert isinstance(label_issues_df2, pd.DataFrame)
    assert set(label_issues_df2.columns).issubset(FIND_OUTPUT_COLUMNS)
    assert label_issues_df2["is_label_issue"].equals(
        label_issues_df["is_label_issue"])

    # Test fit() with pred_probs input:
    cl.fit(
        data["X_test"],
        data["true_labels_test"],
        pred_probs=pred_probs_test,
        label_issues=label_issues_mask,
    )
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    assert set(label_issues_df.columns).issubset(FIND_OUTPUT_COLUMNS)
    assert "label_quality" in label_issues_df.columns
def test_seed():
    cl = CleanLearning(seed=SEED)
    assert cl.seed is not None


def test_default_clf():
    cl = CleanLearning()
    check1 = cl.clf is not None and hasattr(cl.clf, "fit")
    check2 = hasattr(cl.clf, "predict") and hasattr(cl.clf, "predict_proba")
    assert check1 and check2