def test_clf_fit_nm_inm(sparse):
    data = SPARSE_DATA if sparse else DATA
    cl = CleanLearning(seed=SEED)
    nm = data["noise_matrix"]
    inm = compute_inv_noise_matrix(
        py=data["py"],
        noise_matrix=nm,
        ps=data["ps"],
    )
    cl.fit(
        X=data["X_train"],
        labels=data["labels"],
        noise_matrix=nm,
        inverse_noise_matrix=inm,
    )
    score_nm_inm = cl.score(data["X_test"], data["true_labels_test"])

    # Learn with noisy labels and estimate the inv noise matrix.
    cl2 = CleanLearning(seed=SEED)
    cl2.fit(
        data["X_train"],
        data["labels"],
    )
    score = cl2.score(data["X_test"], data["true_labels_test"])
    assert score < score_nm_inm + 1e-4
def test_fit_with_inm(
    sparse,
    seed=SEED,
    used_by_another_test=False,
):
    data = SPARSE_DATA if sparse else DATA
    cl = CleanLearning(seed=seed, )
    inm = compute_inv_noise_matrix(
        py=data["py"],
        noise_matrix=data["noise_matrix"],
        ps=data["ps"],
    )
    # Learn with noisy labels with inverse noise matrix given
    cl.fit(data["X_train"], data["labels"], inverse_noise_matrix=inm)
    score_inm = cl.score(data["X_test"], data["true_labels_test"])
    # Learn with noisy labels and estimate the inv noise matrix.
    cl2 = CleanLearning(seed=seed, )
    cl2.fit(
        data["X_train"],
        data["labels"],
    )
    score = cl2.score(data["X_test"], data["true_labels_test"])
    if used_by_another_test:
        return score, score_inm
    else:
        assert score < score_inm + 1e-4
def test_dimN(N):
    cl = CleanLearning(clf=ReshapingLogisticRegression())
    size = [100] + [3 for _ in range(N - 1)]
    X = np.random.normal(size=size)
    labels = np.random.randint(0, 4, size=100)
    # ensure that every class is represented
    labels[0:10] = 0
    labels[11:20] = 1
    labels[21:30] = 2
    labels[31:40] = 3
    # just make sure we don't crash...
    cl.fit(X, labels)
    cl.predict(X)
    cl.predict_proba(X)
    cl.score(X, labels)
def test_fit_pred_probs(sparse):
    data = SPARSE_DATA if sparse else DATA

    cl = CleanLearning()
    pred_probs = estimate_cv_predicted_probabilities(
        X=data["X_train"],
        labels=data["true_labels_train"],
    )
    cl.fit(X=data["X_train"],
           labels=data["true_labels_train"],
           pred_probs=pred_probs)
    score_with_pred_probs = cl.score(data["X_test"], data["true_labels_test"])
    cl = CleanLearning()
    cl.fit(
        X=data["X_train"],
        labels=data["true_labels_train"],
    )
    score_no_pred_probs = cl.score(data["X_test"], data["true_labels_test"])
    assert abs(score_with_pred_probs - score_no_pred_probs) < 0.01
def test_no_score(sparse):
    data = SPARSE_DATA if sparse else DATA

    class Struct:
        def fit(self):
            pass

        def predict_proba(self):
            pass

        def predict(self, X):
            return data["true_labels_test"]

    cl = CleanLearning(clf=Struct())
    score = cl.score(data["X_test"], data["true_labels_test"])
    assert abs(score - 1) < 1e-6
def test_score(sparse):
    data = SPARSE_DATA if sparse else DATA
    phrase = "cleanlab is dope"

    class Struct:
        def fit(self):
            pass

        def predict_proba(self):
            pass

        def predict(self):
            pass

        def score(self, X, y):
            return phrase

    cl = CleanLearning(clf=Struct())
    score = cl.score(data["X_test"], data["true_labels_test"])
    assert score == phrase
def test_aux_inputs():
    data = DATA
    K = len(np.unique(data["labels"]))
    confident_joint = np.ones(shape=(K, K))
    np.fill_diagonal(confident_joint, 10)
    find_label_issues_kwargs = {
        "confident_joint": confident_joint,
        "min_examples_per_class": 2,
    }
    cl = CleanLearning(
        clf=LogisticRegression(multi_class="auto",
                               solver="lbfgs",
                               random_state=SEED),
        find_label_issues_kwargs=find_label_issues_kwargs,
        verbose=1,
    )
    label_issues_df = cl.find_label_issues(data["X_train"],
                                           data["labels"],
                                           clf_kwargs={})
    assert isinstance(label_issues_df, pd.DataFrame)
    FIND_OUTPUT_COLUMNS = [
        "is_label_issue", "label_quality", "given_label", "predicted_label"
    ]
    assert list(label_issues_df.columns) == FIND_OUTPUT_COLUMNS
    assert label_issues_df.equals(cl.get_label_issues())
    cl.fit(
        data["X_train"],
        data["labels"],
        label_issues=label_issues_df,
        clf_kwargs={},
        clf_final_kwargs={},
    )
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    assert list(label_issues_df.columns) == (FIND_OUTPUT_COLUMNS +
                                             ["sample_weight"])
    score = cl.score(data["X_test"], data["true_labels_test"])

    # Test a second fit
    cl.fit(data["X_train"], data["labels"])

    # Test cl.find_label_issues with pred_prob input
    pred_probs_test = cl.predict_proba(data["X_test"])
    label_issues_df = cl.find_label_issues(X=None,
                                           labels=data["true_labels_test"],
                                           pred_probs=pred_probs_test)
    assert isinstance(label_issues_df, pd.DataFrame)
    assert list(label_issues_df.columns) == FIND_OUTPUT_COLUMNS
    assert label_issues_df.equals(cl.get_label_issues())
    cl.save_space()
    assert cl.label_issues_df is None

    # Verbose off
    cl = CleanLearning(clf=LogisticRegression(multi_class="auto",
                                              solver="lbfgs",
                                              random_state=SEED),
                       verbose=0)
    cl.save_space()  # dummy call test

    cl = CleanLearning(clf=LogisticRegression(multi_class="auto",
                                              solver="lbfgs",
                                              random_state=SEED),
                       verbose=0)
    cl.find_label_issues(labels=data["true_labels_test"],
                         pred_probs=pred_probs_test,
                         save_space=True)

    cl = CleanLearning(clf=LogisticRegression(multi_class="auto",
                                              solver="lbfgs",
                                              random_state=SEED),
                       verbose=1)

    # Test with label_issues_mask input
    label_issues_mask = find_label_issues(
        labels=data["true_labels_test"],
        pred_probs=pred_probs_test,
    )
    cl.fit(data["X_test"],
           data["true_labels_test"],
           label_issues=label_issues_mask)
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    assert set(label_issues_df.columns).issubset(FIND_OUTPUT_COLUMNS)

    # Test with label_issues_indices input
    label_issues_indices = find_label_issues(
        labels=data["true_labels_test"],
        pred_probs=pred_probs_test,
        return_indices_ranked_by="confidence_weighted_entropy",
    )
    cl.fit(data["X_test"],
           data["true_labels_test"],
           label_issues=label_issues_indices)
    label_issues_df2 = cl.get_label_issues().copy()
    assert isinstance(label_issues_df2, pd.DataFrame)
    assert set(label_issues_df2.columns).issubset(FIND_OUTPUT_COLUMNS)
    assert label_issues_df2["is_label_issue"].equals(
        label_issues_df["is_label_issue"])

    # Test fit() with pred_prob input:
    cl.fit(
        data["X_test"],
        data["true_labels_test"],
        pred_probs=pred_probs_test,
        label_issues=label_issues_mask,
    )
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    assert set(label_issues_df.columns).issubset(FIND_OUTPUT_COLUMNS)
    assert "label_quality" in label_issues_df.columns
def test_cl(data):
    cl = CleanLearning(clf=LogisticRegression(
        multi_class="auto", solver="lbfgs", random_state=SEED))
    cl.fit(data["X_train"], data["labels"])
    score = cl.score(data["X_test"], data["true_labels_test"])
    print(score)