def test_cj_in_find_label_issues_kwargs(filter_by, seed):
    """Check that a precomputed confident joint gives the same label issues.

    ``CleanLearning.find_label_issues`` is run twice on the training data:
    once with a confident joint computed up front and passed in through
    ``find_label_issues_kwargs``, and once letting ``CleanLearning`` compute
    it internally. Both runs must flag the same number of label issues, and
    in each run the stored ``inverse_noise_matrix`` must be consistent with
    the stored ``confident_joint``.

    Args:
        filter_by: Value supplied by the test parametrization.
            NOTE(review): it is only echoed in the printed message; both
            runs hard-code ``"filter_by": "both"``. Confirm whether the
            parametrized value was meant to be forwarded.
        seed: Seed for NumPy's global RNG, the cross-validated probability
            estimation, and the ``LogisticRegression`` random state.
    """
    labels = DATA["labels"]
    # Per-class counts of the given labels; invariant across both runs.
    label_counts = np.bincount(labels)
    num_issues = []

    for provide_confident_joint in [True, False]:
        print(
            f"\nfilter_by: {filter_by} | seed: {seed} | cj_provided: {provide_confident_joint}"
        )
        np.random.seed(seed=seed)

        if provide_confident_joint:
            pred_probs = estimate_cv_predicted_probabilities(
                X=DATA["X_train"], labels=labels, seed=seed
            )
            confident_joint = compute_confident_joint(labels=labels, pred_probs=pred_probs)
            cl = CleanLearning(
                find_label_issues_kwargs={
                    "confident_joint": confident_joint,
                    "filter_by": "both",
                    "min_examples_per_class": 1,
                },
                verbose=1,
            )
        else:
            cl = CleanLearning(
                clf=LogisticRegression(random_state=seed),
                find_label_issues_kwargs={
                    "filter_by": "both",
                    "min_examples_per_class": 1,
                },
                verbose=0,
            )

        label_issues_df = cl.find_label_issues(DATA["X_train"], labels=labels)
        label_issues_mask = label_issues_df["is_label_issue"].values

        # Check that the noise matrix was computed from the confident joint in
        # use. The product is rounded before the integer cast so that float
        # round-off (e.g. 2.9999999) cannot truncate a count by one.
        cj_reconstruct = np.rint(cl.inverse_noise_matrix * label_counts).T.astype(int)
        assert np.array_equal(cl.confident_joint, cj_reconstruct), (
            "inverse_noise_matrix is inconsistent with confident_joint "
            f"(cj_provided={provide_confident_joint})"
        )

        num_issues.append(int(label_issues_mask.sum()))

    # Check that the exact same number of issues is found regardless of whether
    # the confident joint is computed during find_label_issues or precomputed
    # and provided as a kwargs parameter.
    assert num_issues[0] == num_issues[1]
def test_aux_inputs():
    """Exercise CleanLearning with the various auxiliary inputs it accepts.

    Walks one CleanLearning object through find_label_issues(), fit(),
    score(), predict_proba() and save_space(), checking after each step the
    type and columns of the label-issues DataFrame. It then repeats fit()
    with label issues given as the output of find_label_issues() (default
    form and ranked-indices form) and with precomputed pred_probs.

    The steps share the stateful ``cl`` object, so statement order matters.
    """
    data = DATA
    K = len(np.unique(data["labels"]))
    # Hand-built K x K confident joint: 10 on the diagonal, 1 elsewhere.
    confident_joint = np.ones(shape=(K, K))
    np.fill_diagonal(confident_joint, 10)
    find_label_issues_kwargs = {
        "confident_joint": confident_joint,
        "min_examples_per_class": 2,
    }
    cl = CleanLearning(
        clf=LogisticRegression(multi_class="auto", solver="lbfgs", random_state=SEED),
        find_label_issues_kwargs=find_label_issues_kwargs,
        verbose=1,
    )
    label_issues_df = cl.find_label_issues(data["X_train"], data["labels"], clf_kwargs={})
    assert isinstance(label_issues_df, pd.DataFrame)
    # Exact column order expected from find_label_issues().
    FIND_OUTPUT_COLUMNS = [
        "is_label_issue", "label_quality", "given_label", "predicted_label"
    ]
    assert list(label_issues_df.columns) == FIND_OUTPUT_COLUMNS
    # The returned frame must match what the object reports via get_label_issues().
    assert label_issues_df.equals(cl.get_label_issues())
    # Fit while passing the previously found label issues back in.
    cl.fit(
        data["X_train"],
        data["labels"],
        label_issues=label_issues_df,
        clf_kwargs={},
        clf_final_kwargs={},
    )
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    # After this fit() the frame is expected to carry an extra "sample_weight" column.
    assert list(label_issues_df.columns) == (FIND_OUTPUT_COLUMNS + ["sample_weight"])
    # The returned score is never inspected; the call only has to succeed.
    score = cl.score(data["X_test"], data["true_labels_test"])
    # Test a second fit
    cl.fit(data["X_train"], data["labels"])
    # Test cl.find_label_issues with pred_prob input
    pred_probs_test = cl.predict_proba(data["X_test"])
    label_issues_df = cl.find_label_issues(X=None, labels=data["true_labels_test"], pred_probs=pred_probs_test)
    assert isinstance(label_issues_df, pd.DataFrame)
    assert list(label_issues_df.columns) == FIND_OUTPUT_COLUMNS
    assert label_issues_df.equals(cl.get_label_issues())
    # save_space() is expected to clear the stored label-issues frame.
    cl.save_space()
    assert cl.label_issues_df is None
    # Verbose off
    cl = CleanLearning(clf=LogisticRegression(multi_class="auto", solver="lbfgs", random_state=SEED), verbose=0)
    cl.save_space()  # dummy call test
    # Fresh object: find_label_issues() from pred_probs only, with save_space=True.
    cl = CleanLearning(clf=LogisticRegression(multi_class="auto", solver="lbfgs", random_state=SEED), verbose=0)
    cl.find_label_issues(labels=data["true_labels_test"], pred_probs=pred_probs_test, save_space=True)
    cl = CleanLearning(clf=LogisticRegression(multi_class="auto", solver="lbfgs", random_state=SEED), verbose=1)
    # Test with label_issues_mask input
    label_issues_mask = find_label_issues(
        labels=data["true_labels_test"],
        pred_probs=pred_probs_test,
    )
    cl.fit(data["X_test"], data["true_labels_test"], label_issues=label_issues_mask)
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    # Only a subset of the find_label_issues() columns is required here.
    assert set(label_issues_df.columns).issubset(FIND_OUTPUT_COLUMNS)
    # Test with label_issues_indices input
    label_issues_indices = find_label_issues(
        labels=data["true_labels_test"],
        pred_probs=pred_probs_test,
        return_indices_ranked_by="confidence_weighted_entropy",
    )
    cl.fit(data["X_test"], data["true_labels_test"], label_issues=label_issues_indices)
    # Copy so a later fit() on the same object cannot alter this snapshot.
    label_issues_df2 = cl.get_label_issues().copy()
    assert isinstance(label_issues_df2, pd.DataFrame)
    assert set(label_issues_df2.columns).issubset(FIND_OUTPUT_COLUMNS)
    # Both forms of the label_issues input must flag the same examples.
    assert label_issues_df2["is_label_issue"].equals(
        label_issues_df["is_label_issue"])
    # Test fit() with pred_prob input:
    cl.fit(
        data["X_test"],
        data["true_labels_test"],
        pred_probs=pred_probs_test,
        label_issues=label_issues_mask,
    )
    label_issues_df = cl.get_label_issues()
    assert isinstance(label_issues_df, pd.DataFrame)
    assert set(label_issues_df.columns).issubset(FIND_OUTPUT_COLUMNS)
    # With pred_probs supplied to fit(), a "label_quality" column is expected.
    assert "label_quality" in label_issues_df.columns