def __model_build_noisy(X_train, y_train, X_test, alg, seed):
    """Fit a noisy-label-aware classifier and return predictions on X_test.

    The base estimator is Gaussian naive Bayes unless ``alg == 'Logistic'``,
    in which case logistic regression is used. The base estimator is wrapped
    in cleanlab's LearningWithNoisyLabels before fitting.
    """
    if alg == 'Logistic':
        base = LogisticRegression(multi_class='auto')
    else:
        base = GaussianNB()
    noisy_clf = LearningWithNoisyLabels(clf=base, seed=seed, n_jobs=cpu_count())
    noisy_clf.fit(X_train, y_train)
    return noisy_clf.predict(X_test)
def __model_build_noisy_pseudo(X_train, y_train, X_test, alg, seed):
    """Pseudo-label with a noisy-label-aware classifier, retrain, predict.

    Fits LearningWithNoisyLabels around the chosen base estimator, replaces
    the training labels it flags as noisy with its own predictions,
    pseudo-labels the test set too, then retrains the bare base estimator on
    all (train + test) data and returns its predictions for X_test.
    """
    if alg == 'Logistic':
        base = LogisticRegression(multi_class='auto')
    else:
        base = GaussianNB()
    noisy_clf = LearningWithNoisyLabels(clf=base, seed=seed, n_jobs=cpu_count())
    noisy_clf.fit(X_train, y_train)

    # Overwrite the labels cleanlab marked as noisy with cleaned predictions.
    mask = noisy_clf.noise_mask
    relabeled = y_train.copy()
    relabeled[mask] = noisy_clf.predict(X_train[mask])

    # Pseudo-label the test set as well, then retrain on everything.
    all_labels = np.hstack([relabeled, noisy_clf.predict(X_test)])
    all_features = np.vstack([X_train, X_test])
    base.fit(all_features, all_labels)
    return base.predict(X_test)
def train_test_and_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    """Pseudo-label the noisy train samples AND the test set, retrain, score.

    Parameters
    ----------
    X_train, y_train : training features and (possibly noise-corrupted) labels.
    X_test, y_test : held-out features and clean labels used only for scoring.
    clf : optional pre-built LearningWithNoisyLabels; constructed from the
        module-level ``baseclf``/``params``/``seed`` when None.

    Returns
    -------
    float : ``model.score`` of the retrained base model on (X_test, y_test).
    """
    # NOTE(review): baseclf, params, seed and sp are module-level names —
    # confirm they are in scope wherever this function is called.
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)

    # Turn both the corrupted train samples and the test set into pseudo labels.
    # Bug fix: previously this copied the global ``y_train_corrupted`` instead
    # of the ``y_train`` argument, so the function was wrong for any call that
    # did not pass exactly that global (sibling helpers use y_train.copy()).
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)
    y_test_pseudo = clf.predict(X_test)  # also fixes the 'psuedo' typo
    y_pseudo = np.hstack([y_train_pseudo, y_test_pseudo])
    X_for_pseudo = sp.vstack([X_train, X_test])

    # Retrain the bare base model on all data, pseudo labels included.
    model.fit(X_for_pseudo, y_pseudo)
    return model.score(X_test, y_test)
def test_pred_and_pred_proba():
    """Smoke test: predict/predict_proba outputs have the expected shapes."""
    learner = LearningWithNoisyLabels()
    learner.fit(data['X_train'], data['s'])
    n_samples = np.shape(data['y_test'])[0]
    n_classes = len(np.unique(data['y_test']))
    labels = learner.predict(data['X_test'])
    proba = learner.predict_proba(data['X_test'])
    # Shape checks only — this verifies the API contract, not correctness.
    assert (np.shape(labels)[0] == n_samples)
    assert (np.shape(proba) == (n_samples, n_classes))
def train_noisy_to_pseudo(X_train, y_train, X_test, y_test, clf=None):
    """Pseudo-label ONLY the noisy train samples, retrain on train, score.

    Parameters
    ----------
    X_train, y_train : training features and (possibly noise-corrupted) labels.
    X_test, y_test : held-out features and clean labels used only for scoring.
    clf : optional pre-built LearningWithNoisyLabels; constructed from the
        module-level ``baseclf``/``params``/``seed`` when None.

    Returns
    -------
    float : ``model.score`` of the retrained base model on (X_test, y_test).
    """
    # NOTE(review): baseclf, params and seed are module-level names —
    # confirm they are in scope wherever this function is called.
    model = baseclf(**params)
    if clf is None:
        clf = LearningWithNoisyLabels(clf=model, seed=seed, n_jobs=cpu_count())
    clf.fit(X_train, y_train)

    # Pseudo-label only the train samples flagged as noisy.
    # Bug fix: previously this copied the global ``y_train_corrupted`` instead
    # of the ``y_train`` argument, so the function was wrong for any call that
    # did not pass exactly that global.
    X_with_noise = X_train[clf.noise_mask]
    y_train_pseudo = y_train.copy()
    y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)

    # Retrain the bare base model on the cleaned training data only.
    model.fit(X_train, y_train_pseudo)
    return model.score(X_test, y_test)
X_train = X_train.reset_index(drop=True) A_train = A_train.reset_index(drop=True) X_test = X_test.reset_index(drop=True) A_test = A_test.reset_index(drop=True) # A_test = A_test.map({ 0:"female", 1:"male"}) # flip across different groups Y_noised = flip(Y_train, A_train, error_rate=error_rate) noise_matrix = generate_noise_matrix(Y_noised, Y_train) est_error_rate = estimation(X_train.values, Y_noised, A_train.values, ngroups=2**args.ngroups) print(f"True error rate is {error_rate}.\nEstimated error rate is {est_error_rate}.") # Learning with Noisy Labels lnl = LearningWithNoisyLabels(clf=LogisticRegression()) lnl.fit(X=X_train.values, s=Y_noised, noise_matrix=noise_matrix) Y_lnlt = lnl.predict(X_train.values).astype(int) lnl.fit(X=X_train.values, s=Y_noised) Y_lnle = lnl.predict(X_train.values).astype(int) def run_corrupt(fairness_constraints): all_results = {} all_results['eps'] = fairness_constraints all_results['accuracy'] = { 'train': [], 'test': [] } all_results['violation'] = { 'train': [], 'test': []
# In[4]: print('WITHOUT confident learning,', end=" ") clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000) _ = clf.fit(X_train, s) pred = clf.predict(X_test) print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2)) print("\nNow we show improvement using cleanlab to characterize the noise") print( "and learn on the data that is (with high confidence) labeled correctly.") print() print('WITH confident learning (noise matrix given),', end=" ") _ = rp.fit(X_train, s, noise_matrix=noise_matrix) pred = rp.predict(X_test) print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2)) print('WITH confident learning (noise / inverse noise matrix given),', end=" ") inv = compute_inv_noise_matrix(py, noise_matrix) _ = rp.fit(X_train, s, noise_matrix=noise_matrix, inverse_noise_matrix=inv) pred = rp.predict(X_test) print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2)) print('WITH confident learning noise not given,', end=" ") clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000) rp = LearningWithNoisyLabels(clf=clf, seed=seed) _ = rp.fit(X_train, s) pred = rp.predict(X_test) print("Iris dataset test accuracy:", round(accuracy_score(pred, y_test), 2))
# original lr f1 print('WITHOUT confident learning,', end=" ") clf.fit(X_train, s) pred = clf.predict(X_test) print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4)) print("\nNow we show improvement using cleanlab to characterize the noise") print( "and learn on the data that is (with high confidence) labeled correctly.") print() print('WITH confident learning (psx not given),', end=" ") rp = LearningWithNoisyLabels(clf=clf) rp.fit(X_train, s) pred = rp.predict(X_test) print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4)) print('WITH confident learning (psx given),', end=" ") rp.fit(X=X_train, s=s, psx=psx) pred = rp.predict(X_test) print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4)) print('WITH all label right,', end=" ") clf.fit(X_train, y_train) pred = clf.predict(X_test) print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4)) print("-------------------") rp_score = f1_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test),
def denoiseA(data_cor, rho, mode):
    '''
    Denoise the corrupted sensitive attribute using RankPrune.

    Parameters
    ----------
    data_cor : indexable container; index 0 holds the features (has a
        ``.values`` attribute, e.g. a DataFrame) and index 2 the corrupted
        sensitive attribute (e.g. a Series).
    rho : pair (rho_a_plus, rho_a_minus) of known flip rates used to build
        the 2x2 noise matrix handed to cleanlab.
    mode : when equal to 'six', additionally denoise using noise rates
        *estimated* by cleanlab (no noise matrix given) and return them.

    Returns
    -------
    (data_denoised, data_denoised_est, rho_est) : the denoised copy of
        data_cor, plus the estimate-based copy and estimated rates
        (both None unless mode == 'six').
    '''
    rho_a_plus, rho_a_minus = rho
    dataX = data_cor[0]
    cor_dataA = data_cor[2]
    # dataA = data_cor[5] #
    # auc3, auc4 = None, None

    # Column-stochastic noise matrix: entry [i][j] is P(observed=i | true=j).
    noise_matrix = np.array([[1 - rho_a_minus, rho_a_plus],
                             [rho_a_minus, 1 - rho_a_plus]])
    # noise_matrix = None

    # Denoise with the KNOWN noise rates supplied via noise_matrix.
    lnl = LearningWithNoisyLabels(clf=LogisticRegression(
        random_state=0, solver='lbfgs', multi_class='auto'))
    lnl.fit(X=dataX.values, s=cor_dataA.values, noise_matrix=noise_matrix)
    # Logistic Regression Baseline
    # lnl = clf=LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
    # lnl.fit(X = dataX.values, y = cor_dataA.values)
    denoised_dataA = pd.Series(lnl.predict(dataX.values))

    # Deep copy so the caller's data_cor is left untouched.
    data_denoised = copy.deepcopy(data_cor)
    data_denoised[2] = denoised_dataA
    # print(lnl.noise_matrix, rho_a_plus, rho_a_minus)
    # Check recovery accuracy
    # auc1 = np.mean(dataA.values==cor_dataA.values)
    # auc2 = np.mean(dataA.values==denoised_dataA.values)

    # The following is under development.
    rho_est = None
    data_denoised_est = None
    if mode == 'six':
        # Denoise again WITHOUT a noise matrix: cleanlab estimates the rates.
        lnl2 = LearningWithNoisyLabels(
            LogisticRegression(random_state=0, solver='lbfgs',
                               multi_class='auto'))
        lnl2.fit(X=dataX.values, s=cor_dataA.values)
        denoised_dataA_est = pd.Series(lnl2.predict(dataX.values))
        data_denoised_est = copy.deepcopy(data_cor)
        data_denoised_est[2] = denoised_dataA_est
        # Read the estimated flip rates off the fitted noise matrix
        # (off-diagonal entries, mirroring the construction above).
        rho_a_plus_est = lnl2.noise_matrix[0][1]
        rho_a_minus_est = lnl2.noise_matrix[1][0]
        rho_est = [rho_a_plus_est, rho_a_minus_est]
        # print(lnl2.noise_matrix, rho_a_plus_est, rho_a_minus_est)
        # lnl3 = LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto')
        # lnl3.fit(dataX.values, cor_dataA.values)
        # pred_dataA = pd.Series(lnl3.predict(dataX.values))
        # auc3 = np.mean(dataA.values==denoised_dataA_est.values)
        # auc4 = np.mean(dataA.values==pred_dataA.values)
    # print('auc:', auc1, auc2, auc3, auc4)
    return data_denoised, data_denoised_est, rho_est