def complete_multi_task(idx):
    """Worker task: corrupt the shared complete dataset at the idx-th
    missing-ratio level, then score the ``complete_by_multi`` imputer.

    Reads the module-level ``data_complete`` and ``random_ratios``;
    returns whatever ``test_imputation`` produces for this run.
    """
    global data_complete
    corrupted = gen_complete_random(
        data_complete,
        random_ratio=random_ratios[idx],
        print_all=False,
    )
    return test_imputation(
        corrupted,
        complete_by_multi,
        multi=True,
        verboseID="multi_v1",
    )
def complete_multi_v2_task(idx):
    """Worker task: same as ``complete_multi_task`` but evaluates the v2
    imputer, binding the configured target feature via ``partial``.

    Reads the module-level ``data_complete``, ``random_ratios`` and
    ``PARAMS_DATA``; returns the ``test_imputation`` result.
    """
    global data_complete, PARAMS_DATA
    corrupted = gen_complete_random(
        data_complete,
        random_ratio=random_ratios[idx],
        print_all=False,
    )
    imputer = partial(complete_by_multi_v2, target_feature=PARAMS_DATA["target"])
    return test_imputation(
        corrupted,
        imputer,
        multi=True,
        verboseID="multi_v2",
    )
def cross_val(data_original: Dataset, data_config, clf_config, complete_function=None, selected_cols=None):
    """Repeated stratified cross-validation of a logistic-regression model,
    optionally injecting missing values and running an imputation step.

    For each of 10 outer repetitions the data is (optionally) corrupted with
    40% missing-completely-at-random values, then split 20 times with
    StratifiedShuffleSplit. Each split trains LogisticRegression on
    SMOTE-resampled, standardized features (protected features excluded) and
    evaluates accuracy plus a fairness ``newBias`` score computed from the
    confusion matrices of the two protected groups A and B.

    Args:
        data_original: Dataset wrapper exposing ``X`` (DataFrame), ``y``,
            ``types``, ``protected_features``, ``categorical_features``,
            ``X_encoders`` and ``y_encoder``.
        data_config: dict with keys ``"target"`` (protected column name) and
            ``"A"``/``"B"`` (the two group values compared for bias).
        clf_config: dict with LogisticRegression hyper-parameters
            ``"max_iter"``, ``"C"`` and ``"tol"``.
        complete_function: optional callable Dataset -> Dataset that imputes
            missing values; when None, no corruption/imputation is done.
        selected_cols: columns eligible for missing-value injection
            (defaults to an empty list; was a mutable default argument).

    Returns:
        Tuple ``(mean_bias, mean_accuracy)`` averaged over all repetitions.
    """
    # Fix: avoid the mutable default argument ([]) shared across calls.
    if selected_cols is None:
        selected_cols = []
    bias = []
    acc = []
    smote = SMOTE()
    scaler = StandardScaler()
    for i in range(10):
        if complete_function:
            data = gen_complete_random(data_original, random_ratio=0.4, selected_cols=selected_cols)
        else:
            data = data_original
        print("Running Cross Validation {}".format(i))
        bias_cv = []
        acc_cv = []
        for train_idx, test_idx in StratifiedShuffleSplit(n_splits=20).split(data.X, data.y):
            X_train, X_test = data.X.iloc[train_idx].copy(), data.X.iloc[test_idx].copy()
            Y_train, Y_test = data.y[train_idx], data.y[test_idx]
            X_train.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)
            if complete_function:
                # Impute the training fold; skip the split on any failure or
                # if the imputer leaves NaNs behind (best-effort evaluation).
                data_incomplete = Dataset("tmp", X_train, Y_train, types=data.types,
                                          protected_features=data.protected_features,
                                          categorical_features=data.categorical_features,
                                          encoders=[data.X_encoders, data.y_encoder])
                try:
                    data_complete = complete_function(data_incomplete)
                except Exception as e:
                    print("Error: {}. Skipped".format(e))
                    continue
                if data_complete.X.isnull().sum().sum() > 0:
                    print("Complete function error, skipped")
                    continue
                X_train = data_complete.X.copy()
                Y_train = data_complete.y.copy()
            # Protected features are never used as model inputs.
            X_train.drop(columns=data.protected_features, inplace=True)
            if complete_function:
                # Same imputation pass for the test fold.
                data_incomplete = Dataset("tmp", X_test, Y_test, types=data.types,
                                          protected_features=data.protected_features,
                                          categorical_features=data.categorical_features,
                                          encoders=[data.X_encoders, data.y_encoder])
                try:
                    data_complete = complete_function(data_incomplete)
                except Exception as e:
                    print("Error: {}. Skipped".format(e))
                    continue
                if data_complete.X.isnull().sum().sum() > 0:
                    print("Complete function error, skipped")
                    continue
                X_test = data_complete.X.copy()
                Y_test = data_complete.y.copy()
            # Balance classes on the training fold only, then standardize.
            X_train_res, Y_train_res = smote.fit_resample(X_train, Y_train)
            X_scaled = scaler.fit_transform(X_train_res)
            clf = LogisticRegression(max_iter=clf_config["max_iter"], C=clf_config["C"], tol=clf_config["tol"])
            clf.fit(X_scaled, Y_train_res)
            # Scale test features with the train-fitted scaler, then re-attach
            # the (unscaled) protected columns for group selection below.
            X_test_scaled = pd.DataFrame(
                scaler.transform(X_test.drop(columns=data.protected_features)),
                columns=X_test.drop(columns=data.protected_features).columns)
            X_test_scaled = pd.concat([X_test_scaled, X_test[data.protected_features]], axis=1)
            X_test_A = X_test_scaled[X_test_scaled[data_config["target"]] == data_config["A"]].drop(columns=data.protected_features).to_numpy()
            X_test_B = X_test_scaled[X_test_scaled[data_config["target"]] == data_config["B"]].drop(columns=data.protected_features).to_numpy()
            Y_test_A = Y_test[X_test_scaled[X_test_scaled[data_config["target"]] == data_config["A"]].index.tolist()]
            Y_test_B = Y_test[X_test_scaled[X_test_scaled[data_config["target"]] == data_config["B"]].index.tolist()]
            matrix_A = confusion_matrix(Y_test_A, clf.predict(X_test_A)).ravel().tolist()
            matrix_B = confusion_matrix(Y_test_B, clf.predict(X_test_B)).ravel().tolist()
            try:
                bias_cv.append(newBias(matrix_A + matrix_B))
            except Exception as e:
                # Bias may be undefined (e.g. a group absent from this split);
                # accuracy is still recorded for the split.
                print("\tError: {}, skipped".format(e))
            # Fix: accuracy_score signature is (y_true, y_pred) — the original
            # passed predictions first (harmless for accuracy, but incorrect order).
            acc_cv.append(accuracy_score(Y_test, clf.predict(X_test_scaled.drop(columns=data.protected_features).to_numpy())))
        bias.append(np.mean(bias_cv))
        acc.append(np.mean(acc_cv))
    return (np.mean(bias), np.mean(acc))