def create_binary_dataset_sb():
    """Build an AIF360 binary-label dataset from the signing-bonus CSV.

    Reads ``../company_x_sb.csv`` (indexed by ``employee_id``), encodes
    ``sex`` as 1 for 'M' and 0 otherwise, one-hot encodes the categorical
    columns via ``StandardDataset``, and wraps the result in a
    ``BinaryLabelDataset`` labelled by ``new_signing_bonus``.

    Returns:
        BinaryLabelDataset: dataset with one-hot encoded categorical columns.
    """
    raw = pd.read_csv('../company_x_sb.csv', index_col='employee_id')
    encoded = raw.copy()
    # 'M' -> 1, anything else -> 0; 1 is the privileged class below.
    encoded['sex'] = encoded['sex'].transform(lambda v: v == 'M').astype(int)
    standardized = StandardDataset(df=encoded,
                                   label_name='new_signing_bonus',
                                   favorable_classes=[1],
                                   protected_attribute_names=['sex'],
                                   privileged_classes=[[1]],
                                   categorical_features=['degree_level', 'dept'],
                                   features_to_drop=['boss_id'])
    # convert_to_dataframe() returns (dataframe, attributes); only the
    # dataframe is needed here.
    frame, _ = standardized.convert_to_dataframe()
    return BinaryLabelDataset(favorable_label=1,
                              unfavorable_label=0,
                              df=frame,
                              label_names=['new_signing_bonus'],
                              protected_attribute_names=['sex'])
def main(argv):
    """Run the adult-income fairness experiment.

    Trains a random-forest baseline on the UCI Adult data, reports group
    performance / demographic-parity / equalized-odds metrics, then applies
    Kamiran & Calders reweighing and evaluates a logistic-regression model,
    printing its demographic parity difference.
    """
    train_df = name_columns(pd.read_csv(r"adults_dataset/adult_train.csv"))
    test_df = name_columns(pd.read_csv(r"adults_dataset/adult_test.csv"))
    train_df = data_preprocessing(train_df)
    test_df = data_preprocessing(test_df)
    # fig_proportion_of_rich(df_test, argv[1], False)
    train_encoded = one_hot_encoding(train_df)
    test_encoded = one_hot_encoding(test_df)
    normalization(train_encoded)
    normalization(test_encoded)
    samples = split_samples(train_encoded, test_encoded)

    # Baseline model and fairness diagnostics.
    model = random_forest_classifier(samples)
    predictions = predict(model, samples, False)
    # proportion_of_rich(argv[2], samples, predictions, False)
    gender_performance(test_encoded, predictions)
    demographic_parity(test_encoded, predictions)
    equalized_odds(test_encoded, predictions)

    # Kamiran and Calders reweighing.
    train_sds = StandardDataset(train_encoded,
                                label_name="earnings",
                                favorable_classes=[1],
                                protected_attribute_names=["sex"],
                                privileged_classes=[[1]])
    test_sds = StandardDataset(test_encoded,
                               label_name="earnings",
                               favorable_classes=[1],
                               protected_attribute_names=["sex"],
                               privileged_classes=[[1]])
    privileged_groups = [{"sex": 1.0}]
    unprivileged_groups = [{"sex": 0.0}]
    reweigher = Reweighing(unprivileged_groups=unprivileged_groups,
                           privileged_groups=privileged_groups)
    reweigher.fit(train_sds)
    test_sds_pred = test_sds.copy(deepcopy=True)
    test_sds_transf = reweigher.transform(test_sds)
    samples_fair = split_samples_fair(train_sds, test_sds, test_sds_pred)
    # NOTE(review): the "fair" model is fit on the reweighed *test* split
    # (test_sds_transf) rather than the training split — confirm intended.
    model_fair = logistic_regression(test_sds_transf)
    predictions_fair, test_pred = predict_fair(model_fair, samples_fair, True)
    test_pred = test_pred.astype(int)
    dpd = demographic_parity_difference(test_encoded.earnings,
                                        test_pred,
                                        sensitive_features=test_encoded.sex)
    print(f"Model demographic parity difference:", dpd)
def to_dataframe(y_true, y_pred, y_prot):
    """Wrap torch tensors in an AIF360 StandardDataset.

    Converts labels, predictions and protected-attribute tensors to host
    numpy arrays, builds a three-column dataframe, and returns a
    StandardDataset labelled by 'y_true' (favorable class 1.0, protected
    attribute 'y_prot', privileged class 1.0) whose ``scores`` are the
    predictions reshaped to a column vector.
    """
    def _to_numpy(t):
        # Cast to float, move to host memory, export as a numpy array.
        return t.float().cpu().numpy()

    true_np = _to_numpy(y_true)
    pred_np = _to_numpy(y_pred)
    prot_np = _to_numpy(y_prot)
    frame = pd.DataFrame({
        'y_true': true_np,
        'y_pred': pred_np,
        'y_prot': prot_np,
    })
    dataset = StandardDataset(frame, 'y_true', [1.], ['y_prot'], [[1.]])
    dataset.scores = pred_np.reshape(-1, 1)
    return dataset
def generate_formatted_dataframe(df, label_name, favorable_classes,
                                 protected_attribute_names, privileged_classes,
                                 categorical_features, features_to_keep,
                                 features_to_drop, na_values,
                                 custom_preprocessing, metadata):
    '''
    @usage: wrap the raw input dataframe in an AIF360 StandardDataset so it
            can be consumed by downstream fairness tooling (all arguments
            are forwarded to the StandardDataset constructor unchanged)
    return: the standardized dataset
    @param:
        - df: original input pandas dataframe
    '''
    return StandardDataset(
        df,
        label_name,
        favorable_classes,
        protected_attribute_names,
        privileged_classes,
        categorical_features=categorical_features,
        features_to_keep=features_to_keep,
        features_to_drop=features_to_drop,
        na_values=na_values,
        custom_preprocessing=custom_preprocessing,
        metadata=metadata)
def plot_using_aif(df_predict, df_true):
    """Print AIF360 mean-difference fairness metrics per rating label.

    NOTE(review): the parameters ``df_predict``/``df_true`` are not used in
    the visible body — the loop reads the module-level ``predict_df_list``
    and ``true_df_list`` instead; confirm this is intended.
    """
    # Accumulators (unused in the visible body).
    predict_list, true_list = [], []
    unpriv_label_list, priv_label_list = [], []
    # One pass per (unprivileged, privileged) group specification pair.
    for (u, p) in zip(unpriv_list, priv_list):
        cur_predict, cur_true = [], []
        # Human-readable names for the group combinations, built from the
        # protected-attribute value dictionaries.
        unpriv_label = '+'.join(['-'.join([prot_attr_dict[key][u_el[key]] for key in u_el]) for u_el in u])
        priv_label = '+'.join(['-'.join([prot_attr_dict[key][p_el[key]] for key in p_el]) for p_el in p])
        print('-------------------------------------------------------------------')
        print('unpriv_label:-->', unpriv_label)
        print('-------------------------------------------------------------------')
        print('priv_label :-->', priv_label)
        print('-------------------------------------------------------------------')
        print('\n\n')
        for i, label in enumerate(rating_names):
            #print('Fairness Metric for the label------>',label.upper())
            predict_dataset = StandardDataset(df=predict_df_list[i],
                                              label_name=label,
                                              favorable_classes=[1.0, 1.0],
                                              protected_attribute_names=protected_attribute_names,
                                              privileged_classes=privileged_classes)
            true_dataset = StandardDataset(df=true_df_list[i],
                                           label_name=label,
                                           favorable_classes=[1.0, 1.0],
                                           protected_attribute_names=protected_attribute_names,
                                           privileged_classes=privileged_classes)
            predict_dataset_metric = BinaryLabelDatasetMetric(predict_dataset, unprivileged_groups=u, privileged_groups=p)
            true_dataset_metric = BinaryLabelDatasetMetric(true_dataset, unprivileged_groups=u, privileged_groups=p)
            #classfication_metric = ClassificationMetric(true_dataset, predict_dataset, unprivileged_groups=u, privileged_groups=p)
            #x=classfication_metric.generalized_entropy_index()
            #print(label,': -->','predicted : -->',abs(predict_dataset_metric.disparate_impact()),'true : -->',abs(true_dataset_metric.disparate_impact()))
            # Report |mean difference| for predicted vs. true labels.
            print(label, ': -->', 'predicted : -->', abs(predict_dataset_metric.mean_difference()), 'true : -->', abs(true_dataset_metric.mean_difference()))
def preprocess_adultdataset(df):
    """Discretize and encode the UCI Adult dataframe into a StandardDataset.

    Binning: age into decades (70 and over collapsed to '>=70'), education
    years into '<6' / numeric / '>12'. Encoding: sex (Male=1.0),
    race (White=1.0). Income '>50K'/'>50K.' is the favorable label.

    Args:
        df: raw Adult dataframe with 'age', 'education.num', 'sex', 'race'
            and 'income' columns.

    Returns:
        StandardDataset with 'sex' and 'race' as protected attributes.
    """
    # Decade buckets for age, topping out at '>=70'.
    df['Age'] = df['age'].apply(lambda years: years // 10 * 10)
    # Education years: keep the middle of the range, clip both tails.
    df['Education_years'] = df['education.num'].apply(
        lambda n: '<6' if n <= 5 else ('>12' if n >= 13 else n))
    df['Education_years'] = df['Education_years'].astype('category')
    df['Age'] = df['Age'].apply(lambda decade: '>=70' if decade >= 70 else decade)
    df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
    df['race'] = df['race'].apply(lambda r: 1.0 if r == 'White' else 0.0)

    protected_attribute = ['sex', 'race']
    label_name = 'income'
    categorical_features = ['Age', 'Education_years']
    features = categorical_features + [label_name] + protected_attribute
    privileged_class = {'sex': [1.0], 'race': [1.0]}
    protected_attribute_map = {
        'sex': {1.0: 'Male', 0.0: 'Female'},
        'race': {1.0: 'White', 0.0: 'Non-white'},
    }
    return StandardDataset(
        df,
        label_name,
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=protected_attribute,
        privileged_classes=[privileged_class[a] for a in protected_attribute],
        categorical_features=categorical_features,
        features_to_keep=features,
        metadata={
            'label_maps': [{1.0: '>50K', 0.0: '<=50K'}],
            'protected_attribute_maps': [protected_attribute_map[a]
                                         for a in protected_attribute]
        })
def main() -> None:
    """Evaluate fairness of the German credit data with AIF360.

    Loads the dataset from the AIT inventory, builds a StandardDataset for
    the configured protected attribute, prints base rates and the mean
    outcome difference, saves a plot and the measured metric, and moves the
    log artifacts.
    """
    # Read the data path from the inventory.
    filepath = ait_input.get_inventory_path('Data')

    # Column names as documented in german.doc.
    column_names = [
        'status', 'month', 'credit_history', 'purpose', 'credit_amount',
        'savings', 'employment', 'investment_as_income_percentage',
        'personal_status', 'other_debtors', 'residence_since', 'property',
        'age', 'installment_plans', 'housing', 'number_of_credits',
        'skill_level', 'people_liable_for', 'telephone', 'foreign_worker',
        'credit'
    ]
    df = data_loading(filepath=filepath,
                      column_names=column_names,
                      na_values=None)

    # Metadata: label and protected-attribute display names.
    mappings = {
        'label_maps': [{1.0: 'Good Credit', 2.0: 'Bad Credit'}],
        'protected_attribute_maps': [
            {1.0: 'Male', 0.0: 'Female'},
            {1.0: 'Old', 0.0: 'Young'},
        ],
    }
    categorical_features = [
        'status', 'credit_history', 'purpose', 'savings', 'employment',
        'other_debtors', 'property', 'installment_plans', 'housing',
        'skill_level', 'telephone', 'foreign_worker'
    ]

    # Runtime parameters, validated before use.
    protected_attribute = ait_input.get_method_param_value(
        'protected_attribute')
    privileged_classes = ait_input.get_method_param_value('privileged_classes')
    ait_input_check(protected_attribute, privileged_classes)

    # Values at or above the configured threshold count as privileged.
    threshold = privileged_classes
    dataset = StandardDataset(
        df=df,
        label_name='credit',
        favorable_classes=[1],
        protected_attribute_names=[protected_attribute],
        privileged_classes=[lambda x: x >= threshold],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=None,
        features_to_drop=['personal_status', 'sex'],
        na_values=None,
        custom_preprocessing=preprocessing,
        metadata=mappings)

    # Privileged (1) and unprivileged (0) groups for the chosen attribute.
    privileged_groups = [{protected_attribute: 1}]
    unprivileged_groups = [{protected_attribute: 0}]

    # Fairness metrics on the original training dataset.
    metric_fairness = BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print("Original training dataset: German Credit Data")
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f"
        % metric_fairness.mean_difference())
    print("unprivileged groups = %f" %
          metric_fairness.base_rate(privileged=False))
    print("privileged groups = %f" %
          metric_fairness.base_rate(privileged=True))

    # Persist the plot, the measure, and the logs.
    save_metric_fairness_plot(metric_fairness, protected_attribute)
    measure_mean_difference(metric_fairness.mean_difference())
    move_log()
def apply_model(self, data, scalers, adjusted_annotated_train_data,
                pre_processor, learner, model):
    """Apply a trained model to held-out (validation/test) data.

    Pipeline: drop rows with missing values, scale numeric columns with the
    scalers fit on training data, annotate the frame as an AIF360
    StandardDataset, run the configured pre-processor, align the feature
    columns with the training features (injecting zero columns for one-hot
    categories absent from this split), and finally predict.

    Args:
        data: raw dataframe to score.
        scalers: mapping of numeric column name -> fitted scaler.
        adjusted_annotated_train_data: pre-processed training dataset whose
            feature_names define the expected feature layout.
        pre_processor: fairness pre-processor passed to preprocess_data.
        learner: learner wrapper; decides the prediction input format.
        model: trained model used for prediction.

    Returns:
        Tuple (adjusted_annotated_data, adjusted_annotated__data_with_predictions).
    """
    filtered_data = self.missing_value_handler.handle_missing(data)
    print(self.missing_value_handler.name(), 'removed',
          len(data) - len(filtered_data), 'instances from validation data')
    # Scale each numeric column with the scaler fit on the training split
    # (no re-fitting here — only transform).
    for numerical_attribute, scaler in scalers.items():
        numerical_attribute_data = np.array(
            filtered_data[numerical_attribute]).reshape(-1, 1)
        scaled_numerical_attribute_data = scaler.transform(
            numerical_attribute_data)
        filtered_data.loc[:, numerical_attribute] = scaled_numerical_attribute_data
    annotated_data = StandardDataset(
        df=filtered_data,
        label_name=self.label_name,
        favorable_classes=[self.positive_label],
        protected_attribute_names=self.protected_attribute_names,
        privileged_classes=self.privileged_classes,
        categorical_features=self.categorical_attribute_names,
        features_to_drop=self.attributes_to_drop_names,
        metadata=self.dataset_metadata)
    adjusted_annotated_data = self.preprocess_data(pre_processor,
                                                   annotated_data)
    # One-hot encoding of a smaller split can miss categories seen in
    # training; add those columns as all-zero so shapes line up.
    train_feature_names = adjusted_annotated_train_data.feature_names
    current_feature_names = adjusted_annotated_data.feature_names
    feature_names_in_train_but_not_in_current = set(
        train_feature_names).difference(set(current_feature_names))
    print("Injecting zero columns for features not present",
          feature_names_in_train_but_not_in_current)
    validation_data_df, _ = adjusted_annotated_data.convert_to_dataframe()
    for feature_name in feature_names_in_train_but_not_in_current:
        validation_data_df.loc[:, feature_name] = 0.0
    # Re-order features into the exact training layout.
    adjusted_annotated_data.feature_names = train_feature_names
    adjusted_annotated_data.features = validation_data_df[
        train_feature_names].values.copy()
    adjusted_annotated__data_with_predictions = adjusted_annotated_data.copy()
    if learner.needs_annotated_data_for_prediction():
        # Model consumes the annotated dataset directly.
        adjusted_annotated__data_with_predictions = model.predict(
            adjusted_annotated_data)
    else:
        # sklearn-style model: predict from the raw feature matrix.
        adjusted_annotated__data_with_predictions.labels = model.predict(
            adjusted_annotated_data.features)
        try:
            # NOTE(review): collapsed source makes the try-block placement
            # ambiguous; it is assumed to belong to the raw-features branch,
            # where predict_proba may not exist — confirm against upstream.
            class_probs = model.predict_proba(
                adjusted_annotated_data.features)
            adjusted_annotated__data_with_predictions.scores = class_probs[:, 0]
        except AttributeError:
            print("WARNING: MODEL CANNOT ASSIGN CLASS PROBABILITIES")
    return adjusted_annotated_data, adjusted_annotated__data_with_predictions
def run(self):
    """Executes all the possible experiments from the combination of given
    learners, pre-processors and post-processors.

    No. of experiments = (#learners * #preprocessors * #postprocessors)

    Pipeline: split raw data into train / validation / test, sample the
    training split, drop rows with missing values, fit one scaler per
    numeric attribute on the training data, annotate the training data as
    an AIF360 StandardDataset, then run every (pre-processor, learner,
    post-processor) combination and keep the optimal results.
    """
    np.random.seed(self.fixed_random_seed)
    data = self.load_raw_data()
    # First split: train vs. (test + validation) combined.
    all_train_data, test_and_validation_data = train_test_split(
        data,
        test_size=self.test_set_ratio + self.validation_set_ratio,
        random_state=self.fixed_random_seed)
    train_data = self.train_data_sampler.sample(all_train_data)
    # Second split divides the held-out pool into validation and test in
    # proportion to the two configured ratios.
    second_split_ratio = self.test_set_ratio / (self.test_set_ratio +
                                                self.validation_set_ratio)
    validation_data, test_data = train_test_split(
        test_and_validation_data,
        test_size=second_split_ratio,
        random_state=self.fixed_random_seed)
    # Missing-value handling is fit on training data only.
    self.missing_value_handler.fit(train_data)
    filtered_train_data = self.missing_value_handler.handle_missing(
        train_data)
    print(self.missing_value_handler.name(), 'removed',
          len(train_data) - len(filtered_train_data),
          'instances from training data')
    # Fit one scaler per numeric attribute on the (filtered) training data;
    # the scalers are reused later for validation/test scaling.
    scalers = {}
    for numerical_attribute in self.numeric_attribute_names:
        numerical_attribute_data = np.array(
            filtered_train_data[numerical_attribute]).reshape(-1, 1)
        # clone() gives each attribute its own unfitted scaler instance.
        scaler = clone(
            self.numeric_attribute_scaler).fit(numerical_attribute_data)
        scaled_numerical_attribute_data = scaler.transform(
            numerical_attribute_data)
        filtered_train_data.loc[:, numerical_attribute] = scaled_numerical_attribute_data
        scalers[numerical_attribute] = scaler
    annotated_train_data = StandardDataset(
        df=filtered_train_data,
        label_name=self.label_name,
        favorable_classes=[self.positive_label],
        protected_attribute_names=self.protected_attribute_names,
        privileged_classes=self.privileged_classes,
        categorical_features=self.categorical_attribute_names,
        features_to_drop=self.attributes_to_drop_names,
        metadata=self.dataset_metadata)
    # Cartesian product of all configured components.
    for pre_processor in self.pre_processors:
        for learner in self.learners:
            for post_processor in self.post_processors:
                self.run_single_exp(annotated_train_data, validation_data,
                                    test_data, scalers, pre_processor,
                                    learner, post_processor)
    self.filter_optimal_results()
def load_TaiwanDataset(filepath="C:\\Users\\Johannes\\Desktop\\Code - Copy\\data\\UCI_Credit_Card.csv"):
    """Load the UCI Taiwan credit-card default data as an AIF360 StandardDataset.

    Args:
        filepath: path to ``UCI_Credit_Card.csv``. Generalized from the
            previously hard-coded location; the default preserves the old
            behavior for existing callers.

    Returns:
        StandardDataset with 'AGE' binarized at 25 years (Old=1.0 is the
        privileged class) as the protected attribute and 'TARGET'
        (1.0 good / 2.0 bad) as the label.
    """
    df = pd.read_csv(filepath, sep=',', na_values=[])
    df = df.rename(columns={'default.payment.next.month': 'TARGET'})
    del df['ID']
    # Binarize age: strictly older than 25 is the privileged 'Old' group.
    # (Plain float comparison replaces the old per-element np.where call —
    # same values, no 0-d numpy arrays.)
    df['AGE'] = df['AGE'].apply(lambda x: 1.0 if x > 25 else 0.0)
    df['CREDIT_AMNT'] = df['BILL_AMT1'] - df['PAY_AMT1']

    XD_features = [
        "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE", "PAY_0", "PAY_2",
        "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2",
        "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1",
        "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
        "CREDIT_AMNT"
    ]
    D_features = ['AGE']
    Y_features = ['TARGET']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = [
        'SEX', 'EDUCATION', 'MARRIAGE', "PAY_0", "PAY_2", "PAY_3", "PAY_4",
        "PAY_5", "PAY_6"
    ]
    privileged_class = {"AGE": [1.0]}
    protected_attribute_map = {"AGE": {1.0: 'Old', 0.0: 'Young'}}

    def default_preprocessing(df):
        """Map integer codes to readable categories and relabel the target."""
        # Lookup tables replace the old if/elif chains; unknown codes fall
        # back to the same defaults ('NA' / 'others') as before.
        sex_map = {1: 'Male', 2: 'Female'}
        education_map = {
            1: 'graduate_school',
            2: 'university',
            3: 'high_school'
        }
        marriage_map = {1: 'married', 2: 'single'}
        df['SEX'] = df['SEX'].apply(lambda x: sex_map.get(x, 'NA'))
        df['EDUCATION'] = df['EDUCATION'].apply(
            lambda x: education_map.get(x, 'others'))
        df['MARRIAGE'] = df['MARRIAGE'].apply(
            lambda x: marriage_map.get(x, 'others'))
        # Repayment codes -2 and -1 both mean "no delay" -> 0.
        pay_cols = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
        for p in pay_cols:
            df[p] = df[p].apply(lambda x: 0 if x in (-2, -1) else x)
        # Good credit == 1.0, default == 2.0 (matches label_maps metadata).
        df['TARGET'] = df['TARGET'].replace({0: 1.0, 1: 2.0})
        return df

    df_standard = StandardDataset(
        df=df,
        label_name=Y_features[0],
        favorable_classes=[1],
        protected_attribute_names=D_features,
        privileged_classes=[privileged_class["AGE"]],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        metadata={
            'label_maps': [{1.0: 'Good', 2.0: 'Bad'}],
            'protected_attribute_maps': [protected_attribute_map]
        },
        custom_preprocessing=default_preprocessing)
    return df_standard
# Build AIF360 datasets from the validation split and compute group-fairness
# metrics. NOTE(review): this fragment relies on names defined earlier in the
# script (validation_comp, validation_pred, prot, priv, unprivileged_group,
# privileged_group) — presumably one-hot encoded recidivism data whose column
# names embed the group value; confirm upstream.
unpriv_dict = {}
priv_dict = {}
f_c = []
for i in range(validation_comp.shape[1]):
    f_c.append([])
    # Columns whose name contains the unprivileged group value become
    # protected attributes; the last match wins for the group dict.
    if (validation_comp.columns[i].find(unprivileged_group) != -1):
        prot.append(validation_comp.columns[i])
        unpriv_dict = {validation_comp.columns[i]: 1}
    if (validation_comp.columns[i].find(privileged_group) != -1):
        priv.append([1])
        prot.append(validation_comp.columns[i])
        priv_dict = {validation_comp.columns[i]: 1}
    else:
        priv.append([])
# Ground truth vs. predictions; favorable class is 0 (no violent recidivism).
stdDs = StandardDataset(validation_comp, 'is_violent_recid', [0], prot, priv)
stdPred = StandardDataset(validation_pred, 'is_violent_recid', [0], prot, priv)
bi_met = BinaryLabelDatasetMetric(stdDs,
                                  privileged_groups=[priv_dict],
                                  unprivileged_groups=[unpriv_dict])
class_met = ClassificationMetric(stdDs,
                                 stdPred,
                                 unprivileged_groups=[unpriv_dict],
                                 privileged_groups=[priv_dict])
disparate_impact = bi_met.disparate_impact()
#error_rate_ratio = class_met.error_rate_ratio()
eq_diff = class_met.equal_opportunity_difference()
#Create 2 Bar Graphs
x = [1]
# NOTE(review): fragment — the opening print( of this banner lies outside the
# visible chunk; unpriv_label, priv_label, rating_names, train_df_list,
# test_df_list, protected_attribute_names and privileged_classes are defined
# upstream.
    '-------------------------------------------------------------------')
print('unpriv_label:-->', unpriv_label)
print(
    '-------------------------------------------------------------------')
print('priv_label :-->', priv_label)
print(
    '-------------------------------------------------------------------')
# Build a train/test StandardDataset per rating label and rescale features.
for i, label in enumerate(rating_names):
    print(label)
    # copy=False scales the underlying arrays in place.
    scaler = MinMaxScaler(copy=False)
    train_dataset = StandardDataset(
        df=train_df_list[i],
        label_name=label,
        favorable_classes=[1.0, 1.0],
        protected_attribute_names=protected_attribute_names,
        privileged_classes=privileged_classes)
    test_dataset = StandardDataset(
        df=test_df_list[i],
        label_name=label,
        favorable_classes=[1.0, 1.0],
        protected_attribute_names=protected_attribute_names,
        privileged_classes=privileged_classes)
    train_dataset.features = scaler.fit_transform(train_dataset.features)
    # NOTE(review): fit_transform here re-fits the scaler on the *test*
    # features — presumably transform() was intended; confirm upstream.
    test_dataset.features = scaler.fit_transform(test_dataset.features)
    # Positions of the protected attributes within the feature columns.
    index = [
        test_dataset.feature_names.index(x)
        for x in protected_attribute_names
    ]
def prepare_data(data, priv_category, priv_value, target_label,
                 priv_target_value, ignore_cols=None):
    """Prepare a dataframe for AIF360 bias mitigation.

    Label-encodes every object-dtype column (the target included), min-max
    scales the numeric columns, then wraps the all-float frame in a
    StandardDataset keyed on the given protected attribute and target.

    Parameters:
        data (pandas dataframe): data to prepare.
        priv_category (string): column holding the privileged value
            (e.g. Race, Gender).
        priv_value (string): value in that column denoting the privileged
            group (e.g. White, Male).
        target_label (string): name of the target column.
        priv_target_value (string): target value that favors the privileged
            group; the target must be categorical/boolean.
        ignore_cols (list of string, optional): columns to drop before
            assessment and modeling.

    Returns:
        data_priv (StandardDataset): aif360-ready dataset.
        encoders (dict): fitted LabelEncoder/MinMaxScaler per column.
        numerical_features (list): numeric column names.
        categorical_features (Index): categorical column names.
    """
    if ignore_cols:
        data = data.drop(ignore_cols, axis=1)

    categorical_features = data.columns[data.dtypes == 'object']
    data_encoded = data.copy()

    categorical_names = {}
    encoders = {}

    # Label-encode every categorical column, remembering classes per column.
    for column in categorical_features:
        encoder = LabelEncoder().fit(data_encoded[column])
        data_encoded[column] = encoder.transform(data_encoded[column])
        categorical_names[column] = encoder.classes_
        encoders[column] = encoder

    # Min-max scale whatever is left (the numeric columns).
    numerical_features = [
        c for c in data.columns.values if c not in categorical_features
    ]
    for column in numerical_features:
        values = data_encoded[column].values[:, np.newaxis]
        scaler = MinMaxScaler().fit(values)
        data_encoded[column] = scaler.transform(values)
        encoders[column] = scaler

    data_encoded = data_encoded.astype(float)

    # Recover the encoded codes for the privileged class and the favorable
    # target value from the stored LabelEncoder classes.
    privileged_class = np.where(
        categorical_names[priv_category] == priv_value)[0]
    encoded_target_label = np.where(
        categorical_names[target_label] == priv_target_value)[0]

    data_priv = StandardDataset(data_encoded,
                                label_name=target_label,
                                favorable_classes=encoded_target_label,
                                protected_attribute_names=[priv_category],
                                privileged_classes=[privileged_class])
    return data_priv, encoders, numerical_features, categorical_features
def train(request):
    """Train baseline and reweighed logistic-regression models on resume data.

    Builds an AIF360 StandardDataset from the resume CSV, records the mean
    outcome difference before and after Kamiran-Calders reweighing, trains a
    logistic regression on each version, sweeps classification thresholds
    (choosing the best by balanced accuracy on the validation split), and
    pickles scalers, metrics and models under ./training/.

    Args:
        request: Django request object (unused in the body).

    Returns:
        HttpResponse confirming training finished.
    """
    df = pd.read_csv('./training/resume_data_5000.csv')
    df = df.drop(df.columns[0], axis=1)
    dataset_orig = StandardDataset(df,
                                   label_name='Accepted',
                                   favorable_classes=[1],
                                   protected_attribute_names=['Gender'],
                                   privileged_classes=[[1]],
                                   categorical_features=['School'],
                                   features_to_drop=['Name'])
    # 70/15/15 train / validation / test split.
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.7],
                                                             shuffle=True)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True)
    privileged_groups = [{'Gender': 1}]
    unprivileged_groups = [{'Gender': 0}]

    # Fairness of the raw training data.
    metric_orig_train = BinaryLabelDatasetMetric(
        dataset_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    orig_mean_difference = metric_orig_train.mean_difference()
    with open('./training/orig_mean_difference.pkl', 'wb') as f:
        pickle.dump(orig_mean_difference, f)

    # Reweighing pre-processing to mitigate bias in the training data.
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    dataset_transf_train = RW.fit_transform(dataset_orig_train)
    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    transf_mean_difference = metric_transf_train.mean_difference()
    with open('./training/transf_mean_difference.pkl', 'wb') as f:
        pickle.dump(transf_mean_difference, f)

    # --- Baseline logistic regression on the original training data ------
    scale_orig = StandardScaler()
    X_train = scale_orig.fit_transform(dataset_orig_train.features)
    y_train = dataset_orig_train.labels.ravel()
    with open('./training/scaler.pkl', 'wb') as f:
        pickle.dump(scale_orig, f)
    lmod_orig = LogisticRegression(solver='lbfgs')
    lmod_orig.fit(X_train,
                  y_train,
                  sample_weight=dataset_orig_train.instance_weights)
    y_train_pred = lmod_orig.predict(X_train)
    # predict_proba column corresponding to the favorable label.
    pos_ind = np.where(
        lmod_orig.classes_ == dataset_orig_train.favorable_label)[0][0]
    dataset_orig_train_pred = dataset_orig_train.copy()
    dataset_orig_train_pred.labels = y_train_pred

    # Score validation and test splits (features scaled with the scaler fit
    # on the training data).
    dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    X_valid = scale_orig.transform(dataset_orig_valid_pred.features)
    dataset_orig_valid_pred.scores = lmod_orig.predict_proba(
        X_valid)[:, pos_ind].reshape(-1, 1)
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = scale_orig.transform(dataset_orig_test_pred.features)
    dataset_orig_test_pred.scores = lmod_orig.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)

    # Pick the threshold maximizing balanced accuracy on validation.
    num_thresh = 100
    ba_arr = np.zeros(num_thresh)
    class_thresh_arr = np.linspace(0.01, 0.99, num_thresh)
    for idx, class_thresh in enumerate(class_thresh_arr):
        fav_inds = dataset_orig_valid_pred.scores > class_thresh
        dataset_orig_valid_pred.labels[
            fav_inds] = dataset_orig_valid_pred.favorable_label
        dataset_orig_valid_pred.labels[
            ~fav_inds] = dataset_orig_valid_pred.unfavorable_label
        classified_metric_orig_valid = ClassificationMetric(
            dataset_orig_valid,
            dataset_orig_valid_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)
        ba_arr[idx] = 0.5*(classified_metric_orig_valid.true_positive_rate()\
            +classified_metric_orig_valid.true_negative_rate())
    best_ind = np.where(ba_arr == np.max(ba_arr))[0][0]
    best_class_thresh = class_thresh_arr[best_ind]

    # Evaluate the baseline model on the test split over all thresholds,
    # persisting the metrics at the chosen threshold.
    bal_acc_arr_orig = []
    disp_imp_arr_orig = []
    avg_odds_diff_arr_orig = []
    for thresh in tqdm(class_thresh_arr):
        fav_inds = dataset_orig_test_pred.scores > thresh
        dataset_orig_test_pred.labels[
            fav_inds] = dataset_orig_test_pred.favorable_label
        dataset_orig_test_pred.labels[
            ~fav_inds] = dataset_orig_test_pred.unfavorable_label
        metric_test_bef = compute_metrics(dataset_orig_test,
                                          dataset_orig_test_pred,
                                          unprivileged_groups,
                                          privileged_groups,
                                          disp=False)
        if thresh == best_class_thresh:
            with open('./training/metrics_orig.pkl', 'wb') as f:
                pickle.dump(metric_test_bef, f,
                            protocol=pickle.HIGHEST_PROTOCOL)
        bal_acc_arr_orig.append(metric_test_bef["Balanced accuracy"])
        avg_odds_diff_arr_orig.append(
            metric_test_bef["Average odds difference"])
        disp_imp_arr_orig.append(metric_test_bef["Disparate impact"])

    # --- Logistic regression on the reweighed training data --------------
    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()
    lmod_transf = LogisticRegression(solver='lbfgs')
    lmod_transf.fit(X_train,
                    y_train,
                    sample_weight=dataset_transf_train.instance_weights)
    dataset_transf_test_pred = dataset_orig_test.copy(deepcopy=True)
    # BUGFIX: was scale_transf.fit_transform(...) — that re-fit the scaler
    # on the test features instead of reusing the statistics learned on the
    # training data; transform() keeps train and test on the same scale.
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    dataset_transf_test_pred.scores = lmod_transf.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)

    bal_acc_arr_transf = []
    disp_imp_arr_transf = []
    avg_odds_diff_arr_transf = []
    for thresh in tqdm(class_thresh_arr):
        fav_inds = dataset_transf_test_pred.scores > thresh
        dataset_transf_test_pred.labels[
            fav_inds] = dataset_transf_test_pred.favorable_label
        dataset_transf_test_pred.labels[
            ~fav_inds] = dataset_transf_test_pred.unfavorable_label
        metric_test_aft = compute_metrics(dataset_orig_test,
                                          dataset_transf_test_pred,
                                          unprivileged_groups,
                                          privileged_groups,
                                          disp=False)
        if thresh == best_class_thresh:
            with open('./training/metrics_transf.pkl', 'wb') as f:
                pickle.dump(metric_test_aft, f,
                            protocol=pickle.HIGHEST_PROTOCOL)
        bal_acc_arr_transf.append(metric_test_aft["Balanced accuracy"])
        avg_odds_diff_arr_transf.append(
            metric_test_aft["Average odds difference"])
        disp_imp_arr_transf.append(metric_test_aft["Disparate impact"])

    with open('./training/model_orig.pkl', 'wb') as f:
        pickle.dump(lmod_orig, f)
    with open('./training/model_transf.pkl', 'wb') as f:
        pickle.dump(lmod_transf, f)
    return HttpResponse('Model trained')
def preprocess_germandataset(df):
    """Group, encode and rename German-credit columns into a StandardDataset.

    Protected attributes: 'sex' (male=1.0, derived from personal_status_sex)
    and 'age' (>=25 -> 1.0). Label: 'credit' (1.0 good / 2.0 bad per the
    label_maps metadata).

    Args:
        df: raw German credit dataframe.

    Returns:
        StandardDataset ready for fairness metrics.
    """

    def group_credit_hist(x):
        # Collapse detailed credit-history values into None/Paid, Delay, Other.
        if x in [
                'no credits taken/ all credits paid back duly',
                'all credits at this bank paid back duly',
                'existing credits paid back duly till now'
        ]:
            return 'None/Paid'
        elif x == 'delay in paying off in the past':
            return 'Delay'
        elif x == 'critical account/ other credits existing (not at this bank)':
            return 'Other'
        else:
            return 'NA'

    def group_employ(x):
        # Bucket employment duration.
        if x == 'unemployed':
            return 'Unemployed'
        elif x in ['... < 1 year ', '1 <= ... < 4 years']:
            return '1-4 years'
        elif x in ['4 <= ... < 7 years', '.. >= 7 years']:
            return '4+ years'
        else:
            return 'NA'

    def group_savings(x):
        # Bucket savings-account balances.
        if x in ['... < 100 DM', '100 <= ... < 500 DM']:
            return '<500'
        elif x in ['500 <= ... < 1000 DM ', '.. >= 1000 DM ']:
            return '500+'
        elif x == 'unknown/ no savings account':
            return 'Unknown/None'
        else:
            return 'NA'

    def group_status(x):
        # Bucket checking-account status.
        if x in ['< 0 DM', '0 <= ... < 200 DM']:
            return '<200'
        elif x in ['>= 200 DM / salary assignments for at least 1 year']:
            return '200+'
        elif x == 'no checking account':
            return 'None'
        else:
            return 'NA'

    # Male -> 1.0, female -> 0.0 regardless of marital status.
    status_map = {
        'male : divorced/separated': 1.0,
        'male : single': 1.0,
        'male : married/widowed': 1.0,
        'female : divorced/separated/married': 0.0,
        'female : single': 0.0
    }
    df['personal_status_sex'] = df['personal_status_sex'].replace(status_map)
    df['credit_history'] = df['credit_history'].apply(
        lambda x: group_credit_hist(x))
    df['savings'] = df['savings'].apply(lambda x: group_savings(x))
    df['present_emp_since'] = df['present_emp_since'].apply(
        lambda x: group_employ(x))
    # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float produces identical 1.0/0.0 values.
    df['age'] = df['age'].apply(lambda x: float(x >= 25))
    df['account_check_status'] = df['account_check_status'].apply(
        lambda x: group_status(x))
    df = df.rename(
        columns={
            'default': 'credit',
            'present_emp_since': 'employment',
            'account_check_status': 'status',
            'personal_status_sex': 'sex'
        })
    protected_attribute = ['sex', 'age']
    label_name = 'credit'
    categorical_features = ['credit_history', 'savings', 'employment']
    features = categorical_features + [label_name] + protected_attribute
    privileged_class = {'sex': [1.0], 'age': [1.0]}
    protected_attribute_map = {
        "sex": {
            1.0: 'male',
            0.0: 'female'
        },
        "age": {
            1.0: 'old',
            0.0: 'young'
        }
    }
    data = StandardDataset(
        df,
        label_name,
        favorable_classes=[1],
        protected_attribute_names=protected_attribute,
        privileged_classes=[privileged_class[x] for x in protected_attribute],
        categorical_features=categorical_features,
        features_to_keep=features,
        metadata={
            'label_maps': [{
                1.0: 'Good Credit',
                2.0: 'Bad Credit'
            }],
            'protected_attribute_maps':
            [protected_attribute_map[x] for x in protected_attribute]
        })
    return data
def preprocess_compasdataset(df):
    """Filter and encode the ProPublica COMPAS dataframe into a StandardDataset.

    Applies the standard COMPAS row filters, bins priors and jail stay
    length, binarizes sex (Female=1.0) and race (Caucasian=1.0), and wraps
    the result with 'two_year_recid' as the label (favorable class 0 —
    no recidivism).

    Args:
        df: raw COMPAS dataframe (compas-scores-two-years schema).

    Returns:
        StandardDataset with 'sex' and 'race' as protected attributes.
    """
    df = df[[
        'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
        'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
        'two_year_recid', 'c_jail_in', 'c_jail_out'
    ]]
    # Standard COMPAS filtering: screening within +/-30 days of arrest,
    # known recidivism flag, no ordinary traffic offenses, valid score.
    ix = df['days_b_screening_arrest'] <= 30
    ix = (df['days_b_screening_arrest'] >= -30) & ix
    ix = (df['is_recid'] != -1) & ix
    ix = (df['c_charge_degree'] != "O") & ix
    ix = (df['score_text'] != 'N/A') & ix
    df = df.loc[ix, :]
    df['length_of_stay'] = (
        pd.to_datetime(df['c_jail_out']) -
        pd.to_datetime(df['c_jail_in'])).apply(lambda x: x.days)
    # Restrict races to African-American and Caucasian.
    df = df.loc[
        ~df['race'].isin(['Native American', 'Hispanic', 'Asian', 'Other']), :]
    df = df[[
        'sex', 'race', 'age_cat', 'c_charge_degree', 'score_text',
        'priors_count', 'is_recid', 'two_year_recid', 'length_of_stay'
    ]]
    df['priors_count'] = df['priors_count'].apply(lambda x: 0 if x <= 0 else (
        '1 to 3' if 1 <= x <= 3 else 'More than 3'))
    # BUGFIX: the middle bucket previously required 8 < x <= 93, which sent a
    # stay of exactly 8 days to '>3months'; x <= 93 makes the buckets
    # contiguous (<=7, 8..93, >93).
    df['length_of_stay'] = df['length_of_stay'].apply(
        lambda x: '<week' if x <= 7 else ('<3months' if x <= 93 else '>3months'))
    df['score_text'] = df['score_text'].apply(
        lambda x: 'MediumHigh' if (x == 'High') | (x == 'Medium') else x)
    df['age_cat'] = df['age_cat'].apply(lambda x: '25 to 45'
                                        if x == '25 - 45' else x)
    df['sex'] = df['sex'].replace({'Female': 1.0, 'Male': 0.0})
    df['race'] = df['race'].apply(lambda x: 1.0 if x == 'Caucasian' else 0.0)
    df = df[[
        'two_year_recid', 'sex', 'race', 'age_cat', 'priors_count',
        'c_charge_degree'
    ]]
    protected_attributes = ['sex', 'race']
    label_name = 'two_year_recid'
    categorical_features = ['age_cat', 'priors_count', 'c_charge_degree']
    features = categorical_features + [label_name] + protected_attributes
    # privileged classes
    privileged_classes = {"sex": [1.0], "race": [1.0]}
    # protected attribute maps
    protected_attribute_map = {
        "sex": {
            0.0: 'Male',
            1.0: 'Female'
        },
        "race": {
            1.0: 'Caucasian',
            0.0: 'Not Caucasian'
        }
    }
    data = StandardDataset(
        df,
        label_name,
        favorable_classes=[0],  # not recidivating is the favorable outcome
        protected_attribute_names=protected_attributes,
        privileged_classes=[
            privileged_classes[x] for x in protected_attributes
        ],
        categorical_features=categorical_features,
        features_to_keep=features,
        metadata={
            'label_maps': [{
                1.0: 'Did recid.',
                0.0: 'No recid.'
            }],
            'protected_attribute_maps':
            [protected_attribute_map[x] for x in protected_attributes]
        })
    return data