def handle(self, *args, **kwargs):
    plt.style.use('ggplot')
    plt.figure(figsize=(6, 6))

    TARGET_MODEL = 59
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    training_df, test_df = get_encoded_logs(job)

    X_train = training_df.drop(['trace_id', 'label'], 1)
    y_train = training_df['label'].values

    # fit a decision tree on the encoded training log
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # audit the fitted model with FairML
    importances, _ = audit_model(clf.predict, X_train)
    print(importances)

    # generate feature dependence plot
    fig = plot_dependencies(
        importances.median(),
        reverse_values=False,
        title="FairML feature dependence plot"
    )
    file_name = "fairml_plot_train_1_3_decision_tree.png"
    plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=550)

def index():
    if request.method == 'GET':
        # User is requesting the form
        return render_template('form.html')
    elif request.method == 'POST':
        data = pd.read_csv(request.files['datafile'])
        model_pickle = request.files['modelfile']
        model = pickle.load(model_pickle)

        # audit the uploaded model against the uploaded dataset
        importances, _ = audit_model(model.predict, data)

        # reduce each feature's list of dependence values to its median
        importances_dict = {}
        for key, value in dict(importances).items():
            importances_dict[key] = np.median(np.array(value))

        data_dict = {'importances': importances_dict}
        data_dict['dataset'] = request.files['datafile'].filename
        data_dict['model'] = request.files['modelfile'].filename
        data_dict['public_sharing'] = False
        document_id = collection.insert(data_dict)

        features_array = list(importances_dict.keys())
        importances_array = list(importances_dict.values())
        return render_template('result.html',
                               document_id=document_id,
                               features_array=json.dumps(features_array),
                               importances_array=json.dumps(importances_array))

def get_audit_local(data_prefix, exp_prefix):
    data_file = './algs/corels/data/%s/local/_auditing_train.csv' % (data_prefix)
    scores_file = './algs/corels/data/%s/local/_scores_train.csv' % (data_prefix)
    dataset = pd.read_csv(data_file)
    scores = pd.read_csv(scores_file)

    models_files = './res_local/%s/%s/%s_corels.mdl' % (data_prefix, exp_prefix, data_prefix)
    train_file = './res_local/%s/%s/%s_train.txt' % (data_prefix, exp_prefix, data_prefix)
    unfairness_results = './res_local/%s/%s/unfairness_train.dump' % (data_prefix, exp_prefix)
    accuracy_results = './res_local/%s/%s/accuracy_train.dump' % (data_prefix, exp_prefix)
    audit_file = './res_local/%s/%s/%s_audit.png' % (data_prefix, exp_prefix, data_prefix)

    # Load the saved models and metrics
    adult_mdl = joblib.load(models_files)
    acc = joblib.load(accuracy_results)
    diff_metric = joblib.load(unfairness_results)

    # pick the model closest to the (unfairness = 0, accuracy = 1) ideal point
    distance = []
    for i in range(len(diff_metric)):
        d = (diff_metric[i]) ** 2 + (1 - acc[i]) ** 2
        distance.append(d)
    idx = np.argmin(distance)

    y_pred, acc = adult_mdl.predict(train_file)

    # fit a surrogate logistic regression on the selected model's predictions
    clf = LogisticRegression(penalty='l2', C=0.01)
    clf.fit(dataset.values, y_pred[:, idx].astype(int))

    # call audit model with model
    total, _ = audit_model(clf.predict, dataset)

    # generate feature dependence plot
    fig = plot_dependencies(total.median(),
                            reverse_values=False,
                            title="FairML feature dependence",
                            fig_size=(6, 9))
    plt.savefig(audit_file, transparent=False, bbox_inches='tight')

def index():
    if request.method == 'GET':
        # User is requesting the form
        return render_template('form.html')
    elif request.method == 'POST':
        # User has sent us data: unpickle the model and parse the CSV
        model = pickle.load(request.files['model'])
        data = pd.read_csv(request.files['data'])

        # call audit model with model
        importances, _ = audit_model(model.predict, data)

        # print feature importance
        print(importances)

        fig = plot_dependencies(
            importances.get_compress_dictionary_into_key_median(),
            reverse_values=False,
            title="FairML feature dependence")
        plt.show()

        message = "FairML audit complete"
        return render_template('result.html', message=message)

propublica_data = pd.read_csv(
    filepath_or_buffer="./doc/example_notebooks/"
                       "propublica_data_for_fairml.csv",
    sep=",",
    header=0)

# quick processing
compas_rating = propublica_data.score_factor.values
propublica_data = propublica_data.drop("score_factor", 1)

# quick setup of logistic regression
# perhaps use a more complex classifier
clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(propublica_data.values, compas_rating)

# double-check the list of input data columns
print(list(propublica_data.columns))

# call audit_model with the full set of options spelled out
total, _ = audit_model(
    clf.predict,
    propublica_data,
    distance_metric="regression",
    direct_input_pertubation="constant median",
    number_of_runs=10,
    include_interactions=False,
    external_data_set=None
)
print(total)

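# The snippet above stops at printing the audit result. A minimal sketch of
# visualizing it, in the same style the other snippets in this file use;
# it assumes `total` from the audit above, matplotlib imported as `plt`,
# and the output filename is arbitrary:
fig = plot_dependencies(
    total.median(),
    reverse_values=False,
    title="FairML feature dependence"
)
plt.savefig("fairml_propublica_dependence.png",
            transparent=False, bbox_inches='tight')
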
else:
    adult_rating = adult_data.credit_rating.values
    adult_data = adult_data.drop("credit_rating", 1)
    gen_both = True

    # we fit a quick and dirty logistic regression sklearn model here,
    # trained to mimic the black-box model's predictions
    if gen_both:
        clf = LogisticRegression(penalty='l2', C=0.01)
        clf.fit(adult_data.values, adult_mdl.predict(adult_data.values))

    # audit both the surrogate and the original black-box model
    total, _ = audit_model(clf.predict, adult_data)
    totalbis, _ = audit_model(adult_mdl.predict, adult_data)

    # get the corresponding dictionaries of median importances
    featuresDict = total.median()
    featuresDictBis = totalbis.median()

    featuresDictBis = correctSign(featuresDict, featuresDictBis)

    # should be normalized
    diff = computeDiff(featuresDict, featuresDictBis)
    featuresDict = cleanFeaturesDict(featuresDict, nbFeat)
    featuresDictBis = cleanFeaturesDict(featuresDictBis, nbFeat)

plt.style.use('ggplot')
plt.figure(figsize=(6, 6))

# read in propublica data
propublica_data = pd.read_csv("./doc/example_notebooks/"
                              "propublica_data_for_fairml.csv")

# quick data processing
compas_rating = propublica_data.score_factor.values
propublica_data = propublica_data.drop("score_factor", 1)

# quick setup of logistic regression
# perhaps use a more complex classifier
clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(propublica_data.values, compas_rating)

# call audit model
importances, _ = audit_model(clf.predict, propublica_data)

# print feature importance
print(importances)

# generate feature dependence plot
fig = plot_dependencies(
    importances.median(),
    reverse_values=False,
    title="FairML feature dependence logistic regression model")

file_name = "fairml_propublica_linear_direct.png"
plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)

# separate the target column from the features
bankruptcy_rating = bankruptcy["Bankrupt"].values
bankruptcy_data = bankruptcy.drop("Bankrupt", 1)

# this is just for demonstration; any classifier or regressor
# can be used here. fairml only requires a predict function
# to diagnose a black-box model.
# we fit a quick and dirty logistic regression sklearn
# model here.
clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(bankruptcy_data.values, bankruptcy_rating)

# call audit model with model
total, _ = audit_model(clf.predict, bankruptcy_data)

# print feature importance
print(total)

plt.figure(figsize=(20, 10))

# generate feature dependence plot
fig = plot_dependencies(
    total.median(),
    reverse_values=False,
    title="FairML feature dependence"
)
# plt.savefig("fairml_ldp.eps", transparent=False, bbox_inches='tight')
plt.show()

accuracy = biasedReg.score(x_test, y_test)
print('Sex biased - Cross-predicted Accuracy:', accuracy)

# confusion matrix assessment of the biased model
cm_biased = metrics.confusion_matrix(y_test, y_pred_biased)
plt.figure(figsize=(9, 9))
sns.heatmap(cm_biased, annot=True, fmt=".3f", linewidths=.5,
            square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Scores: {0}'.format(accuracy)
plt.title(all_sample_title, size=15)

# XAI: feature importance
importances, _ = audit_model(biasedReg.predict, X)
print(importances)
plot_dependencies(
    importances.median(),
    reverse_values=False,
    title="Biased - model feature dependence"
)

# Apply debiasing mitigation method
sex_privileged_groups = [{'sex_Male': 1}]
sex_unprivileged_groups = [{'sex_Male': 0}]
# Metric for the train dataset

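# The snippet above breaks off right after defining the privileged and
# unprivileged groups. A plausible continuation, sketched with AIF360's
# BinaryLabelDatasetMetric; `train_dataset` is a hypothetical
# BinaryLabelDataset holding the same training data:
from aif360.metrics import BinaryLabelDatasetMetric

metric_train = BinaryLabelDatasetMetric(
    train_dataset,
    unprivileged_groups=sex_unprivileged_groups,
    privileged_groups=sex_privileged_groups)
# mean_difference() reports the gap in favorable-outcome rates between
# the two groups; 0 means parity
print("Mean difference:", metric_train.mean_difference())
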
## code from https://blog.fastforwardlabs.com/2017/03/09/fairml-auditing-black-box-predictive-models.html
## only lightly modified
import pandas as pd
from sklearn.linear_model import LogisticRegression
from fairml import audit_model

df = pd.read_csv("~/Desktop/german_credit.csv")

# separate the target column and drop the stray index column
y = df['class']
df.drop('class', 1, inplace=True)
df.drop('Unnamed: 0', 1, inplace=True)

clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(df.values, y)

importances, _ = audit_model(clf.predict, df)
print(importances)

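# The blog post stops at printing the importances. A minimal sketch of
# plotting them, assuming plot_dependencies is importable from fairml as
# the calls elsewhere in this file suggest; the filename is arbitrary:
import matplotlib.pyplot as plt
from fairml import plot_dependencies

fig = plot_dependencies(
    importances.median(),
    reverse_values=False,
    title="FairML feature dependence, German credit"
)
plt.savefig("fairml_german_credit.png", transparent=False,
            bbox_inches='tight')
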
plt.title("ROC curve on test set for models") plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.legend(loc="lower right") plt.savefig('roc_curve_all_models.png', transparent=False, bbox_inches='tight') plt.clf() ########################### # Now let's audit each model. ########################## for key in classifiers_dict: print("auditing model {}".format(key)) importancies, _ = audit_model(classifiers_dict[key].predict, propublica_data) # generate feature dependence plot _ = plot_dependencies( importancies.median(), reverse_values=False, title="FairML feature dependence for {} model".format(key)) file_name = "{}_feature_dependence_model.png".format(key) plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250) plt.clf()
def run(settings):
    # Extract settings
    exp = settings['title']
    m = int(settings['columns'])
    n = int(settings['samples'])
    biased = settings['biased'] != 'False'
    delta = float(settings['delta'])
    eps = float(settings['epsilon'])
    # p_y_A = float(settings['proby'])
    # p_a = float(settings['proba'])
    p = float(settings['p'])

    output_filename = "{}/output/{}_output.csv".format(directory, exp)
    validation_filename = "{}/validation/{}.csv".format(directory, exp)

    vf = open(validation_filename, "a")
    # vf.write('m,n,eps,p_y_A,p_a,p_biased,p_unbiased,x_corr,a_corr\n')
    vf.write('m,n,delta,eps,p\n')

    # Keep a record of the generated data and audit results
    with open(output_filename, 'w') as f:
        column_names = ['X{}'.format(str(i)) for i in range(m)] + ['A']
        f.write(','.join(column_names) + '\n')

        for _ in range(num_trials):
            # Generate dataset
            # df = spg.generate_dataset(exp, m, n, biased, eps, p_y_A, p_a, p)
            # df = csg.generate_dataset(m, n, biased, eps, delta, p)
            df = cg.generate_dataset(m, n, biased, delta, p)
            validated = cg.validate_dataset(df, biased)
            vf.write(','.join([str(round(i, 4)) for i in validated]) + '\n')

            output = df.O.values
            df = df.drop("O", 1)

            # quick setup of logistic regression
            # perhaps use a more complex classifier
            clf = LogisticRegression(penalty='l2', C=0.01)
            clf.fit(df.values, output)

            # call audit model
            importances, _ = audit_model(clf.predict, df)
            f.write(','.join([str(get_repr(importances[i]))
                              for i in column_names]) + '\n')

    vf.close()

    results = pd.read_csv(output_filename)
    exp_name, exp_trial = exp.split("-")
    results_filename = "{}/results/{}_results.csv".format(directory, exp_name)

    # Count how often the protected attribute A receives the largest
    # absolute importance (a false positive when the data is unbiased)
    results = results.abs()
    results['max'] = results.idxmax(axis=1)
    results_columns = ['delta', 'eps', 'FP']
    fp_count = results[results['max'] == 'A'].count()['A']

    write_header = not os.path.exists(results_filename)
    with open(results_filename, "a") as results_file:
        if write_header:
            results_file.write(','.join(results_columns) + '\n')
        results_file.write(','.join([str(delta), str(eps), str(fp_count)]) + '\n')

# `bb` is a black-box predict function assumed to be defined elsewhere:
# it takes an (n_samples, n_features) array and returns predictions.
a = np.array([[1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
              [3, 1, 2, 7, 4, 5, 2, 6, 2, 3, 4, 5],
              [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1]], dtype=float)

# transpose so that rows are samples and columns are features
ca = np.swapaxes(a, 0, 1)
print("ca")
print(ca)

cout = bb(ca)
print("out")
print(cout)

ca = pd.DataFrame(ca)

# audit the black box; the second return value holds the
# direct-perturbation importances
imp, dir_imp = audit_model(bb, ca)

fig = plot_dependencies(imp.median(),
                        reverse_values=False,
                        title="test_LR_1")
file_name = "test_LR_1.png"
plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)

fig = plot_dependencies(dir_imp.median(),
                        reverse_values=False,
                        title="test_LR_1 direct perturbation")
file_name = "test_LR_1_dir.png"
plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)

clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(ca.values, cout)

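# For reference, a minimal stand-in for the black box `bb` audited above;
# this linear-threshold rule is hypothetical, invented only so the
# snippet can run end to end:
def bb(x):
    x = np.asarray(x, dtype=float)
    # predict 1 when the first feature outweighs half of the second
    return (x[:, 0] - 0.5 * x[:, 1] > 0).astype(float)
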
scores = cross_val_score(logisticRegr, X, Y, cv=5)
print('Cross-validated scores', scores)

accuracy = logisticRegr.score(x_test, y_test)
print('Cross-predicted Accuracy:', accuracy)

cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5,
            square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Scores: {0}'.format(accuracy)
plt.title(all_sample_title, size=15)

importances, _ = audit_model(logisticRegr.predict, X)
print(importances)
plot_dependencies(
    importances.median(),
    reverse_values=False,
    title="Model feature dependence"
)

CorrMatrix = df.corr(method="pearson", min_periods=1)

def main():
    compas_scores_two_years = pd.read_csv(
        r"C:\Users\marin\Desktop\UNICAMP\IC\ML-Fairness\fairness\compas_dataset\compas-scores-two-years.csv")
    compas_scores_two_years = compas_scores_two_years[[
        'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
        'priors_count', 'days_b_screening_arrest', 'decile_score',
        'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']]

    # keep only rows matching ProPublica's filtering criteria
    compas_scores_two_years = compas_scores_two_years[
        (compas_scores_two_years['days_b_screening_arrest'] <= 30) &
        (compas_scores_two_years['days_b_screening_arrest'] >= -30) &
        (compas_scores_two_years['is_recid'] != -1) &
        (compas_scores_two_years['c_charge_degree'] != 'O') &
        (compas_scores_two_years['score_text'] != 'N/A')]

    # X-RAY: basic distribution checks
    quant_race = compas_scores_two_years['race'].value_counts()
    quant_age = compas_scores_two_years['age'].value_counts()
    quant_score = compas_scores_two_years['score_text'].value_counts()
    quant_sex = compas_scores_two_years['sex'].value_counts()
    quant_2yr = compas_scores_two_years['two_year_recid'].value_counts()

    tab_scoretext_race = pd.crosstab(compas_scores_two_years['score_text'],
                                     compas_scores_two_years['race'])
    tab_decilescore_race = pd.crosstab(compas_scores_two_years['decile_score'],
                                       compas_scores_two_years['race'])

    distribution = sns.countplot(
        x='decile_score', hue='race',
        data=compas_scores_two_years.loc[
            (compas_scores_two_years['race'] == 'African-American') |
            (compas_scores_two_years['race'] == 'Caucasian'), :])
    plt.title("Distribution of Decile Scores by Race")
    plt.xlabel('Decile Score')
    plt.ylabel('Count')

    # LOGISTIC REGRESSION
    categ_cols = ['score_text', 'age_cat', 'sex', 'race', 'c_charge_degree']
    compas_scores_two_years.loc[:, categ_cols] = \
        compas_scores_two_years.loc[:, categ_cols].astype('category')
    df_Dummies = pd.get_dummies(data=compas_scores_two_years, columns=categ_cols)

    # Clean column names
    new_column_names = [col.lstrip().rstrip().lower()
                        .replace(" ", "_").replace("-", "_")
                        for col in df_Dummies.columns]
    df_Dummies.columns = new_column_names

    # We want another variable that combines Medium and High
    df_Dummies['score_text_medhi'] = (df_Dummies['score_text_medium'] +
                                      df_Dummies['score_text_high'])

    # R-style specification
    formula = ('score_text_medhi ~ sex_female + age_cat_greater_than_45 + '
               'age_cat_less_than_25 + race_african_american + race_asian + '
               'race_hispanic + race_native_american + race_other + '
               'priors_count + c_charge_degree_m + two_year_recid')
    score_mod = logit(formula, data=df_Dummies).fit()

    # convert the fitted log-odds into relative probabilities
    control = np.exp(-1.5255) / (1 + np.exp(-1.5255))
    # Black defendants
    black = np.exp(0.4772) / (1 - control + (control * np.exp(0.4772)))
    # Female defendants
    female = np.exp(0.2213) / (1 - control + (control * np.exp(0.2213)))
    # Younger than 25
    younger = np.exp(1.3084) / (1 - control + (control * np.exp(1.3084)))

    ## FAIRML
    propublica_data = pd.read_csv(
        r"C:\Users\marin\Desktop\UNICAMP\IC\ML-Fairness\fairness\compas_dataset\propublicaCompassRecividism_data_fairml.csv\propublica_data_for_fairml.csv")

    # create feature and design matrix for model building
    compas_rating = propublica_data.score_factor.values
    propublica_data = propublica_data.drop("score_factor", 1)

    # Train simple model
    clf = LogisticRegression(penalty='l2', C=0.01)
    clf.fit(propublica_data.values, compas_rating)

    # call audit model with model
    total, _ = audit_model(clf.predict, propublica_data)

    # print feature importance
    print(total)

    # generate feature dependence plot
    fig = plot_dependencies(
        total.get_compress_dictionary_into_key_median(),
        reverse_values=False,
        title="FairML feature dependence"
    )
    plt.savefig("fairml_ldp.eps", transparent=False, bbox_inches='tight')