Example #1
    def handle(self, *args, **kwargs):
        plt.style.use('ggplot')
        plt.figure(figsize=(6, 6))
        TARGET_JOB = 59
        job = Job.objects.get(pk=TARGET_JOB)

        training_df, test_df = get_encoded_logs(job)

        X_train = training_df.drop(['trace_id', 'label'], axis=1)
        Y_train = training_df['label'].values

        clf = DecisionTreeClassifier()
        clf.fit(X_train, Y_train)

        # audit the trained classifier with FairML
        importances, _ = audit_model(clf.predict, X_train)
        print(importances)

        # generate feature dependence plot
        fig = plot_dependencies(
            importances.median(),
            reverse_values=False,
            title="FairML feature dependence plot"
        )

        file_name = "fairml_plot_train_1_3_decision_tree.png"
        plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=550)
Example #2
@app.route('/', methods=['GET', 'POST'])  # assumed route registration, not shown in the fragment
def index():
    if request.method == 'GET':
        # User is requesting the form
        return render_template('form.html')
    elif request.method == 'POST':
        data = pd.read_csv(request.files['datafile'])
        model_pickle = request.files['modelfile']
        model = pickle.load(model_pickle)
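        # NOTE (added): pickle.load on an untrusted upload can execute
        # arbitrary code; only accept model files from trusted sources.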

        #  call audit model with model
        importances, _ = audit_model(model.predict, data)

        importances_dict = {}
        for key, value in dict(importances).items():
            importances_dict[key] = np.median(np.array(value))

        data_dict = {'importances': importances_dict}
        data_dict['dataset'] = request.files['datafile'].filename
        data_dict['model'] = request.files['modelfile'].filename
        data_dict['public_sharing'] = False

        # insert_one is the modern PyMongo API (insert() is deprecated)
        document_id = collection.insert_one(data_dict).inserted_id

        features_array = list(importances_dict.keys())
        importances_array = list(importances_dict.values())

        return render_template('result.html', document_id=document_id, features_array=json.dumps(features_array), 
            importances_array=json.dumps(importances_array))
Example #3
def get_audit_local(data_prefix, exp_prefix):

    data_file = './algs/corels/data/%s/local/_auditing_train.csv' % (
        data_prefix)
    scores_file = './algs/corels/data/%s/local/_scores_train.csv' % (
        data_prefix)
    dataset = pd.read_csv(data_file)
    scores = pd.read_csv(scores_file)
    models_files = './res_local/%s/%s/%s_corels.mdl' % (
        data_prefix, exp_prefix, data_prefix)
    train_file = './res_local/%s/%s/%s_train.txt' % (data_prefix, exp_prefix,
                                                     data_prefix)

    unfairness_results = './res_local/%s/%s/unfairness_train.dump' % (
        data_prefix, exp_prefix)
    accuracy_results = './res_local/%s/%s/accuracy_train.dump' % (data_prefix,
                                                                  exp_prefix)
    audit_file = './res_local/%s/%s/%s_audit.png' % (data_prefix, exp_prefix,
                                                     data_prefix)
    # Load the saved models and metrics
    adult_mdl = joblib.load(models_files)
    acc = joblib.load(accuracy_results)
    diff_metric = joblib.load(unfairness_results)
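    # pick the model closest to the ideal point (unfairness = 0, accuracy = 1)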
    distance = []
    for i in range(len(diff_metric)):
        d = (diff_metric[i])**2 + (1 - acc[i])**2
        distance.append(d)
    idx = np.argmin(distance)

    y_pred, acc = adult_mdl.predict(train_file)

    clf = LogisticRegression(penalty='l2', C=0.01)
    clf.fit(dataset.values, y_pred[:, idx].astype(int))

    #  call audit model with model
    total, _ = audit_model(clf.predict, dataset)

    # generate feature dependence plot
    fig = plot_dependencies(total.median(),
                            reverse_values=False,
                            title="FairML feature dependence",
                            fig_size=(6, 9))
    plt.savefig(audit_file, transparent=False, bbox_inches='tight')
Example #4
@app.route('/', methods=['GET', 'POST'])  # assumed route registration, not shown in the fragment
def index():
    if request.method == 'GET':
        # User is requesting the form
        return render_template('form.html')
    elif request.method == 'POST':
        # User has sent us data: deserialize the model and load the CSV
        # (the uploaded file objects themselves have no predict method)
        model = pickle.load(request.files['model'])
        data = pd.read_csv(request.files['data'])

        #  call audit model with model
        importances, _ = audit_model(model.predict, data)

        # print feature importance
        print(importances)
        fig = plot_dependencies(
            importances.get_compress_dictionary_into_key_median(),
            reverse_values=False,
            title="FairML feature dependence")

        plt.show()

        # build a simple status message for the result template
        message = "Audit complete for {}".format(request.files['model'].filename)
        return render_template('result.html', message=message)
Example #5
import pandas as pd
from sklearn.linear_model import LogisticRegression
from fairml import audit_model

propublica_data = pd.read_csv(
    filepath_or_buffer="./doc/example_notebooks/"
    "propublica_data_for_fairml.csv",
    sep=",",
    header=0)

# quick processing: split the target from the features
compas_rating = propublica_data.score_factor.values
propublica_data = propublica_data.drop("score_factor", axis=1)

#  quick setup of logistic regression;
#  a more expressive classifier could be used instead
clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(propublica_data.values, compas_rating)

#  double check list of input data columns
print(list(propublica_data.columns))

#  call audit_model with its options spelled out explicitly
total, _ = audit_model(
    clf.predict,
    propublica_data,
    distance_metric="regression",
    direct_input_pertubation="constant median",
    number_of_runs=10,
    include_interactions=False,
    external_data_set=None
)
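# As the parameter names suggest, number_of_runs controls how many
# perturbation passes audit_model averages over, while include_interactions
# would additionally probe pairwise feature interactions.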

print(total)
Example #6
else:
    adult_rating = adult_data.credit_rating.values
    adult_data = adult_data.drop("credit_rating", axis=1)

gen_both = True
# we fit a quick and dirty logistic regression sklearn
# model here.
if gen_both:
    clf = LogisticRegression(penalty='l2', C=0.01)
    clf.fit(adult_data.values, adult_mdl.predict(adult_data.values))

    #  audit both the surrogate and the original black-box model
    total, _ = audit_model(clf.predict, adult_data)
    totalbis, _ = audit_model(adult_mdl.predict, adult_data)

    # get the corresponding median-importance dictionaries
    featuresDict = total.median()
    featuresDictBis = totalbis.median()
    featuresDictBis = correctSign(featuresDict, featuresDictBis)

    # difference between the two audits (should be normalized)
    diff = computeDiff(featuresDict, featuresDictBis)

    featuresDict = cleanFeaturesDict(featuresDict, nbFeat)
    featuresDictBis = cleanFeaturesDict(featuresDictBis, nbFeat)
Example #7
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# fairml imports, as used in the calls below
from fairml import audit_model, plot_dependencies

plt.style.use('ggplot')
plt.figure(figsize=(6, 6))

# read in propublica data
propublica_data = pd.read_csv("./doc/example_notebooks/"
                              "propublica_data_for_fairml.csv")

# quick data processing
compas_rating = propublica_data.score_factor.values
propublica_data = propublica_data.drop("score_factor", axis=1)

#  quick setup of logistic regression;
#  a more expressive classifier could be used instead
clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(propublica_data.values, compas_rating)

#  call audit model
importances, _ = audit_model(clf.predict, propublica_data)

# print feature importance
print(importances)

# generate feature dependence plot
fig = plot_dependencies(
    importances.median(),
    reverse_values=False,
    title="FairML feature dependence logistic regression model")

file_name = "fairml_propublica_linear_direct.png"
plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)
Example #8
# separate the target column from the features
bankruptcy_rating = bankruptcy.Bankrupt.values
bankruptcy_data = bankruptcy.drop("Bankrupt", axis=1)


# this is just for demonstration, any classifier or regressor
# can be used here. fairml only requires a predict function
# to diagnose a black-box model.
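# As a minimal added sketch of that point (not part of the original
# snippet): any callable exposing predict(X) -> labels can be audited,
# e.g. a random forest fit on the same data.
from sklearn.ensemble import RandomForestClassifier  # assumed import
rf = RandomForestClassifier(n_estimators=50)
rf.fit(bankruptcy_data.values, bankruptcy_rating)
rf_importances, _ = audit_model(rf.predict, bankruptcy_data)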

# we fit a quick and dirty logistic regression sklearn
# model here.
clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(bankruptcy_data.values, bankruptcy_rating)

#  call audit model with model
total, _ = audit_model(clf.predict, bankruptcy_data)

# print feature importance
print(total)

plt.figure(figsize=(20,10))

# generate feature dependence plot
fig = plot_dependencies(
    total.median(),
    reverse_values=False,
    title="FairML feature dependence"
)
#plt.savefig("fairml_ldp.eps", transparent=False, bbox_inches='tight')
plt.show()
Example #9
accuracy = biasedReg.score(x_test, y_test)
print('Sex biased - Cross-predicted Accuracy:', accuracy)

#confusion matrix assessment biased model
cm_biased = metrics.confusion_matrix(y_test, y_pred_biased)

plt.figure(figsize=(9, 9))
sns.heatmap(cm_biased, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Scores: {0}'.format(accuracy)
plt.title(all_sample_title, size=15)


# XAI: feature importance
importances, _ = audit_model(biasedReg.predict, X)

print(importances)

plot_dependencies(
    importances.median(),
    reverse_values=False,
    title="Biased - model feature dependence"
)


#Apply debiasing mitigation method
sex_privileged_groups = [{'sex_Male': 1}]
sex_unprivileged_groups = [{'sex_Male': 0}]

# Metric for the train dataset
Example #10
## code from https://blog.fastforwardlabs.com/2017/03/09/fairml-auditing-black-box-predictive-models.html
## only lightly modified

import pandas as pd

from sklearn.linear_model import LogisticRegression
from fairml import audit_model

df = pd.read_csv("~/Desktop/german_credit.csv")
y = df['class']
df.drop('class', axis=1, inplace=True)
df.drop('Unnamed: 0', axis=1, inplace=True)

clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(df.values, y)

importances, _ = audit_model(clf.predict, df)
print(importances)
Example #11
plt.title("ROC curve on test set for models")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('roc_curve_all_models.png', transparent=False, bbox_inches='tight')
plt.clf()

###########################

# Now let's audit each model.

##########################
for key in classifiers_dict:

    print("auditing model {}".format(key))

    importances, _ = audit_model(classifiers_dict[key].predict,
                                 propublica_data)

    # generate feature dependence plot
    _ = plot_dependencies(
        importances.median(),
        reverse_values=False,
        title="FairML feature dependence for {} model".format(key))

    file_name = "{}_feature_dependence_model.png".format(key)
    plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)
    plt.clf()
Example #12
def run(settings):
  # Extract Settings
  exp = settings['title']
  m = int(settings['columns'])
  n = int(settings['samples'])
  biased = settings['biased'] != 'False'
  delta = float(settings['delta'])
  eps = float(settings['epsilon'])
  p = float(settings['p'])

  output_filename = "{}/output/{}_output.csv".format(directory, exp)
  validation_filename = "{}/validation/{}.csv".format(directory, exp)
  vf = open(validation_filename, "a")
  vf.write('m,n,delta,eps,p\n')

  # Keep record of data
  with open(output_filename, 'w') as f:
    column_names = ['X{}'.format(str(i)) for i in range(m)] + ['A']
    f.write(','.join(column_names) + '\n')

    for _ in range(num_trials):
      # Generate Dataset
      df = cg.generate_dataset(m, n, biased, delta, p)
      validated = cg.validate_dataset(df, biased)
      vf.write(','.join([str(round(i, 4)) for i in validated]) + '\n')

      output = df.O.values
      df = df.drop("O", axis=1)

      #  quick setup of logistic regression;
      #  a more expressive classifier could be used instead
      clf = LogisticRegression(penalty='l2', C=0.01)
      clf.fit(df.values, output)
      #  call audit model
      importances, _ = audit_model(clf.predict, df)

      f.write(','.join([str(get_repr(importances[i])) for i in column_names]) + '\n')
  vf.close()
  results = pd.read_csv(output_filename)
  exp_name, exp_trial = exp.split("-")
  results_filename = "{}/results/{}_results.csv".format(directory, exp_name)

  results = results.abs()
  results['max'] = results.idxmax(axis=1)

  results_columns = ['delta', 'eps', 'FP']
  fp_count = results[results['max'] == 'A'].count()['A']

  write_header = not os.path.exists(results_filename)

  with open(results_filename, "a") as results_file:
    if write_header:
      results_file.write(','.join(results_columns) + '\n')

    results_file.write(','.join([str(delta), str(eps), str(fp_count)]) + '\n')
Example #13
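# The fragment below assumes a black-box scoring function bb defined
# elsewhere; as a hypothetical stand-in (added for illustration only),
# threshold the first feature. Imports are assumed as in the other examples.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from fairml import audit_model, plot_dependencies


def bb(X):
    # toy black box: predict 1 when the first feature exceeds 0.5
    return (np.asarray(X)[:, 0] > 0.5).astype(int)
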
a = np.array([[1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
              [3, 1, 2, 7, 4, 5, 2, 6, 2, 3, 4, 5],
              [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1]],
             dtype=float)
ca = np.swapaxes(a, 0, 1)

print("ca")
print(ca)

cout = bb(ca)

print("out")
print(cout)

ca = pd.DataFrame(ca)
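
# audit the black box: the second return value holds the
# direct-perturbation importances, plotted separately below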
imp, dir_imp = audit_model(bb, ca)

fig = plot_dependencies(imp.median(), reverse_values=False, title="test_LR_1")

file_name = "test_LR_1.png"
plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)

fig = plot_dependencies(dir_imp.median(),
                        reverse_values=False,
                        title="test_LR_1 direct perturbation")

file_name = "test_LR_1_dir.png"
plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)

clf = LogisticRegression(penalty='l2', C=0.01)
clf.fit(ca.values, cout)
Example #14

scores = cross_val_score(logisticRegr, X, Y, cv=5)
print('Cross-validated scores:', scores)

accuracy = logisticRegr.score(x_test, y_test)
print('Cross-predicted Accuracy:', accuracy)

cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Scores: {0}'.format(accuracy)
plt.title(all_sample_title, size=15)


importances, _ = audit_model(logisticRegr.predict, X)

print(importances)

plot_dependencies(
    importances.median(),
    reverse_values=False,
    title="Model feature dependence"
)


CorrMatrix = df.corr(method="pearson", min_periods=1)
Example #15
def main():
    compas_scores_two_years = pd.read_csv(r"C:\Users\marin\Desktop\UNICAMP\IC\ML-Fairness\fairness\compas_dataset\compas-scores-two-years.csv")

    compas_scores_two_years = compas_scores_two_years[['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count', 
             'days_b_screening_arrest', 'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']]

    compas_scores_two_years = compas_scores_two_years[(
        (compas_scores_two_years['days_b_screening_arrest'] <= 30) &
        (compas_scores_two_years['days_b_screening_arrest'] >= -30) &
        (compas_scores_two_years['is_recid'] != -1) &
        (compas_scores_two_years['c_charge_degree'] != 'O') &
        (compas_scores_two_years['score_text'] != 'N/A'))]

    # X-RAY: quick counts of the key columns
    quant_race = compas_scores_two_years['race'].value_counts()
    quant_age = compas_scores_two_years['age'].value_counts()
    quant_score = compas_scores_two_years['score_text'].value_counts()
    quant_sex = compas_scores_two_years['sex'].value_counts()
    quant_2yr = compas_scores_two_years['two_year_recid'].value_counts()

    tab_scoretext_race = pd.crosstab(compas_scores_two_years['score_text'],compas_scores_two_years['race'])
    tab_decilescore_race = pd.crosstab(compas_scores_two_years['decile_score'],compas_scores_two_years['race'])
    # print(tab_decilescore_race)

    distribution = sns.countplot(
        x='decile_score', hue='race',
        data=compas_scores_two_years.loc[
            (compas_scores_two_years['race'] == 'African-American') |
            (compas_scores_two_years['race'] == 'Caucasian'), :])
    plt.title("Distribution of Decile Scores by Race")
    plt.xlabel('Decile Score')
    plt.ylabel('Count')
    # plt.show()

    # LOGISTIC REGRESSION
    categ_cols = ['score_text','age_cat','sex','race','c_charge_degree']
    compas_scores_two_years.loc[:,categ_cols] = compas_scores_two_years.loc[:,categ_cols].astype('category')

    df_Dummies = pd.get_dummies(data = compas_scores_two_years, columns=categ_cols)

    # Clean column names
    new_column_names = [col.lstrip().rstrip().lower().replace(" ", "_").replace("-", "_") for col in df_Dummies.columns]
    df_Dummies.columns = new_column_names

    # We want another variable that combines Medium and High
    df_Dummies['score_text_medhi'] = df_Dummies['score_text_medium'] + df_Dummies['score_text_high']
    
    # R-style specification
    formula = 'score_text_medhi ~ sex_female + age_cat_greater_than_45 + age_cat_less_than_25 + race_african_american + race_asian + race_hispanic + race_native_american + race_other + priors_count + c_charge_degree_m + two_year_recid'

    score_mod = logit(formula, data = df_Dummies).fit()
    # print(score_mod.summary())

    # convert logit coefficients (hardcoded from the fitted model summary)
    # into relative risks against the control group's baseline probability
    control = np.exp(-1.5255) / (1 + np.exp(-1.5255))
    # Black defendants
    black = np.exp(0.4772) / (1 - control + (control * np.exp(0.4772)))
    # Female defendants
    female = np.exp(0.2213) / (1 - control + (control * np.exp(0.2213)))
    # Younger than 25
    younger = np.exp(1.3084) / (1 - control + (control * np.exp(1.3084)))
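    # In general, an odds ratio exp(b) converts to a relative risk via
    # RR = exp(b) / (1 - p0 + p0 * exp(b)), with p0 the control group's
    # baseline probability, which is the pattern applied to each group above.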

    ## FAIRML
    propublica_data = pd.read_csv(r"C:\Users\marin\Desktop\UNICAMP\IC\ML-Fairness\fairness\compas_dataset\propublicaCompassRecividism_data_fairml.csv\propublica_data_for_fairml.csv")

    # create feature and design matrix for model building.
    compas_rating = propublica_data.score_factor.values
    propublica_data = propublica_data.drop("score_factor", axis=1)

    # Train simple model
    clf = LogisticRegression(penalty='l2', C=0.01)
    clf.fit(propublica_data.values, compas_rating)

    #  call audit model with model
    total, _ = audit_model(clf.predict, propublica_data)

    # print feature importance
    print(total)

    # generate feature dependence plot
    fig = plot_dependencies(
        total.get_compress_dictionary_into_key_median(),
        reverse_values=False,
        title="FairML feature dependence"
    )
    plt.savefig("fairml_ldp.eps", transparent=False, bbox_inches='tight')