Example #1
0
def load_optimpreproc_compas_dataset():
    """
    Load the preprocessed Compas data set and split it in two.

    The split point is 80% of the number of instance names, truncated to an
    integer, and is handed to the dataset's own ``split`` method.

    :return: tuple ``(train, test)`` with the two parts returned by ``split``
    """
    compas = load_preproc_data_compas()
    # Position of the cut: 80% of the instances, rounded down.
    cut = int(0.8 * len(compas.instance_names))
    train, test = compas.split([cut])
    return train, test
def getSmallCompasDataset():
    """
    Build the small Compas domain together with its sample matrix.

    The samples are the feature matrix of the preprocessed Compas data
    (loaded with 'sex' as protected attribute) with the first label column
    appended as the last column.

    :return: tuple ``(simpleDomain, simpleSamples)``
    """
    # Only the preprocessed data set is read below, so the raw
    # ``CompasDataset`` is not constructed here (it would be loaded and then
    # thrown away).
    dataset_orig = load_preproc_data_compas(['sex'])

    # Attribute names of the domain: five features followed by the label.
    features = ['sex', 'race', 'age', 'priors_count', 'c_charge_degree']
    domainArray = getSmallCompasDomain()
    features.append(dataset_orig.label_names[0])
    simpleDomain = Domain(features, domainArray)

    # Take the first entry of every label row and append it as a column.
    labels = [y[0] for y in dataset_orig.labels]
    simpleSamples = np.c_[dataset_orig.features, labels]

    return simpleDomain, simpleSamples
Example #3
0
def main():
    """
    Run the experiment with and without reweighing and save both results.

    The reweighed configuration is evaluated first, then the plain one; the
    five dataframes of each configuration are passed to ``save_output`` in
    that order.
    """
    # load dataset
    compas = load_preproc_data_compas()

    # Privileged and unprivileged groups are defined on 'sex'.  To test with
    # race instead, use [{'race': 1}] and [{'race': 0}] respectively.
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]

    # number of runs used for testing
    n_runs = 10

    def evaluate(reweigh):
        # One full pass: run the experiment, then convert the raw results
        # into the accuracy dataframe and the four fairness dataframes.
        accuracy, fairness = run(compas, n_runs, privileged_groups,
                                 unprivileged_groups, reweigh)
        accuracy_df = accuracy_dataframe(accuracy)
        adversarial, prejudice, neural_network, ensemble = \
            fairness_metrics_dataframe(n_runs, fairness)
        return accuracy_df, adversarial, prejudice, neural_network, ensemble

    with_reweighing = evaluate(True)
    without_reweighing = evaluate(False)

    # save output to csv
    save_output(*(with_reweighing + without_reweighing))
def getLargeCompasDataset():
    """
    Build the large Compas domain and its encoded record matrix.

    Rows come from ``read_dataset()``.  The last column is dropped, the
    column at position 11 is removed and the column at position 10 is moved
    to the end.  Every remaining value is then encoded by its column name:

    * ``sex``: 1 for 'Male', 0 otherwise
    * ``age``: 0 (<= 25), 1 (<= 65), 2 (above)
    * ``race``: 1 for the race at position 3 of the distinct-race list,
      0 otherwise
    * ``priors_count``: 0 (<= 0), then one bucket per 10 up to 40, 5 above
    * ``days_in_jail``: value / 12, bucketed at 0, 3, 6, 12, 24, 48, 60
    * ``c_charge_degree``: position in the distinct-value list
    * anything else: ``float(value)``

    :return: tuple ``(simpleDomain, nrecords)`` where ``nrecords`` is the
        array of encoded records and ``simpleDomain`` is a ``Domain`` built
        from the header and the distinct encoded values of every column
    """
    from bisect import bisect_left

    # Inclusive upper bounds of the buckets.  ``bisect_left`` returns the
    # index of the first bound that is >= the value, i.e. the bucket number;
    # values above the last bound land in one extra final bucket.
    age_bounds = [25, 65]
    priors_bounds = [0, 10, 20, 30, 40]
    jail_bounds = [0, 3, 6, 12, 24, 48, 60]

    head, _types, records = read_dataset()
    records = np.array(records)

    # Drop the last column from the header and from every record.
    head = head[:-1]
    records = np.delete(records, -1, axis=1)

    def reorder(record):
        # Remove column 11 and move column 10 to the end (in place).
        record.pop(11)
        record.append(record.pop(10))
        return record

    records = [reorder(list(record)) for record in records]
    head = reorder(head)

    records = np.array(records)

    # NOTE(review): both lists take their order from a set of strings, so
    # the position of a value (and therefore which race is encoded as 1 and
    # the codes of c_charge_degree) is not guaranteed to be the same from
    # one interpreter run to the next.  Confirm the intended race and
    # consider pinning the order explicitly.
    races = list(set(records[:, 2]))
    print(races)
    fmo = list(set(records[:, 9]))

    def encode(name, raw):
        # Translate one raw cell into its numeric code.
        if name == "sex":
            return 1 if raw == 'Male' else 0
        if name == "age":
            return bisect_left(age_bounds, int(raw))
        if name == "race":
            return 1 if races.index(raw) == 3 else 0
        if name == "priors_count":
            return bisect_left(priors_bounds, int(raw))
        if name == "days_in_jail":
            # NOTE(review): the column is named in days but is divided by
            # 12 before being bucketed as "months" -- confirm the unit.
            return bisect_left(jail_bounds, int(raw) / 12.0)
        if name == "c_charge_degree":
            return fmo.index(raw)
        return float(raw)

    nrecords = []
    # One set of distinct encoded values per column.
    domainArray = [set() for _ in head]
    for record in records:
        encoded = []
        for j, (raw, name) in enumerate(zip(record, head)):
            entry = encode(name, raw)
            domainArray[j].add(entry)
            # Every code is a scalar, so it is appended directly.
            encoded.append(entry)
        nrecords.append(np.array(encoded))

    nrecords = np.array(nrecords)

    domainArray = [np.array(list(uvs)) for uvs in domainArray]

    simpleDomain = Domain(head, domainArray)

    return simpleDomain, nrecords
Example #5
0
    return row


#%%
# Command-line configuration: first argument is parsed as a float,
# second argument is used as a string.
TAU = float(sys.argv[1])  # e.g. 0.9
SENSITIVE_ATTRIBUTE = str(sys.argv[2])  # e.g. 'sex'

#%%
#DOMAIN = [2, 3, 3, 2, 2]
DOMAIN = [1, 3, 3, 1, 1]
EMB_SIZE = 5

# File name built from the sensitive attribute and TAU.
#NAME = 'dt_trick2_compas_small_{}_{}.json'.format(SENSITIVE_ATTRIBUTE, TAU)
NAME = 'dt_compas_small_{}_{}.json'.format(SENSITIVE_ATTRIBUTE, TAU)

# NOTE(review): the data set is always loaded with 'sex', independent of
# SENSITIVE_ATTRIBUTE -- confirm this is intended.
dataset = load_preproc_data_compas(['sex'])
# Keep only the first element returned by convert_to_dataframe().
dataset_df = dataset.convert_to_dataframe()[0]
# Put the columns into a fixed order: sex, race, the three age_cat columns,
# the three priors_count columns, the two c_charge_degree columns, the label.
dataset_df = dataset_df.reindex(columns=[
    'sex', 'race', 'age_cat=Less than 25', 'age_cat=25 to 45',
    'age_cat=Greater than 45', 'priors_count=0', 'priors_count=1 to 3',
    'priors_count=More than 3', 'c_charge_degree=F', 'c_charge_degree=M',
    'two_year_recid'
])

# Change to interval thresholds: ``interval`` is applied to every row of the
# age_cat columns (2:5) and of the priors_count columns (5:8), and the
# result is written back in place.
dataset_df.iloc[:, 2:5] = np.apply_along_axis(interval, 1,
                                              dataset_df.iloc[:, 2:5].values)
dataset_df.iloc[:, 5:8] = np.apply_along_axis(interval, 1,
                                              dataset_df.iloc[:, 5:8].values)
Example #6
0
def LoadData(dataset_name,protected_attribute_name,raw=True):
	"""
	Load a dataset together with its group definitions and optimizer options.

	:param dataset_name: "adult", "german" or "compas"
	:param protected_attribute_name: "sex" or "race" for adult and compas,
		"sex" or "age" for german
	:param raw: if truthy, build the raw dataset class (``AdultDataset``,
		``GermanDataset`` or ``CompasDataset``); otherwise call the matching
		``load_preproc_data_*`` loader with the protected attribute
	:return: tuple ``(dataset, privileged_groups, unprivileged_groups,
		optim_options)``
	:raises SystemExit: after printing a message, when the combination of
		dataset and protected attribute is not supported
	"""
	import sys

	supported_attributes = {
		"adult": ("sex", "race"),
		"german": ("sex", "age"),
		"compas": ("sex", "race"),
	}

	# Validate first, so that an unsupported combination always ends with
	# the message below instead of loading a dataset for nothing or failing
	# on a dataset variable that was never assigned.
	if protected_attribute_name not in supported_attributes.get(dataset_name, ()):
		print('No such dataset & group option:', dataset_name, protected_attribute_name)
		sys.exit()

	if dataset_name == "adult":
		if raw:
			dataset_original = AdultDataset()
		else:
			dataset_original = load_preproc_data_adult([protected_attribute_name])
		distortion_fun = get_distortion_adult
	elif dataset_name == "german":
		if raw:
			dataset_original = GermanDataset()
		else:
			dataset_original = load_preproc_data_german([protected_attribute_name])
		# Recode the labels as 2 - label and declare 0 the unfavorable one.
		dataset_original.labels = 2 - dataset_original.labels
		dataset_original.unfavorable_label = 0.
		distortion_fun = get_distortion_german
	else:
		# Only "compas" is left after the validation above.
		if raw:
			dataset_original = CompasDataset()
		else:
			dataset_original = load_preproc_data_compas([protected_attribute_name])
		distortion_fun = get_distortion_compas

	# The options differ between datasets only in the distortion function.
	optim_options = {
		"distortion_fun": distortion_fun,
		"epsilon": 0.05,
		"clist": [0.99, 1.99, 2.99],
		"dlist": [.1, 0.05, 0]
	}

	# Groups are always returned with value 1 as privileged and 0 as
	# unprivileged.
	# NOTE(review): confirm this coding is the intended one for compas 'sex'.
	privileged_groups = [{protected_attribute_name: 1}]
	unprivileged_groups = [{protected_attribute_name: 0}]

	return dataset_original,privileged_groups,unprivileged_groups,optim_options
Example #7
0
def get_data(dataset_used="adult", protected_attribute="sex", train_size=0.7):
    """
    Load a preprocessed dataset and return feature/label frames of its splits.

    The dataset is split at ``train_size`` into a "tvt" part and a final
    test part; the "tvt" part is split again at ``train_size`` into a
    training part and a remainder, and the remainder is split at 0.5 into
    validation and test.  All splits are shuffled, and NumPy's global random
    seed is set to 1 beforehand.

    Labels are mapped to 1 (favorable) and -1 (unfavorable).

    :param dataset_used: "adult", "german" or "compas"
    :param protected_attribute: "sex" selects 'sex'; any other value selects
        'race' (adult, compas) or 'age' (german)
    :param train_size: split point passed to the first two splits
    :return: tuple ``(X_all, y_all, X_train, y_train, X_valid, y_valid,
        X_test, y_test, X_ftest, y_ftest)`` where "all" is the "tvt" part
    :raises ValueError: if ``dataset_used`` is not one of the known names
    """
    use_sex = protected_attribute == "sex"
    if dataset_used == "adult":
        dataset_orig = load_preproc_data_adult(['sex' if use_sex else 'race'])
    elif dataset_used == "german":
        dataset_orig = load_preproc_data_german(['sex' if use_sex else 'age'])
    elif dataset_used == "compas":
        dataset_orig = load_preproc_data_compas(['sex' if use_sex else 'race'])
    else:
        # Fail with a clear message instead of on an unassigned variable.
        raise ValueError(
            "unknown dataset {!r}; expected 'adult', 'german' or 'compas'"
            .format(dataset_used))

    # random seed
    np.random.seed(1)

    # Split into train, validation, and test
    dataset_orig_tvt, dataset_orig_ftest = dataset_orig.split(
        [train_size], shuffle=True)
    dataset_orig_train, dataset_orig_vt = dataset_orig_tvt.split(
        [train_size], shuffle=True)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split(
        [0.5], shuffle=True)

    label_names = dataset_orig.label_names
    # Map labels to favorable=1 and unfavorable=-1
    label_map = {
        dataset_orig.favorable_label: 1,
        dataset_orig.unfavorable_label: -1,
    }

    def to_xy(subset):
        # Convert one split to a dataframe with a fresh index and separate
        # the label column(s) from the features.
        frame, _ = subset.convert_to_dataframe()
        frame = frame.reset_index(drop=True)
        X = frame.drop(label_names, axis=1)
        y = frame[label_names[0]].map(label_map)
        return X, y

    X_all, y_all = to_xy(dataset_orig_tvt)
    X_train, y_train = to_xy(dataset_orig_train)
    X_valid, y_valid = to_xy(dataset_orig_valid)
    X_test, y_test = to_xy(dataset_orig_test)
    X_ftest, y_ftest = to_xy(dataset_orig_ftest)

    return X_all, y_all, X_train, y_train, X_valid, y_valid, X_test, y_test, X_ftest, y_ftest
#
# The protected attributes in this dataset are 'sex' and 'race'.
# For this assignment, we'll only focus on race.
#
# The label codes recidivism, which they defined as a new arrest within 2 years.
# Note that in this dataset, the label is coded with 1 being the favorable label.

# %%
# Download the ProPublica COMPAS CSV into the current directory via the
# notebook shell.
get_ipython().system(
    'curl https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv --output compas-scores-two-years.csv'
)
# Move the downloaded file into the aif360 raw-data folder.
# NOTE(review): the destination is an absolute path on one specific machine;
# it has to be adapted before running this cell elsewhere.
get_ipython().system(
    'mv compas-scores-two-years.csv C:\\Users\\ohund\\Anaconda3\\envs\\fairness\\lib\\site-packages\\aif360\\datasets\\..\\data\\raw\\compas\\compas-scores-two-years.csv'
)

# Load the preprocessed COMPAS data with 'race' as the protected attribute.
compas_data = load_preproc_data_compas(protected_attributes=['race'])

# %% [markdown]
# Now let's take a look at the data:

# %%
# Bare expression: displayed as the cell output in a notebook.
compas_data

# %% [markdown]
# **Creating a train and test split**
#
# We'll create a train (80%) and test split (20%).
#
# Note: *Usually when carrying out machine learning experiments,
# we also need a dev set for developing and selecting our models (incl. tuning of hyper-parameters).
# However, in this assignment, the goal is not to optimize
            columns=['eta', 'acc_avg', 'acc_std', 'sr_avg', 'sr_std'])
        denoised_df.to_excel(writer, sheet_name='denoised')
    return


if __name__ == '__main__':
    start = time.time()

    # input
    protected_name = str(sys.argv[1])
    times = int(sys.argv[2])
    print(protected_name, times)
    sys.stdout.flush()

    # initialization
    dataset = load_preproc_data_compas()
    C = 0
    lam = 0.1

    # learn models and predict
    eta = np.linspace(0.1, 0.4, 7, endpoint=True)
    zvrg_acc = np.zeros([times, len(eta)])
    zvrg_sr = np.zeros([times, len(eta)])
    gyf_acc = np.zeros([times, len(eta)])
    gyf_sr = np.zeros([times, len(eta)])
    undenoised_acc = np.zeros([times, len(eta)])
    undenoised_sr = np.zeros([times, len(eta)])
    denoised_acc = np.zeros([times, len(eta)])
    denoised_sr = np.zeros([times, len(eta)])

    for i in range(times):
    else:
        privileged_groups = [{'race': 1}]
        unprivileged_groups = [{'race': 0}]
    
elif dataset_used == "german":
    dataset_orig = GermanDataset()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'age': 1}]
        unprivileged_groups = [{'age': 0}]
    
elif dataset_used == "compas":
#     dataset_orig = CompasDataset()
    dataset_orig = load_preproc_data_compas()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        unprivileged_groups = [{'race': 0}]  
  
#random seed for calibrated equal odds prediction
randseed = 12345679 

#train validation/test split
dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6], shuffle=True)

# Placeholder for predicted and transformed datasets
dataset_orig_train_pred = dataset_orig_train.copy(deepcopy=True)
Example #11
0
if __name__ == "__main__":

    #******************
    # Step 0 - Choose Dataset and classifier type
    data_choice = "adult" # "compas" or "adult"
    classifier_choice = "Logistic Regression" # "SVM" or "Logistic Regression"
    flag_plot = False
    
    #*******************    
    # Step 1 - Get data and plot distribution
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    if data_choice == "adult":
        dataset_orig = load_preproc_data_adult(['sex'])
    else:
        dataset_orig = load_preproc_data_compas(['sex'])
 
    #*******************
    # Step 2 - Split into train and test and normalise
    train, test = dataset_orig.split([0.7], shuffle=True)
    scale_orig = StandardScaler()
    X_train = scale_orig.fit_transform(train.features)
    y_train = train.labels.ravel()
    X_test = scale_orig.transform(test.features)
    y_test = test.labels.ravel()
    test_pred = test.copy()
    plot_distribution(dataset_orig, data_choice)

    #*******************
    # Step 3 - Train machine Learning classifier and plot results
    if classifier_choice == "Logistic Regression":