Example #1
from aif360.datasets import CompasDataset


def load_compas_dataset():
    """
    Load the COMPAS dataset as preprocessed by aif360.
    Charge descriptions are removed.

    :return: The COMPAS dataset, split into training and test sets
    """
    dataset = CompasDataset(
        features_to_drop=['c_charge_desc']  # Drop the charge description, as it unnecessarily bloats the dataset
    )
    ind = int(len(dataset.instance_names) * 0.8)
    train, test = dataset.split([ind])
    return train, test
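A minimal usage sketch, assuming aif360 is installed and the raw COMPAS CSV has been placed where aif360 expects it:

train, test = load_compas_dataset()
print(len(train.instance_names), len(test.instance_names))  # roughly an 80/20 split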
Example #2
import os

import numpy as np
from sklearn.model_selection import train_test_split


def get_compass(sensitive_feature_name,
                remove_z=False,
                file_path="/home/btd26/datasets/compas/",
                file_name="compas.npy",
                **kwargs):
    z_idx = get_z_idx(sensitive_feature_name, compas_sensitive_features_dict)

    # load file
    file_add = os.path.join(file_path, file_name)
    if os.path.exists(file_add):
        M = np.load(file_add)  # load the cached copy rather than a hard-coded path
    else:
        from aif360.datasets import CompasDataset
        compas = CompasDataset()
        M = np.concatenate([compas.features, compas.labels], axis=1)
        np.save(file_add, M)
    X = M[:, :-1]
    y = M[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    Xtr, Xts, Ztr, Zts, ytr, yts = extract_z(X_test,
                                             X_train,
                                             y_test,
                                             y_train,
                                             z_idx,
                                             remove_z=remove_z)

    return Xtr, Xts, ytr, yts, Ztr, Zts
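A hedged call sketch; get_z_idx, extract_z, and compas_sensitive_features_dict are project-specific helpers assumed to live alongside this function:

# 'race' is assumed to be a key of compas_sensitive_features_dict
Xtr, Xts, ytr, yts, Ztr, Zts = get_compass('race', remove_z=True,
                                           file_path='./data/compas/',
                                           file_name='compas.npy')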
Example #3
from aif360.datasets import AdultDataset, CompasDataset, GermanDataset


def load_dataset(name):
    if name == 'Adult':
        ds = AdultDataset()
    elif name == 'German':
        ds = GermanDataset()
    elif name == 'Compas':
        ds = CompasDataset()
    else:
        raise ValueError(f"Unknown dataset: {name}")  # avoid returning an unbound ds
    return ds, name
Example #4

def getSmallCompasDataset():

    dataset = CompasDataset()
    dataset_orig = load_preproc_data_compas(['sex'])
    
    features = ['sex', 'race', 'age', 'priors_count', 'c_charge_degree']
    domainArray = getSmallCompasDomain()
    features.append(dataset_orig.label_names[0])

    simpleDomain = Domain(features, domainArray)
    labels = [y[0] for y in dataset_orig.labels]
    
    simpleSamples = dataset_orig.features
    simpleSamples = np.c_[simpleSamples, labels]
    
    return simpleDomain, simpleSamples
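A usage sketch, assuming Domain and getSmallCompasDomain are provided by the surrounding project:

simpleDomain, simpleSamples = getSmallCompasDataset()
print(simpleSamples.shape)  # one row per instance, label in the last column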
Example #5

def get_data(dataset_used, protected_attribute_used):
    if dataset_used == "adult":
        dataset_orig = AdultDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]

    elif dataset_used == "german":
        dataset_orig = load_preproc_data_german()
        dataset_orig.labels -= 1
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]

    elif dataset_used == "compas":
        dataset_orig = CompasDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]

    elif dataset_used == "bank":
        dataset_orig = BankDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]

    else:
        raise ValueError(f"{dataset_used} is not an available dataset.")

    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6], shuffle=True, seed=101)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5], shuffle=True, seed=101)

    return dataset_orig_train, dataset_orig_valid, dataset_orig_test, privileged_groups, unprivileged_groups
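A usage sketch for get_data (a 60/20/20 split with a fixed seed):

train, valid, test, priv, unpriv = get_data('compas', protected_attribute_used=1)
print(priv, unpriv)  # [{'sex': 1}] [{'sex': 0}]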
Example #6

def getLargeCompasDataset():
    dataset = CompasDataset()
    dataset_orig = load_preproc_data_compas(['sex'])

    head, types, records = read_dataset()
    records = np.array(records)


    head,types = head[:-1], types[:-1]
    records = np.delete(records, -1, axis=1)


    def reorder(record):
        record.pop(11)
        record.append(record.pop(10))
        return record

    records = [reorder(list(record)) for record in records]
    head = reorder(head)

    records = np.array(records)

    def getHotEncoding(index, n):
        temp = [0]*n
        temp[index] = 1
        return tuple(temp)

    races = sorted(set(records[:, 2]))  # sort for a deterministic ordering
    print(races)
    fmo = sorted(set(records[:, 9]))
    nrecords = []
    domainArray = [set([]) for h in head]
    for record in records:
        temp = []
        for j, (r, h) in enumerate(zip(record, head)):
            if h == "sex":
                if r == 'Male':
                    entry = 1
                else:
                    entry = 0
            elif h == "age":
                # age
                age = int(r)
                if age <= 25:
                    entry = 0
                elif age <= 65:
                    entry = 1
                else:
                    entry = 2
            elif h == "race":
                # race
                if races.index(r) == 3:
                    entry = 1
                else: 
                    entry = 0
                #entry = getHotEncoding(races.index(r), len(races))
            elif h == "priors_count":
                # priors count
                priors = int(r)
                if priors <= 0:
                    entry = 0
                elif priors <= 10:
                    entry = 1
                elif priors <= 20:
                    entry = 2
                elif priors <= 30:
                    entry = 3
                elif priors <= 40:
                    entry = 4
                else:
                    entry = 5
            elif h == "days_in_jail":
                # months in jail    
                months = int(r)/12.0
                if months <= 0:
                    entry = 0
                elif months <=3:
                    entry = 1
                elif months <=6:
                    entry = 2
                elif months <=12:
                    entry = 3
                elif months <=24:
                    entry = 4
                elif months <=48:
                    entry = 5
                elif months <=60:
                    entry = 6
                else:
                    entry = 7
            elif h == "c_charge_degree":
                entry = fmo.index(r)
            else:
                entry = float(r)
            domainArray[j].add(entry)
            try:
                temp.extend(entry)  # one-hot encodings are tuples
            except TypeError:
                temp.append(entry)  # scalar entries
        nrecords.append(np.array(temp))

    nrecords = np.array(nrecords)

    domainArray = [np.array(list(uvs)) for uvs in domainArray]


    simpleDomain = Domain(head, domainArray)

    return simpleDomain, nrecords
Example #7
def test_compas():
    # just test that there are no errors for default loading...
    cd = CompasDataset()
Example #8
def LoadData(dataset_name, protected_attribute_name, raw=True):

	optim_options = None

	if dataset_name == "adult":
		if raw:
			dataset_original = AdultDataset()
		if protected_attribute_name == "sex":
			privileged_groups = [{'sex': 1}]
			unprivileged_groups = [{'sex': 0}]
			if not raw:
				dataset_original = load_preproc_data_adult(['sex'])
			optim_options = {
				"distortion_fun": get_distortion_adult,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		elif protected_attribute_name == "race":
			privileged_groups = [{'race': 1}]
			unprivileged_groups = [{'race': 0}]
			if not raw:
				dataset_original = load_preproc_data_adult(['race'])
			optim_options = {
				"distortion_fun": get_distortion_adult,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
	elif dataset_name == "german":
		if raw:
			dataset_original = GermanDataset()
		if protected_attribute_name == "sex":
			privileged_groups = [{'sex': 1}]
			unprivileged_groups = [{'sex': 0}]
			if not raw:
				dataset_original = load_preproc_data_german(['sex'])
			optim_options = {
				"distortion_fun": get_distortion_german,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		elif protected_attribute_name == "age":
			privileged_groups = [{'age': 1}]
			unprivileged_groups = [{'age': 0}]
			if not raw:
				dataset_original = load_preproc_data_german(['age'])
			optim_options = {
				"distortion_fun": get_distortion_german,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		dataset_original.labels = 2 - dataset_original.labels
		dataset_original.unfavorable_label = 0.
	elif dataset_name == "compas":
		if raw:
			dataset_original = CompasDataset()
		if protected_attribute_name == "sex":
			privileged_groups = [{'sex': 0}]
			unprivileged_groups = [{'sex': 1}]
			if not raw:
				dataset_original = load_preproc_data_compas(['sex'])
			optim_options = {
				"distortion_fun": get_distortion_compas,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		elif protected_attribute_name == "race":
			privileged_groups = [{'race': 1}]
			unprivileged_groups = [{'race': 0}]
			if not raw:
				dataset_original = load_preproc_data_compas(['race'])
			optim_options = {
				"distortion_fun": get_distortion_compas,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}

	protected_attribute_set={
		'sex':[[{'sex': 1}],[{'sex': 0}]],
		'age':[[{'age': 1}],[{'age': 0}]],
		'race':[[{'race': 1}],[{'race': 0}]]
	}

	if optim_options is None:
		print('No such dataset & group option:', dataset_name, protected_attribute_name)
		exit()

	return dataset_original, protected_attribute_set[protected_attribute_name][0], protected_attribute_set[protected_attribute_name][1], optim_options
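A usage sketch for LoadData, assuming the aif360 preprocessing helpers (load_preproc_data_compas, get_distortion_compas, and the rest) are imported at module level:

dataset, privileged_groups, unprivileged_groups, optim_options = LoadData(
    'compas', 'race', raw=False)
print(optim_options['epsilon'])  # 0.05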
Example #9
def load_preproc_data_compas(protected_attributes=None):
    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb
        """

        df = df[[
            'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
            'priors_count', 'days_b_screening_arrest', 'decile_score',
            'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out'
        ]]

        # Indices of data samples to keep
        ix = df['days_b_screening_arrest'] <= 30
        ix = (df['days_b_screening_arrest'] >= -30) & ix
        ix = (df['is_recid'] != -1) & ix
        ix = (df['c_charge_degree'] != "O") & ix
        ix = (df['score_text'] != 'N/A') & ix
        df = df.loc[ix, :]
        df['length_of_stay'] = (
            pd.to_datetime(df['c_jail_out']) -
            pd.to_datetime(df['c_jail_in'])).apply(lambda x: x.days)

        # Restrict races to African-American and Caucasian
        dfcut = df.loc[~df['race'].isin(
            ['Native American', 'Hispanic', 'Asian', 'Other']), :]

        # Restrict the features to use
        dfcutQ = dfcut[[
            'sex', 'race', 'age_cat', 'c_charge_degree', 'score_text',
            'priors_count', 'is_recid', 'two_year_recid', 'length_of_stay'
        ]].copy()

        # Quantize priors count between 0, 1-3, and >3
        def quantizePrior(x):
            if x <= 0:
                return '0'
            elif 1 <= x <= 3:
                return '1 to 3'
            else:
                return 'More than 3'

        # Quantize length of stay
        def quantizeLOS(x):
            if x <= 7:
                return '<week'
            elif x <= 93:
                return '<3months'
            else:
                return '>3 months'

        # Relabel the 25 - 45 age bucket
        def adjustAge(x):
            if x == '25 - 45':
                return '25 to 45'
            else:
                return x

        # Quantize score_text to MediumHigh
        def quantizeScore(x):
            if (x == 'High') | (x == 'Medium'):
                return 'MediumHigh'
            else:
                return x

        def group_race(x):
            if x == "Caucasian":
                return 1.0
            else:
                return 0.0

        dfcutQ['priors_count'] = dfcutQ['priors_count'].apply(
            lambda x: quantizePrior(x))
        dfcutQ['length_of_stay'] = dfcutQ['length_of_stay'].apply(
            lambda x: quantizeLOS(x))
        dfcutQ['score_text'] = dfcutQ['score_text'].apply(
            lambda x: quantizeScore(x))
        dfcutQ['age_cat'] = dfcutQ['age_cat'].apply(lambda x: adjustAge(x))

        # Recode sex and race
        dfcutQ['sex'] = dfcutQ['sex'].replace({'Female': 1.0, 'Male': 0.0})
        dfcutQ['race'] = dfcutQ['race'].apply(lambda x: group_race(x))

        features = [
            'two_year_recid', 'sex', 'race', 'age_cat', 'priors_count',
            'c_charge_degree'
        ]

        # Keep only the selected features
        df = dfcutQ[features]

        return df

    XD_features = ['age_cat', 'c_charge_degree', 'priors_count', 'sex', 'race']
    D_features = (['sex', 'race'] if protected_attributes is None
                  else protected_attributes)
    Y_features = ['two_year_recid']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = ['age_cat', 'priors_count', 'c_charge_degree']

    # privileged classes
    all_privileged_classes = {"sex": [1.0], "race": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {
            0.0: 'Male',
            1.0: 'Female'
        },
        "race": {
            1.0: 'Caucasian',
            0.0: 'Not Caucasian'
        }
    }

    return CompasDataset(
        label_name=Y_features[0],
        favorable_classes=[0],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=[],
        metadata={
            'label_maps': [{
                1.0: 'Did recid.',
                0.0: 'No recid.'
            }],
            'protected_attribute_maps':
            [all_protected_attribute_maps[x] for x in D_features]
        },
        custom_preprocessing=custom_preprocessing)
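A quick check sketch for the helper above (it mirrors aif360's own load_preproc_data_compas):

dataset = load_preproc_data_compas(['sex'])
print(dataset.protected_attribute_names)  # ['sex']
print(dataset.label_names)                # ['two_year_recid']
print(dataset.features.shape)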
Example #10

from classifiers.utils import all_binaries

#%%
# Specify the dataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_compas
from aif360.datasets import CompasDataset

#%%
TAU = 0.9
TRAIN_SPLIT = 0.7
SENSITIVE_ATTRIBUTE = 'sex'
DOMAIN = [1, 3, 3, 2, 1]

NAME = 'compas_small_{}'.format(SENSITIVE_ATTRIBUTE)

dataset = CompasDataset()
raw_dataset_df = dataset.convert_to_dataframe()[0]
raw_dataset_df['age1'] = (raw_dataset_df['age'] <= 25).astype(int)
raw_dataset_df['age2'] = ((raw_dataset_df['age'] > 25) &
                          (raw_dataset_df['age'] <= 65)).astype(int)
raw_dataset_df['age3'] = (raw_dataset_df['age'] > 65).astype(int)
raw_dataset_df['pior1'] = (raw_dataset_df['priors_count'] <= 0).astype(int)
raw_dataset_df['pior2'] = ((raw_dataset_df['priors_count'] > 0) &
                           (raw_dataset_df['priors_count'] <= 10)).astype(int)
raw_dataset_df['pior3'] = ((raw_dataset_df['priors_count'] > 10) &
                           (raw_dataset_df['priors_count'] <= 20)).astype(int)
raw_dataset_df['pior4'] = ((raw_dataset_df['priors_count'] > 20) &
                           (raw_dataset_df['priors_count'] <= 30)).astype(int)
raw_dataset_df['pior5'] = ((raw_dataset_df['priors_count'] > 30) &
                           (raw_dataset_df['priors_count'] <= 40)).astype(int)
raw_dataset_df['pior6'] = (raw_dataset_df['priors_count'] > 40).astype(int)
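A sanity-check sketch for the bucketed columns above: each row should fall into exactly one age bucket and exactly one priors bucket:

assert (raw_dataset_df[['age1', 'age2', 'age3']].sum(axis=1) == 1).all()
assert (raw_dataset_df[['pior1', 'pior2', 'pior3',
                        'pior4', 'pior5', 'pior6']].sum(axis=1) == 1).all()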
Example #11
import numpy as np


def get_data(dataset, protected_attribute, seed=101):
    def protected_attribute_error():
        raise ValueError(
            f'protected attribute {protected_attribute} is not available for dataset {dataset}'
        )

    if dataset == 'adult':
        from aif360.datasets import AdultDataset
        dataset_orig = AdultDataset()
        if protected_attribute == 'sex':
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        elif protected_attribute == 'sex_or_race':
            dataset_orig.feature_names += ['sex_or_race']
            dataset_orig.features = np.hstack([
                dataset_orig.features,
                np.expand_dims(
                    np.logical_or(*dataset_orig.features[:, [2, 3]].T).astype(
                        np.float64), -1)
            ])
            dataset_orig.protected_attributes = np.hstack([
                dataset_orig.protected_attributes, dataset_orig.features[:,
                                                                         [-1]]
            ])
            dataset_orig.protected_attribute_names += ['sex_or_race']
            dataset_orig.privileged_protected_attributes += [np.array([1.])]
            dataset_orig.unprivileged_protected_attributes += [np.array([0.])]
            privileged_groups = [{'sex_or_race': 1}]
            unprivileged_groups = [{'sex_or_race': 0}]
        elif protected_attribute == 'race':
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
        else:
            protected_attribute_error()

    elif dataset == 'german':
        from aif360.datasets import GermanDataset
        dataset_orig = GermanDataset()
        if protected_attribute == 'sex':
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        elif protected_attribute == 'age':
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            protected_attribute_error()

    elif dataset == 'compas':
        from aif360.datasets import CompasDataset
        dataset_orig = CompasDataset()
        if protected_attribute == 'sex':
            privileged_groups = [{'sex': 0}]
            unprivileged_groups = [{'sex': 1}]
        elif protected_attribute == 'sex_or_race':
            dataset_orig.feature_names += ['sex_or_race']
            dataset_orig.features = np.hstack([
                dataset_orig.features,
                np.expand_dims(
                    np.logical_or(*dataset_orig.features[:, [0, 2]].T).astype(
                        np.float64), -1)
            ])
            dataset_orig.protected_attributes = np.hstack([
                dataset_orig.protected_attributes, dataset_orig.features[:,
                                                                         [-1]]
            ])
            dataset_orig.protected_attribute_names += ['sex_or_race']
            dataset_orig.privileged_protected_attributes += [np.array([1.])]
            dataset_orig.unprivileged_protected_attributes += [np.array([0.])]
            privileged_groups = [{'sex_or_race': 1}]
            unprivileged_groups = [{'sex_or_race': 0}]
        elif protected_attribute == 'race':
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
        else:
            protected_attribute_error()

    elif dataset == 'bank':
        from aif360.datasets import BankDataset
        dataset_orig = BankDataset()
        if protected_attribute == 'age':
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            protected_attribute_error()

    else:
        raise ValueError(f'{dataset} is not an available dataset.')

    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6],
                                                             shuffle=True,
                                                             seed=seed)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True,
                                                                  seed=seed)

    return dataset_orig_train, dataset_orig_valid, dataset_orig_test, privileged_groups, unprivileged_groups


def generate_fairness_report(dataset, privileged_groups, unprivileged_groups):
    from aif360.metrics import BinaryLabelDatasetMetric

    print('Privileged attribute values:')
    print(dataset.privileged_protected_attributes)
    print('Unprivileged attribute values:')
    print(dataset.unprivileged_protected_attributes)
    print()

    binary_label_metric = BinaryLabelDatasetMetric(
        dataset,
        privileged_groups=privileged_groups,
        unprivileged_groups=unprivileged_groups)
    print(f'Statistical parity difference: '
          f'{binary_label_metric.statistical_parity_difference()}')
    print(f'Disparate impact: {binary_label_metric.disparate_impact()}')

    # classification_metric = ClassificationMetric(
    #     dataset,
    #     privileged_groups=privileged_groups,
    #     unprivileged_groups=unprivileged_groups
    # )
    # print(f'Equal opportunity difference: '
    #       f'{classification_metric.equal_opportunity_difference()}')


if __name__ == '__main__':
    generate_fairness_report(CompasDataset(),
                             privileged_groups=[{
                                 'race': 1
                             }],
                             unprivileged_groups=[{
                                 'race': 0
                             }])