def _load_data(self):
        """Build feature, sensitive-attribute and target arrays from COMPAS.

        Returns:
            A tuple ``(x, s, y)`` of float arrays:

            * ``x`` -- the whitened juvenile/prior count columns followed by
              one 0/1 indicator column per distinct ``c_charge_degree`` value.
            * ``s`` -- column vector that is 0 where ``race`` equals
              ``'Caucasian'`` and 1 for every other value.
            * ``y`` -- column vector of the dataset's target column.
        """
        compas_data = COMPASDataset()
        frame = compas_data.df

        # Use race as the sensitive attribute. A direct comparison yields the
        # 0/1 encoding without the intermediate object-dtype Series that a
        # two-step `where(..., inplace=True)` produces.
        s = (frame['race'] != 'Caucasian').values.reshape(-1, 1)

        # Use juvenile felonies, juvenile misdemeanors, juvenile others and
        # prior convictions as the numeric features.
        x = whiten(data=frame[[
            'juv_fel_count', 'juv_misd_count', 'juv_other_count',
            'priors_count'
        ]].values.astype(float))

        # Charge degree categories in one-hot encoding: one boolean column per
        # distinct value, stacked onto x in a single hstack call.
        degree = frame['c_charge_degree']
        indicators = [
            (degree == category).values.reshape(-1, 1)
            for category in degree.unique()
        ]
        x = np.hstack([x] + indicators)

        # Use actual recidivism as the target variable.
        y = frame[compas_data.target].values.reshape(-1, 1)

        return x.astype(float), s.astype(float), y.astype(float)
Esempio n. 2
0
def compas():
    """Fetch COMPAS features, recidivism labels and the encoded race column.

    Returns:
        A tuple ``(X, Y, A)``:

        * ``X`` -- DataFrame of the selected feature columns, with
          ``length_of_stay`` expressed as its whole-day component, missing
          values filled with 0, and the result passed through
          ``pd.get_dummies``.
        * ``Y`` -- values of the ``is_recid`` column.
        * ``A`` -- ``race`` column encoded as integers by ``LabelEncoder``.
    """
    print('Fetching COMPAS data with sensitive_attribute=race ...')

    from responsibly.dataset import COMPASDataset

    compas_ds = COMPASDataset()
    # Take an explicit copy: assigning into a column-subset of compas_ds.df
    # raises SettingWithCopyWarning and is not guaranteed to update X.
    X = compas_ds.df[[
        'sex', 'age', 'c_charge_degree', 'age_cat', 'score_text',
        'priors_count', 'days_b_screening_arrest', 'decile_score',
        'length_of_stay'
    ]].copy()
    # Plain column assignment replaces the timedelta column with its day
    # count instead of trying to write the numbers into the existing column.
    X['length_of_stay'] = X['length_of_stay'].dt.days
    X = X.fillna(0)
    X = pd.get_dummies(X)
    Y = compas_ds.df['is_recid'].values
    A = preprocessing.LabelEncoder().fit_transform(compas_ds.df['race'])
    return X, Y, A
Esempio n. 3
0
    def __init__(self, config_file):
        """Load a dataset and its column configuration from a JSON file.

        Args:
            config_file: Path of a JSON file providing the keys ``dataset``,
                ``sensitive_attributes``, ``non_numeric_attributes``,
                ``numeric_attributes``, ``date_attributes``,
                ``ground_truth`` and ``predictions``. The two ``*_attributes``
                list entries are concatenated with lists, so they must be
                lists; ``numeric_attributes`` and ``date_attributes`` are
                mappings (only their keys are used here) or a falsy value.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If a required configuration key is missing.
        """
        # The context manager closes the file even when parsing fails.
        with open(config_file, 'r') as f:
            config = json.load(f)

        self.type = config['dataset']
        # Any value other than 'COMPAS' selects the German dataset.
        _dataset = COMPASDataset() if self.type == 'COMPAS' else GermanDataset()
        self._sensitive_attributes = config["sensitive_attributes"]
        self._non_numeric_attributes = config["non_numeric_attributes"]

        self._n_attributes_dict = config["numeric_attributes"]
        self._numeric_attributes = (
            list(self._n_attributes_dict.keys())
            if self._n_attributes_dict else []
        )
        self._date_attributes_dict = config["date_attributes"]
        self._date_attributes = (
            list(self._date_attributes_dict.keys())
            if self._date_attributes_dict else []
        )

        self._target = config['ground_truth']
        self._predictions = config['predictions']

        # Human-readable (un-encoded) views of the selected columns.
        feature_columns = (
            self._sensitive_attributes
            + self._non_numeric_attributes
            + self._numeric_attributes
            + self._date_attributes
        )
        self._data_x_readable = pd.DataFrame(_dataset.df[feature_columns])
        self._data_y_readable = pd.DataFrame(_dataset.df[self._target])
        # One LabelEncoder per key, created lazily on first access.
        self._encoder_dict_x = defaultdict(LabelEncoder)
        self._encoder_dict_y = defaultdict(LabelEncoder)
        self._data_x, self._data_y = self._preprocess_data()
Esempio n. 4
0
def compas_ds():
    """Return a newly constructed ``COMPASDataset``."""
    dataset = COMPASDataset()
    return dataset
Esempio n. 5
0
File: data.py Progetto: kwekuka/ot
def _elapsed_days(start, end):
    """Return ``end - start`` as fractional days (NaN where either is missing)."""
    delta = pd.to_datetime(end) - pd.to_datetime(start)
    return delta.dt.total_seconds() / 86400


def compas():
    """Return the COMPAS DataFrame reduced to model-ready columns.

    Adjustments applied to the dataset's frame:

    - ``c_jail_in`` / ``c_jail_out`` become a single ``c_jail_time`` column.
    - ``in_custody`` / ``out_custody`` become a single ``custody_length``
      column.
    - Binary attributes are encoded as 0/1:
      Male => 0, Female => 1; Misdemeanor ('M') => 0, Felony ('F') => 1.
    - ``race`` is one-hot encoded via ``one_hot``.
    - Remaining NaN values are replaced by the string ``"other"``.
    - Identifier, date, free-text and redundant columns are dropped.

    Both duration columns are expressed in fractional days. The previous
    expression ``x.days + x.seconds / 3600`` added whole days to hours, which
    mixed two units in one number.
    NOTE(review): days was chosen as the unit; confirm hours were not intended.

    Returns:
        The transformed DataFrame.
    """
    # Get the whole dataset, already filtered by the library.
    compas_ds = COMPASDataset()
    cdf = compas_ds.df

    # Turn the length of the jail stay into a single variable.
    cdf["c_jail_time"] = _elapsed_days(cdf['c_jail_in'], cdf['c_jail_out'])

    # Turn the length of custody into a single variable.
    cdf["custody_length"] = _elapsed_days(cdf['in_custody'],
                                          cdf['out_custody'])

    # Encode sex and charge degree as 0/1.
    cdf = cdf.replace({'sex': {'Male': 0, 'Female': 1}})
    cdf = cdf.replace({'c_charge_degree': {'M': 0, 'F': 1}})

    # One-hot encode race.
    cdf = one_hot(cdf, "race")

    # Replace every remaining NaN (in any column) with the string "other".
    cdf = cdf.replace({np.nan: "other"})

    # Columns judged not useful as features; remove a name from this list to
    # keep that column in the output.
    dropped_columns = [
        "name", "id", "dob", "first", "last", "compas_screening_date",
        "age_cat", "c_case_number", "r_case_number", "vr_case_number",
        "decile_score.1", "type_of_assessment", "score_text", "screening_date",
        "v_type_of_assessment", "priors_count.1", "v_score_text",
        "v_screening_date", "in_custody", "out_custody", "length_of_stay",
        "c_jail_out", "c_jail_in", "c_charge_desc",
        "c_offense_date", "c_arrest_date", "r_charge_degree",
        "r_days_from_arrest", "r_offense_date", "r_charge_desc", "r_jail_in",
        "r_jail_out", "violent_recid", "vr_charge_degree", "vr_offense_date",
        "score_factor", "vr_charge_desc", "v_decile_score",
        "c_days_from_compas", "start", "end", "event",
        "days_b_screening_arrest",
    ]
    return cdf.drop(dropped_columns, axis=1)
Esempio n. 6
0
def compas_ds():
    """Return a ``COMPASDataset`` keeping only two race groups.

    The dataset's ``df`` attribute is replaced by the subset of rows whose
    ``race`` is 'African-American' or 'Caucasian'.
    """
    dataset = COMPASDataset()
    frame = dataset.df
    keep = frame['race'].isin(['African-American', 'Caucasian'])
    dataset.df = frame[keep]
    return dataset