Example 1
def fetch_lawschool_gpa(subset="all",
                        usecols=None,
                        dropcols=None,
                        numeric_only=False,
                        dropna=False):
    """Load the Law School GPA dataset.

    Note:
        By default, the data is downloaded from tempeh. See
        https://github.com/microsoft/tempeh for details.

    Args:
        subset ({'train', 'test', or 'all'}, optional): Select the dataset to
            load: 'train' for the training set, 'test' for the test set, 'all'
            for both.
        usecols (single label or list-like, optional): Feature column(s) to
            keep. All others are dropped.
        dropcols (single label or list-like, optional): Feature column(s) to
            drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs.

    Returns:
        namedtuple: Tuple containing X, y, and sample_weights for the Law School
        GPA dataset accessible by index or name.

    Raises:
        ValueError: If ``subset`` is not one of 'train', 'test', or 'all'.
    """
    if subset not in {'train', 'test', 'all'}:
        raise ValueError("subset must be either 'train', 'test', or 'all'; "
                         "cannot be {}".format(subset))
    # Use None sentinels instead of mutable [] defaults (shared across calls).
    usecols = [] if usecols is None else usecols
    dropcols = [] if dropcols is None else dropcols

    dataset = tc.datasets["lawschool_gpa"]()
    X_train, X_test = dataset.get_X(format=pd.DataFrame)
    y_train, y_test = dataset.get_y(format=pd.Series)
    A_train, A_test = dataset.get_sensitive_features(name='race',
                                                     format=pd.Series)
    # Recombine features, target, and sensitive attribute into single frames
    # so standardize_dataset can split them back out by column name.
    all_train = pd.concat([X_train, y_train, A_train], axis=1)
    all_test = pd.concat([X_test, y_test, A_test], axis=1)

    if subset == "train":
        df = all_train
    elif subset == "test":
        df = all_test
    else:
        df = pd.concat([all_train, all_test], axis=0)

    return standardize_dataset(df,
                               prot_attr=['race'],
                               target='zfygpa',
                               usecols=usecols,
                               dropcols=dropcols,
                               numeric_only=numeric_only,
                               dropna=dropna)
Example 2
def fetch_adult(subset='all',
                data_home=None,
                binary_race=True,
                usecols=None,
                dropcols=None,
                numeric_only=False,
                dropna=True):
    """Load the Adult Census Income Dataset.

    Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged). The
    other protected attribute is 'sex' ('Male' is privileged and 'Female' is
    unprivileged). The outcome variable is 'annual-income': '>50K' (favorable)
    or '<=50K' (unfavorable).

    Note:
        By default, the data is downloaded from OpenML. See the `adult
        <https://www.openml.org/d/1590>`_ page for details.

    Args:
        subset ({'train', 'test', or 'all'}, optional): Select the dataset to
            load: 'train' for the training set, 'test' for the test set, 'all'
            for both.
        data_home (string, optional): Specify another download and cache folder
            for the datasets. By default all AIF360 datasets are stored in
            'aif360/sklearn/data/raw' subfolders.
        binary_race (bool, optional): Group all non-white races together.
        usecols (single label or list-like, optional): Feature column(s) to
            keep. All others are dropped.
        dropcols (single label or list-like, optional): Feature column(s) to
            drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs.

    Returns:
        namedtuple: Tuple containing X, y, and sample_weights for the Adult
        dataset accessible by index or name.

    Raises:
        ValueError: If ``subset`` is not one of 'train', 'test', or 'all'.

    See also:
        :func:`sklearn.datasets.fetch_openml`

    Examples:
        >>> adult = fetch_adult()
        >>> adult.X.shape
        (45222, 13)

        >>> adult_num = fetch_adult(numeric_only=True)
        >>> adult_num.X.shape
        (48842, 5)
    """
    if subset not in {'train', 'test', 'all'}:
        raise ValueError("subset must be either 'train', 'test', or 'all'; "
                         "cannot be {}".format(subset))
    # Use None sentinels instead of mutable [] defaults (shared across calls).
    usecols = [] if usecols is None else usecols
    dropcols = [] if dropcols is None else dropcols

    df = to_dataframe(
        fetch_openml(data_id=1590,
                     target_column=None,
                     data_home=data_home or DATA_HOME_DEFAULT))
    # The OpenML dump places the 16281 test rows first, then the training rows.
    if subset == 'train':
        df = df.iloc[16281:]
    elif subset == 'test':
        df = df.iloc[:16281]

    df = df.rename(columns={'class': 'annual-income'})  # more descriptive name
    df['annual-income'] = df['annual-income'].cat.as_ordered(
    )  # '<=50K' < '>50K'

    # binarize protected attributes
    if binary_race:
        # Any category other than 'White' becomes NaN, then filled 'Non-white'.
        df.race = df.race.cat.set_categories(['Non-white', 'White'],
                                             ordered=True).fillna('Non-white')
    df.sex = df.sex.cat.as_ordered()  # 'Female' < 'Male'

    return standardize_dataset(df,
                               prot_attr=['race', 'sex'],
                               target='annual-income',
                               sample_weight='fnlwgt',
                               usecols=usecols,
                               dropcols=dropcols,
                               numeric_only=numeric_only,
                               dropna=dropna)
Example 3
def fetch_bank(data_home=None,
               percent10=False,
               usecols=None,
               dropcols='duration',
               numeric_only=False,
               dropna=False):
    """Load the Bank Marketing Dataset.

    The protected attribute is 'age' (left as continuous). The outcome variable
    is 'deposit': 'yes' or 'no'.

    Note:
        By default, the data is downloaded from OpenML. See the `bank-marketing
        <https://www.openml.org/d/1461>`_ page for details.

    Args:
        data_home (string, optional): Specify another download and cache folder
            for the datasets. By default all AIF360 datasets are stored in
            'aif360/sklearn/data/raw' subfolders.
        percent10 (bool, optional): Download the reduced version (10% of data).
        usecols (single label or list-like, optional): Column name(s) to keep.
            All others are dropped.
        dropcols (single label or list-like, optional): Column name(s) to drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs. Note: this is False by default for
            this dataset.

    Returns:
        namedtuple: Tuple containing X and y for the Bank dataset accessible by
        index or name.

    See also:
        :func:`sklearn.datasets.fetch_openml`

    Examples:
        >>> bank = fetch_bank()
        >>> bank.X.shape
        (45211, 15)

        >>> bank_nona = fetch_bank(dropna=True)
        >>> bank_nona.X.shape
        (7842, 15)

        >>> bank_num = fetch_bank(numeric_only=True)
        >>> bank_num.X.shape
        (45211, 6)
    """
    # Use a None sentinel instead of a mutable [] default (shared across calls).
    usecols = [] if usecols is None else usecols

    # TODO: this seems to be an old version
    df = to_dataframe(
        fetch_openml(data_id=1558 if percent10 else 1461,
                     data_home=data_home or DATA_HOME_DEFAULT,
                     target_column=None))
    df.columns = [
        'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
        'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
        'previous', 'poutcome', 'deposit'
    ]
    # remap target
    df.deposit = df.deposit.map({'1': 'no', '2': 'yes'}).astype('category')
    df.deposit = df.deposit.cat.as_ordered()  # 'no' < 'yes'
    # Replace 'unknown' marker with NaN by removing the category. Assign the
    # result of apply: the original relied on `inplace=True` inside the lambda
    # (deprecated and removed in pandas 2.0) and discarded apply's return value.
    df = df.apply(lambda s: s.cat.remove_categories('unknown')
                  if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s)
    # 'primary' < 'secondary' < 'tertiary'
    df.education = df.education.astype('category').cat.as_ordered()

    return standardize_dataset(df,
                               prot_attr='age',
                               target='deposit',
                               usecols=usecols,
                               dropcols=dropcols,
                               numeric_only=numeric_only,
                               dropna=dropna)
Example 4
def fetch_german(data_home=None,
                 binary_age=True,
                 usecols=None,
                 dropcols=None,
                 numeric_only=False,
                 dropna=True):
    """Load the German Credit Dataset.

    Protected attributes are 'sex' ('male' is privileged and 'female' is
    unprivileged) and 'age' (binarized by default as recommended by
    [#kamiran09]_: age >= 25 is considered privileged and age < 25 is considered
    unprivileged; see the binary_age flag to keep this continuous). The outcome
    variable is 'credit-risk': 'good' (favorable) or 'bad' (unfavorable).

    Note:
        By default, the data is downloaded from OpenML. See the `credit-g
        <https://www.openml.org/d/31>`_ page for details.

    Args:
        data_home (string, optional): Specify another download and cache folder
            for the datasets. By default all AIF360 datasets are stored in
            'aif360/sklearn/data/raw' subfolders.
        binary_age (bool, optional): If ``True``, split protected attribute,
            'age', into 'aged' (privileged) and 'youth' (unprivileged). The
            'age' feature remains continuous.
        usecols (single label or list-like, optional): Column name(s) to keep.
            All others are dropped.
        dropcols (single label or list-like, optional): Column name(s) to drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs.

    Returns:
        namedtuple: Tuple containing X and y for the German dataset accessible
        by index or name.

    See also:
        :func:`sklearn.datasets.fetch_openml`

    References:
        .. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without
           discriminating," 2nd International Conference on Computer,
           Control and Communication, 2009.
           <https://ieeexplore.ieee.org/abstract/document/4909197>`_

    Examples:
        >>> german = fetch_german()
        >>> german.X.shape
        (1000, 21)

        >>> german_num = fetch_german(numeric_only=True)
        >>> german_num.X.shape
        (1000, 7)

        >>> X, y = fetch_german(numeric_only=True)
        >>> y_pred = LogisticRegression().fit(X, y).predict(X)
        >>> disparate_impact_ratio(y, y_pred, prot_attr='age', priv_group=True,
        ... pos_label='good')
        0.9483094846144106
    """
    # Use None sentinels instead of mutable [] defaults (shared across calls).
    usecols = [] if usecols is None else usecols
    dropcols = [] if dropcols is None else dropcols

    df = to_dataframe(
        fetch_openml(data_id=31,
                     target_column=None,
                     data_home=data_home or DATA_HOME_DEFAULT))

    df = df.rename(columns={'class': 'credit-risk'})  # more descriptive name
    df['credit-risk'] = df['credit-risk'].cat.as_ordered()  # 'bad' < 'good'

    # Binarize protected attribute (but not the corresponding 'age' feature).
    # With numeric_only, integer codes are used instead of string labels.
    age = (pd.cut(df.age, [0, 25, 100],
                  labels=False if numeric_only else ['young', 'aged'])
           if binary_age else 'age')

    # Note: marital_status directly implies sex. i.e. 'div/dep/mar' => 'female'
    # and all others => 'male'
    personal_status = df.pop('personal_status').str.split(expand=True)
    personal_status.columns = ['sex', 'marital_status']
    df = df.join(personal_status.astype('category'))
    df.sex = df.sex.cat.as_ordered()  # 'female' < 'male'

    # 'no' < 'yes'
    df.foreign_worker = df.foreign_worker.astype('category').cat.as_ordered()

    return standardize_dataset(df,
                               prot_attr=['sex', age, 'foreign_worker'],
                               target='credit-risk',
                               usecols=usecols,
                               dropcols=dropcols,
                               numeric_only=numeric_only,
                               dropna=dropna)
Example 5
def fetch_compas(data_home=None, binary_race=False,
                 usecols=None, dropcols=None, numeric_only=False, dropna=True):
    """Load the COMPAS Recidivism Risk Scores dataset.

    Optionally binarizes 'race' to 'Caucasian' (privileged) or
    'African-American' (unprivileged). The other protected attribute is 'sex'
    ('Male' is *unprivileged* and 'Female' is *privileged*). The outcome
    variable is 'Survived' (favorable) if the person was not accused of a crime
    within two years or 'Recidivated' (unfavorable) if they were.

    Note:
        The values for the 'sex' variable if numeric_only is ``True`` are 1 for
        'Female and 0 for 'Male' -- opposite the convention of other datasets.

    Args:
        data_home (string, optional): Specify another download and cache folder
            for the datasets. By default all AIF360 datasets are stored in
            'aif360/sklearn/data/raw' subfolders.
        binary_race (bool, optional): Filter only White and Black defendants.
        usecols (single label or list-like, optional): Feature column(s) to
            keep. All others are dropped. Defaults to the standard COMPAS
            analysis feature set.
        dropcols (single label or list-like, optional): Feature column(s) to
            drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs.

    Returns:
        namedtuple: Tuple containing X and y for the COMPAS dataset accessible
        by index or name.
    """
    # Use None sentinels instead of mutable list defaults (shared across calls).
    if usecols is None:
        usecols = ['sex', 'age', 'age_cat', 'race', 'juv_fel_count',
                   'juv_misd_count', 'juv_other_count', 'priors_count',
                   'c_charge_degree', 'c_charge_desc']
    dropcols = [] if dropcols is None else dropcols

    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
                              os.path.basename(COMPAS_URL))
    if os.path.isfile(cache_path):
        df = pd.read_csv(cache_path, index_col='id')
    else:
        df = pd.read_csv(COMPAS_URL, index_col='id')
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        df.to_csv(cache_path)

    # Perform the same preprocessing as the original analysis:
    # https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
    # Copy so subsequent column assignments modify a real frame, not a view
    # of the boolean-filtered slice (avoids SettingWithCopyWarning).
    df = df[(df.days_b_screening_arrest <= 30)
          & (df.days_b_screening_arrest >= -30)
          & (df.is_recid != -1)
          & (df.c_charge_degree != 'O')
          & (df.score_text != 'N/A')].copy()

    for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']:
        df[col] = df[col].astype('category')

    # 'Survived' < 'Recidivated'
    cats = ['Survived', 'Recidivated']
    df.two_year_recid = df.two_year_recid.replace([0, 1], cats).astype('category')
    df.two_year_recid = df.two_year_recid.cat.set_categories(cats, ordered=True)

    if binary_race:
        # 'African-American' < 'Caucasian'; other races become NaN and are
        # dropped downstream when dropna is True.
        df.race = df.race.cat.set_categories(['African-American', 'Caucasian'],
                                             ordered=True)

    # 'Male' < 'Female'
    df.sex = df.sex.astype('category').cat.reorder_categories(
            ['Male', 'Female'], ordered=True)

    return standardize_dataset(df, prot_attr=['sex', 'race'],
                               target='two_year_recid', usecols=usecols,
                               dropcols=dropcols, numeric_only=numeric_only,
                               dropna=dropna)