Example #1
def confusion_matrix_per_subgroup(
    dataset: np.ndarray,
    #
    ground_truth: np.ndarray,
    predictions: np.ndarray,
    #
    column_index: Index,
    groupings: Optional[List[Union[float, Tuple[str]]]] = None,
    numerical_bins_number: int = 5,
    treat_as_categorical: Optional[bool] = None,
    #
    labels: Optional[List[Union[str, float]]] = None
) -> Tuple[List[np.ndarray], List[str]]:
    """
    Computes confusion matrices for every defined sub-population.

    This is useful for computing a variety of performance metrics for each
    sub-population.

    For warnings raised by this method, please see the documentation of the
    :func:`fatf.utils.data.tools.validate_indices_per_bin` function.

    Parameters
    ----------
    dataset, column_index, groupings, numerical_bins_number, \
and treat_as_categorical
        These parameters are described in the documentation of
        :func:`fatf.utils.data.tools.group_by_column` function and are used to
        define a grouping (i.e. sub-populations). If you have your own
        index-based grouping and would like to get sub-population-based
        confusion matrices, please consider using
        :func:`fatf.utils.metrics.tools.confusion_matrix_per_subgroup_indexed`
        function.
    ground_truth, predictions, and labels
        These parameters are described in the documentation of
        :func:`fatf.utils.metrics.tools.get_confusion_matrix` function and are
        used to calculate confusion matrices.

    Returns
    -------
    population_confusion_matrix : List[numpy.ndarray]
        A list of confusion matrices for each sub-population.
    bin_names : List[strings]
        The name of every sub-population (binning results) defined by the
        feature ranges for a numerical feature and feature value sets for a
        categorical feature.
    """
    # pylint: disable=too-many-arguments
    indices_per_bin, bin_names = fudt.group_by_column(dataset, column_index,
                                                      groupings,
                                                      numerical_bins_number,
                                                      treat_as_categorical)

    assert fudt.validate_indices_per_bin(indices_per_bin), \
        'Binned indices list is invalid.'

    population_confusion_matrix = confusion_matrix_per_subgroup_indexed(
        indices_per_bin, ground_truth, predictions, labels)
    return population_confusion_matrix, bin_names
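
# A minimal usage sketch (not part of the fatf sources): a hypothetical,
# purely numerical dataset grouped on column 0, which is treated as
# categorical. Assumes the module-level imports used by the function above
# (numpy as np, fudt, confusion_matrix_per_subgroup_indexed) are available.
example_X = np.array([[1.0, 0.0],
                      [1.0, 1.0],
                      [2.0, 0.0],
                      [2.0, 1.0]])
example_truth = np.array(['fail', 'success', 'fail', 'success'])
example_preds = np.array(['fail', 'fail', 'fail', 'success'])

example_cmxs, example_bins = confusion_matrix_per_subgroup(
    example_X, example_truth, example_preds, 0, treat_as_categorical=True)
for bin_name, cmx in zip(example_bins, example_cmxs):
    # Each bin name describes one sub-population; each matrix is the
    # confusion matrix computed from that sub-population's ground truth
    # and predictions.
    print(bin_name)
    print(cmx)
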
def test_group_by_column_errors():
    """
    Tests :func:`fatf.utils.data.tools.group_by_column` for errors.
    """
    incorrect_shape_error_data = 'The input array should be 2-dimensional.'
    value_error_data = ('The input array should be of a base type (a mixture '
                        'of numerical and textual types).')
    #
    index_error_index = ('*{}* is not a valid column index for the input '
                         'dataset.')
    type_error_index = 'The column index can either be a string or an integer.'
    #
    value_error_bins = 'The numerical_bins_number needs to be at least 2.'
    type_error_bins = ('The numerical_bins_number parameter has to be an '
                       'integer.')
    #
    value_error_grouping_num_empty = ('A numerical grouping list has to '
                                      'contain at least one element.')
    type_error_grouping_num_inner = ('For a numerical column all of the '
                                     'grouping items must be numbers. *{}* is '
                                     'not a number.')
    value_error_grouping_num_monotonicity = ('The numbers in the groupings '
                                             'list have to be monotonically '
                                             'increasing.')
    type_error_grouping_num_general = ('Since a numerical column was chosen '
                                       'the grouping must be a list of bin '
                                       'boundaries or None.')
    #
    type_error_grouping_cat_general = ('Since a categorical column was chosen '
                                       'the grouping must be a list of tuples '
                                       'representing categorical values '
                                       'grouping or None for the default '
                                       'grouping.')
    type_error_grouping_cat_tuple = ('For a categorical column all of the '
                                     'grouping items must be tuples. *{}* '
                                     'is not a tuple.')
    value_error_grouping_cat_empty = ('A categorical grouping list has to '
                                      'contain at least one element.')
    value_error_grouping_cat_extra = ('*{}* value is not present in the '
                                      'selected column.')
    value_error_grouping_cat_unique = ('Some values are duplicated across '
                                       'tuples.')
    #
    type_error_tac = 'The treat_as_categorical parameter has to be a boolean.'
    #
    user_warning_val = ('The following values in the selected column were '
                        'not accounted for in the grouping tuples:\n{}.')
    user_warning_ind = ('The following row indices could not be accounted for:'
                        '\n{}.\n For a numerical column there may have been '
                        'some numpy.nan therein. For a categorical column '
                        'some of the column values were probably not '
                        'specified in the grouping, in which case there '
                        'should be a separate user warning.')

    num_array = np.array([[1, 2], [3, 4]])
    cat_array = np.array([['a', 'b'], [3, 4]])

    with pytest.raises(IncorrectShapeError) as exin:
        fudt.group_by_column(np.ones((2, 2, 2)), 1)
    assert str(exin.value) == incorrect_shape_error_data
    with pytest.raises(ValueError) as exin:
        fudt.group_by_column(np.array([[1, 2], [3, None]]), None)
    assert str(exin.value) == value_error_data

    with pytest.raises(IndexError) as exin:
        fudt.group_by_column(num_array, 3)
    assert str(exin.value) == index_error_index.format(3)
    with pytest.raises(TypeError) as exin:
        fudt.group_by_column(num_array, None)
    assert str(exin.value) == type_error_index

    with pytest.raises(ValueError) as exin:
        fudt.group_by_column(num_array, 1, numerical_bins_number=1)
    assert str(exin.value) == value_error_bins
    with pytest.raises(TypeError) as exin:
        fudt.group_by_column(num_array, 1, numerical_bins_number='1')
    assert str(exin.value) == type_error_bins

    with pytest.raises(TypeError) as exin:
        fudt.group_by_column(num_array, 1, groupings='a')
    assert str(exin.value) == type_error_grouping_num_general
    with pytest.raises(ValueError) as exin:
        fudt.group_by_column(num_array, 1, groupings=[])
    assert str(exin.value) == value_error_grouping_num_empty
    with pytest.raises(TypeError) as exin:
        fudt.group_by_column(num_array, 1, groupings=[5, 7.3, 8, 'a'])
    assert str(exin.value) == type_error_grouping_num_inner.format('a')
    with pytest.raises(ValueError) as exin:
        fudt.group_by_column(num_array, 1, groupings=[5, 7.3, 8, 7.9, 11])
    assert str(exin.value) == value_error_grouping_num_monotonicity

    with pytest.raises(TypeError) as exin:
        fudt.group_by_column(cat_array, 1, groupings='a')
    assert str(exin.value) == type_error_grouping_cat_general
    with pytest.raises(TypeError) as exin:
        fudt.group_by_column(cat_array, 0, groupings=[('3', ), ['a'], ('a', )])
    assert str(exin.value) == type_error_grouping_cat_tuple.format("['a']")
    with pytest.raises(ValueError) as exin:
        fudt.group_by_column(cat_array, 1, groupings=[])
    assert str(exin.value) == value_error_grouping_cat_empty
    with pytest.raises(ValueError) as exin:
        fudt.group_by_column(cat_array, 0, groupings=[('3', 'a'), ('1', )])
    assert str(exin.value) == value_error_grouping_cat_extra.format('1')
    with pytest.raises(ValueError) as exin:
        fudt.group_by_column(cat_array, 0, groupings=[('3', 'a'), ('a', )])
    assert str(exin.value) == value_error_grouping_cat_unique

    with pytest.raises(TypeError) as exin:
        fudt.group_by_column(cat_array, 0, treat_as_categorical='None')
    assert str(exin.value) == type_error_tac

    with pytest.warns(UserWarning) as warning:
        grp, grpn = fudt.group_by_column(cat_array, 0, groupings=[('3', )])
    assert len(warning) == 2
    assert user_warning_val.format("{'a'}") == str(warning[0].message)
    assert user_warning_ind.format('{0}') == str(warning[1].message)
    assert grp == [[1]]
    assert grpn == ["('3',)"]
    #
    nan_array = np.array([[0, np.inf], [0, 7], [0, -np.inf], [0, np.nan]])
    with pytest.warns(UserWarning) as warning:
        grp, grpn = fudt.group_by_column(nan_array, 1, groupings=[1])
    assert len(warning) == 1
    assert user_warning_ind.format('{3}') == str(warning[0].message)
    assert grp == [[2], [0, 1]]
    assert grpn == ['x <= 1', '1 < x']
def test_group_by_column():
    """
    Tests :func:`fatf.utils.data.tools.group_by_column`.
    """
    user_warning_tac = ('Selected feature is categorical, therefore cannot be '
                        'treated as numerical. The feature will be treated as '
                        'categorical despite the treat_as_categorical '
                        'parameter set to False.')

    n_1_grp = [[0, 1, 2, 5], [4], [], [], [3]]
    n_1_grps = ['x <= 7.6', '7.6 < x <= 16.2',
                '16.2 < x <= 24.799999999999997',
                '24.799999999999997 < x <= 33.4', '33.4 < x']  # yapf: disable
    n_0_grp = [[0, 5], [4], [1, 2, 3]]
    n_0_grps = ['x <= 0.05', '0.05 < x <= 7.7', '7.7 < x']
    n_2_grp = [[3], [0, 1, 2, 4, 5]]
    n_2_grps = ['x <= -6.5', '-6.5 < x']

    c_1_grp_d = [[0, 4], [3], [1, 2], [5]]
    c_1_grps_d = ["('a+',)", "('a-',)", "('b+',)", "('b-',)"]
    c_1_grp_c = [[0, 3, 4], [1, 2, 5]]
    c_1_grps_c = ["('a+', 'a-')", "('b+', 'b-')"]

    num_array = np.array([
        [0, 5, 6],
        [9, -1, 5],
        [14, 7, 2],
        [55, 42, -22],
        [7.7, 8.8, 9],
        [0.01, 7.0001, 5]
    ])  # yapf: disable
    struct_array = np.array(
        [(0, 'a+', 6),
         (9, 'b+', 5),
         (14, 'b+', 2),
         (55, 'a-', -22),
         (7.7, 'a+', 9),
         (0.01, 'b-', 5)],
        dtype=[('a', np.float32), ('b', 'U2'), ('c', np.int32)]
    )  # yapf: disable
    cat_array = np.array([
        ['a', 'a+', '1'],
        ['b', 'b+', '2'],
        ['b', 'b+', '3'],
        ['a', 'a-', '3'],
        ['b', 'a+', '2'],
        ['b', 'b-', '1']
    ])  # yapf: disable

    # Classic array, numerical -- all default
    grp, grpn = fudt.group_by_column(num_array, 1)
    assert grp == n_1_grp
    assert grpn == n_1_grps
    grp, grpn = fudt.group_by_column(num_array, 1, treat_as_categorical=False)
    assert grp == n_1_grp
    assert grpn == n_1_grps
    grp, grpn = fudt.group_by_column(num_array, 2, treat_as_categorical=True)
    assert grp == [[3], [2], [1, 5], [0], [4]]
    assert grpn == ['(-22.0,)', '(2.0,)', '(5.0,)', '(6.0,)', '(9.0,)']

    # Structured array, numerical -- custom bins number (treat_as_categorical)
    grp, grpn = fudt.group_by_column(
        struct_array, 'c', numerical_bins_number=2)
    assert grp == n_2_grp
    assert grpn == n_2_grps

    # Structured array, numerical -- custom intervals
    grp, grpn = fudt.group_by_column(struct_array, 'a', groupings=[0.05, 7.7])
    assert grp == n_0_grp
    assert grpn == n_0_grps

    # Classic array, categorical -- default binning (treat_as_categorical)
    grp, grpn = fudt.group_by_column(cat_array, 1)
    assert grp == c_1_grp_d
    assert grpn == c_1_grps_d
    grp, grpn = fudt.group_by_column(cat_array, 1, treat_as_categorical=True)
    assert grp == c_1_grp_d
    assert grpn == c_1_grps_d
    with pytest.warns(UserWarning) as warning:
        grp, grpn = fudt.group_by_column(
            cat_array, 1, treat_as_categorical=False)
    assert len(warning) == 1
    assert str(warning[0].message) == user_warning_tac
    assert grp == c_1_grp_d
    assert grpn == c_1_grps_d
    grp, grpn = fudt.group_by_column(
        cat_array, 1, groupings=[('a-', ), ('b+', ), ('a+', ), ('b-', )])
    assert grp == c_1_grp_d
    assert grpn == c_1_grps_d

    # Structured array, categorical -- custom bins
    grp, grpn = fudt.group_by_column(
        struct_array, 'b', groupings=[('a-', 'a+'), ('b-', 'b+')])
    assert grp == c_1_grp_c
    assert grpn == c_1_grps_c
# ---------------------
#
# The measure of Sample Size Disparity can be achieved by calling the
# :func:`fatf.utils.data.tools.group_by_column` grouping function and counting
# the number of instances in each group. By doing that for the *target vector*
# (ground truth) we can see whether the classes in our data set are balanced
# for each sub-group defined by a specified set of values for that feature.
#
# In the example below we will check whether roughly the same number of data
# points was collected for *males* and *females*. Then we will see whether
# the class distribution (*fail* and *success*) for these two sub-populations
# is similar.

# Group the data based on the unique values of the 'gender' column
grouping_column = 'gender'
grouping_indices, grouping_names = fatf_data_tools.group_by_column(
    hr_X, grouping_column, treat_as_categorical=True)

# Print out the data distribution for the grouping
print('The grouping based on the *{}* feature has the '
      'following distribution:'.format(grouping_column))
for grouping_name, grouping_idx in zip(grouping_names, grouping_indices):
    print('    * "{}" grouping has {} instances.'.format(
        grouping_name, len(grouping_idx)))

# Get the class distribution for each sub-grouping
grouping_class_distribution = dict()
for grouping_name, grouping_idx in zip(grouping_names, grouping_indices):
    sg_y = hr_y[grouping_idx]
    sg_classes, sg_counts = np.unique(sg_y, return_counts=True)

    grouping_class_distribution[grouping_name] = dict()
    for sg_class, sg_count in zip(sg_classes, sg_counts):
        grouping_class_distribution[grouping_name][sg_class] = sg_count
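
# A short sketch (not part of the original snippet) that prints the class
# distribution gathered above, so the *fail*/*success* balance of the two
# sub-populations can be compared side by side.
print('The class distribution per sub-population:')
for grouping_name, class_distribution in grouping_class_distribution.items():
    print('    * The "{}" grouping has the following class '
          'distribution:'.format(grouping_name))
    for class_name, class_count in class_distribution.items():
        print('        - "{}": {} instances.'.format(class_name, class_count))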
Example #5
def sampling_bias(
    dataset: np.ndarray,
    column_index: Index,
    groupings: Optional[List[Union[float, Tuple[str]]]] = None,
    numerical_bins_number: int = 5,
    treat_as_categorical: Optional[bool] = None
) -> Tuple[List[int], np.ndarray, List[str]]:
    """
    Computes information needed for evaluating and remedying sampling bias.

    Computes the *number of instances* per sub-population defined by the input
    parameters, the *weights* that can be used for cost-sensitive learning to
    mitigate the sampling bias, and the *names* of the sub-populations (in
    terms of the selected feature and its values).

    .. note::
       To evaluate the sampling bias in terms of a binary ``True``/``False``
       answer, please use the
       :func:`fatf.accountability.data.measures.sampling_bias_check` function;
       to inspect pairwise sampling bias between sub-populations, use the
       :func:`fatf.accountability.data.measures.sampling_bias_grid_check`
       function.

    For warnings raised by this method, please see the documentation of the
    :func:`fatf.utils.data.tools.validate_indices_per_bin` function.

    Parameters
    ----------
    dataset, column_index, groupings, numerical_bins_number, and \
treat_as_categorical
        These parameters are described in the documentation of
        :func:`fatf.utils.data.tools.group_by_column` function and are used to
        define a grouping (i.e. sub-populations). If you have your own
        index-based grouping and would like to get counts and weights for
        cost-sensitive learning, please consider using
        :func:`fatf.accountability.data.measures.sampling_bias_indexed`
        function.

    Returns
    -------
    counts : List[integers]
        The number of data points in each sub-population defined by the
        partitioning of the selected feature.
    weights : numpy.ndarray
        A weight for every instance (that could be grouped, i.e. assigned to
        one of the sub-populations) in the input ``dataset``. The weights are
        useful for training a cost-sensitive classifier to mitigate the
        sampling bias. The weights are inversely proportional to the number of
        instance occurrences for every sub-population.
    bin_names : List[strings]
        The name of every sub-population (binning results) defined by the
        feature ranges for a numerical feature and feature value sets for a
        categorical feature.
    """
    indices_per_bin, bin_names = fudt.group_by_column(dataset, column_index,
                                                      groupings,
                                                      numerical_bins_number,
                                                      treat_as_categorical)

    assert fudt.validate_indices_per_bin(indices_per_bin), \
        'Binned indices list is invalid.'

    counts = [len(i) for i in indices_per_bin]
    weights = _get_weights(indices_per_bin)
    return counts, weights, bin_names
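
# A minimal usage sketch (not part of the fatf sources): counts, per-instance
# weights and bin names for a hypothetical, purely categorical dataset.
# Assumes the module-level imports used by the function above (numpy as np,
# fudt, _get_weights) are available.
example_data = np.array([['a', 'x'],
                         ['a', 'y'],
                         ['a', 'x'],
                         ['b', 'y']])
counts, weights, bin_names = sampling_bias(example_data, 0)
print(counts)     # e.g. [3, 1] -- three 'a' instances and one 'b' instance
print(bin_names)  # e.g. ["('a',)", "('b',)"]
print(weights)    # per-instance weights, inversely proportional to the counts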