Example 1
def _validate_data_header(X: np.ndarray, y: np.ndarray, n_samples: int,
                          n_features: int, y_names: np.ndarray) -> bool:
    """
    Checks whether the data read in is consistent with the csv header.

    For details on valid header formatting see the
    :func:`fatf.utils.datasets.load_data` documentation.

    Parameters
    ----------
    X : numpy.ndarray
        Array read in from ``numpy.genfromtxt``.
    y : numpy.ndarray
        Target variable indicating which class each sample in ``X`` belongs to.
    n_samples : integer
        Number of samples expected in ``X`` and ``y``.
    n_features : integer
        Number of features expected in ``X``.
    y_names : numpy.ndarray
        Unique class names of the target variable ``y``.

    Raises
    ------
    ValueError
        The number of samples in ``X`` and ``y`` or the number of features in
        the dataset ``X`` is not consistent with the header. Also, raised when
        the number of unique classes in ``y`` is not consistent with the
        header.

    Returns
    -------
    is_consistent : boolean
        True if the header is consistent with the data, False otherwise.
    """
    # pylint: disable=invalid-name
    assert fuav.is_2d_array(X), 'X has to be a 2-dimensional array.'
    assert fuav.is_1d_array(y), 'y has to be a 1-dimensional array.'
    assert fuav.is_1d_array(y_names), 'y_names must be a 1-dimensional array.'

    is_consistent = False
    if X.shape[0] != n_samples:
        raise ValueError('The number of samples in the dataset is not '
                         'consistent with the header.')
    # Use len(X[0]) in case X is a structured array.
    if len(X[0]) != n_features:
        raise ValueError('The number of features in the dataset is not '
                         'consistent with the header.')
    if y.shape[0] != n_samples:
        raise ValueError('The number of labels (target variables) is not '
                         'consistent with the header.')
    if y_names.shape[0]:
        if y_names.shape[0] != np.unique(y).shape[0]:
            raise ValueError('The number of classes is not consistent with '
                             'the header.')

    is_consistent = True
    return is_consistent
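A minimal usage sketch (the toy arrays below are illustrative and assume the
function is importable from its defining module):

import numpy as np

# Hypothetical data: 3 samples, 2 features and 2 classes.
X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y = np.array([0, 1, 0])
y_names = np.array(['negative', 'positive'])

# Returns True; raises ValueError if any count disagrees with the header.
assert _validate_data_header(X, y, n_samples=3, n_features=2, y_names=y_names)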
Example 2
def confusion_matrix_per_subgroup_indexed(
        indices_per_bin: List[np.ndarray],
        ground_truth: np.ndarray,
        predictions: np.ndarray,
        labels: Optional[List[Union[str, float]]] = None) -> List[np.ndarray]:
    """
    Computes confusion matrices for every defined sub-population.

    This is useful for computing a variety of performance metrics based on
    predefined instance index binning for each sub-population.

    This is an alternative to the
    :func:`fatf.utils.metrics.tools.confusion_matrix_per_subgroup` function
    and can be used when the desired instance binning is already available.

    For warnings and errors raised by this function please see the
    documentation of the
    :func:`fatf.utils.data.tools.validate_indices_per_bin` function.

    Parameters
    ----------
    indices_per_bin : List[List[integer]]
        A list of lists, with each inner list holding the row indices of a
        particular group (sub-population).
    ground_truth, predictions, labels
        These parameters are described in the documentation of
        :func:`fatf.utils.metrics.tools.get_confusion_matrix` function and are
        used to calculate confusion matrices.

    Returns
    -------
    population_confusion_matrix : List[numpy.ndarray]
        A list of confusion matrices for each sub-population.
    """
    assert fudt.validate_indices_per_bin(indices_per_bin), \
        'Binned indices list is invalid.'

    if labels is None:
        if not fuav.is_1d_array(ground_truth):
            raise IncorrectShapeError('The ground_truth parameter should be a '
                                      '1-dimensional numpy array.')
        if not fuav.is_1d_array(predictions):
            raise IncorrectShapeError('The predictions parameter should be a '
                                      '1-dimensional numpy array.')
        labels = np.sort(np.unique(np.concatenate([ground_truth,
                                                   predictions]))).tolist()

    population_confusion_matrix = []
    for bin_indices in indices_per_bin:
        confusion_matrix = get_confusion_matrix(ground_truth[bin_indices],
                                                predictions[bin_indices],
                                                labels)
        population_confusion_matrix.append(confusion_matrix)
    return population_confusion_matrix
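A hedged usage sketch; the binning below is hand-crafted, though in practice
it could come from a grouping utility such as
:func:`fatf.utils.data.tools.group_by_column`:

import numpy as np

ground_truth = np.array(['a', 'b', 'a', 'b'])
predictions = np.array(['a', 'b', 'b', 'b'])
# Two sub-populations: rows {0, 1} and rows {2, 3}.
indices_per_bin = [[0, 1], [2, 3]]

cms = confusion_matrix_per_subgroup_indexed(
    indices_per_bin, ground_truth, predictions)
# cms[0] -- confusion matrix of the first sub-population;
# cms[1] -- confusion matrix of the second one.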
Example 3
    def discretise(
            self, dataset: Union[np.ndarray,
                                 np.void]) -> Union[np.ndarray, np.void]:
        """
        Discretises numerical features of the ``dataset`` into quartiles.

        Parameters
        ----------
        dataset : Union[numpy.ndarray, numpy.void]
            A data point (1-D) or an array (2-D) of data points to be
            discretised.

        Raises
        ------
        IncorrectShapeError
            The input ``dataset`` is neither a 1- nor a 2-dimensional numpy
            array. The number of features (columns) in the input ``dataset``
            differs from the number of features in the dataset used to
            initialise this object.
        TypeError
            The dtype of the input ``dataset`` is too different from the dtype
            of the dataset used to initialise this object.

        Returns
        -------
        discretised_data : Union[numpy.ndarray, numpy.void]
            A discretised data array.
        """
        self._validate_input_discretise(dataset)

        if self.is_structured and fuav.is_1d_like(dataset):
            discretised_dataset = dataset.copy().astype(self.discretised_dtype)
        else:
            discretised_dataset = np.zeros_like(dataset,
                                                dtype=self.discretised_dtype)

        for feature in self.categorical_indices:
            if self.is_structured or fuav.is_1d_array(dataset):
                discretised_dataset[feature] = dataset[feature]
            else:
                discretised_dataset[:, feature] = dataset[:, feature]

        for feature, boundaries in self.feature_bin_boundaries.items():
            if self.is_structured or fuav.is_1d_array(dataset):
                discretised_dataset[feature] = np.searchsorted(
                    boundaries, dataset[feature])
            else:
                discretised_dataset[:, feature] = np.searchsorted(
                    boundaries, dataset[:, feature])

        return discretised_dataset
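The numerical branch above reduces to numpy.searchsorted against precomputed
quartile boundaries; a self-contained sketch of that mechanism (the feature
values below are illustrative):

import numpy as np

column = np.array([2.0, 5.0, 7.5, 9.0])
# Quartile boundaries: the 25th, 50th and 75th percentiles of the feature.
boundaries = np.percentile(column, [25, 50, 75])
# Each value is mapped to its quartile id: 0, 1, 2 or 3.
quartile_ids = np.searchsorted(boundaries, column)  # -> array([0, 1, 2, 3])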
Example 4
def structured_to_unstructured(
        structured_array: np.ndarray,
        **kwargs: Optional[np.dtype]) -> np.ndarray:  # pragma: no cover
    """
    Calls either the local or numpy's structured_to_unstructured function.

    numpy 1.16.0 introduced the
    :func:`numpy.lib.recfunctions.structured_to_unstructured` function. To
    ensure backwards compatibility down to numpy 1.9.0 this package implements
    its own version of this function
    (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`).
    This function calls the latter if a numpy version below 1.16.0 is
    installed. However, if numpy 1.16.0 or above is detected, numpy's
    implementation is used instead.

    For the description of ``structured_to_unstructured`` functionality either
    refer to the corresponding numpy
    (:func:`numpy.lib.recfunctions.structured_to_unstructured`) or local
    (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`)
    documentation.

    .. warning:: Since this function either calls a local implementation or a
       builtin numpy function there may be some inconsistencies in its
       behaviour. One that we are aware of is conversion of arrays that contain
       ``'V'`` -- raw data (void), ``'O'`` -- (Python) objects, ``'M'`` --
       datetime or ``'m'`` -- timedelta dtypes. These types are not supported
       by the local implementation, however some of them are supported by the
       numpy built-in, e.g. the ``'V'`` type.

    Parameters
    ----------
    structured_array : numpy.ndarray
        A structured numpy array to be converted into a plain numpy array.
    **kwargs : Optional[numpy.dtype]
        Named parameters that are passed to the appropriate structured to
        unstructured array converter. These parameters are ignored when calling
        the local implementation
        (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`).

    Returns
    -------
    classic_array : numpy.ndarray
        A classic numpy array representation of the ``structured_array`` with
        the most generic type out of the input array's dtypes.
    """
    # pylint: disable=no-member
    if _LOCAL_STRUCTURED_TO_UNSTRUCTURED:
        classic_array = fatf_structured_to_unstructured(structured_array)
    else:
        classic_array = recfn.structured_to_unstructured(
            structured_array, **kwargs)
        if (fuav.is_2d_array(structured_array)
                and fuav.is_1d_array(classic_array)):
            classic_array = classic_array.reshape(
                (structured_array.shape[0], 1))
    return classic_array
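For numpy 1.16.0 or newer the call above delegates to the built-in converter;
a self-contained example of that code path:

import numpy as np
from numpy.lib import recfunctions as recfn

structured = np.array([(1, 2.5), (3, 4.5)],
                      dtype=[('a', int), ('b', float)])
plain = recfn.structured_to_unstructured(structured)
# plain is a classic 2-D array of the most generic dtype:
# array([[1. , 2.5],
#        [3. , 4.5]])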
Example 5
def are_indices_valid(array: np.ndarray, indices: np.ndarray) -> bool:
    """
    Checks whether all the input ``indices`` are valid for the input ``array``.

    Parameters
    ----------
    array : numpy.ndarray
        The 2-dimensional array to be checked.
    indices : numpy.ndarray
        A 1-dimensional array of column indices.

    Raises
    ------
    TypeError
        Either of the input arrays is not a numpy array-like object.
    IncorrectShapeError
        The input array is not 2-dimensional or the indices array is not
        1-dimensional.

    Returns
    -------
    is_valid : boolean
        A Boolean variable that indicates whether the input column indices are
        valid indices for the input array.
    """
    if not (isinstance(array, np.ndarray) and isinstance(indices, np.ndarray)):
        raise TypeError('Input arrays should be numpy array-like objects.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_1d_array(indices):
        raise IncorrectShapeError('The indices array should be 1-dimensional.')

    invalid_indices = get_invalid_indices(array, indices)
    assert fuav.is_1d_array(invalid_indices), 'This should be a 1-d array.'

    is_valid = not bool(invalid_indices.shape[0])
    return is_valid
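A usage sketch covering both classic and structured arrays (column indices
are integers for the former and field names for the latter):

import numpy as np

plain = np.array([[1, 2], [3, 4]])
are_indices_valid(plain, np.array([0, 1]))  # True
are_indices_valid(plain, np.array([0, 2]))  # False -- there is no column 2

structured = np.array([(1, 2.0)], dtype=[('a', int), ('b', float)])
are_indices_valid(structured, np.array(['a', 'b']))  # True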
Example 6
def get_invalid_indices(array: np.ndarray, indices: np.ndarray) -> np.ndarray:
    """
    Returns a numpy array with the column indices that are missing from the
    input array.

    Parameters
    ----------
    array : numpy.ndarray
        A 2-dimensional array to be checked.
    indices : numpy.ndarray
        A 1-dimensional array of indices corresponding to columns in the input
        array.

    Raises
    ------
    TypeError
        Either of the input arrays is not a numpy array-like object.
    IncorrectShapeError
        The input array is not 2-dimensional or the indices array is not
        1-dimensional.

    Returns
    -------
    invalid_indices : numpy.ndarray
        A **sorted** array of indices that were not found in the input array.
    """
    if not (isinstance(array, np.ndarray) and isinstance(indices, np.ndarray)):
        raise TypeError('Input arrays should be numpy array-like objects.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_1d_array(indices):
        raise IncorrectShapeError('The indices array should be 1-dimensional.')

    if fuav.is_structured_array(array):
        array_indices = set(array.dtype.names)
    else:
        array_indices = set(range(array.shape[1]))

    # Alternatively use numpy's np.isin (which supersedes np.in1d):
    # invalid_indices = indices[np.isin(indices, array_indices, invert=True)]
    # or np.setdiff1d: invalid_indices = np.setdiff1d(indices, array_indices)
    invalid_indices = set(indices.tolist()) - array_indices
    return np.sort(list(invalid_indices))
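The final set difference matches the numpy one-liner suggested in the comment
above; a quick check of the equivalence:

import numpy as np

indices = np.array([0, 2, 5])
array_indices = np.arange(3)  # column indices of a 3-column classic array
np.setdiff1d(indices, array_indices)  # array([5]) -- the same sorted result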
Example 7
def _input_is_valid(distances: np.ndarray) -> bool:
    """
    Validates input parameters of a kernel function.

    Parameters
    ----------
    distances : numpy.ndarray
        A 1-dimensional numpy array of distances.

    Raises
    ------
    IncorrectShapeError
        The ``distances`` array is not a 1-dimensional numpy array.
    TypeError
        The ``distances`` array is a structured numpy array or it is not a
        purely numerical array.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    is_input_ok = False

    if fuav.is_structured_array(distances):
        raise TypeError('The distances array cannot be a structured array.')

    if not fuav.is_1d_array(distances):
        raise IncorrectShapeError('The distances array must be a '
                                  '1-dimensional array.')

    if not fuav.is_numerical_array(distances):
        raise TypeError('The distances array must be of numerical type.')

    is_input_ok = True
    return is_input_ok
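A usage sketch; a valid distances vector returns ``True``, while, e.g., a
structured or a 2-dimensional array raises an exception:

import numpy as np

_input_is_valid(np.array([0.0, 0.5, 1.2]))  # True
# _input_is_valid(np.ones((2, 2)))  # raises IncorrectShapeError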
Example 8
def get_confusion_matrix(
        ground_truth: np.ndarray,
        predictions: np.ndarray,
        labels: Optional[List[Union[str, float]]] = None) -> np.ndarray:
    """
    Computes a confusion matrix based on predictions and ground truth vectors.

    The confusion matrix (a.k.a. contingency table) has predictions in rows
    and ground truth in columns. If the value ordering is not provided via the
    ``labels`` parameter, the ordering is based on the alphanumeric sorting
    of the unique values in both of the input arrays.

    Parameters
    ----------
    ground_truth : numpy.ndarray
        An array holding the *true* target values.
    predictions : numpy.ndarray
        An array holding *predictions* of the target values.
    labels : List[Union[string, number]], optional (default=None)
        If a certain ordering of the labels in the confusion matrix is desired,
        it can be specified via this parameter. By default alphanumeric sorting
        is used.

    Warns
    -----
    UserWarning
        Some of the labels provided by the user are not present in either of
        the input arrays.

    Raises
    ------
    IncorrectShapeError
        The ``ground_truth`` and/or ``predictions`` vectors are not
        1-dimensional. The length of these two arrays does not agree.
    TypeError
        The ``labels`` parameter is not a list.
    ValueError
        The ``labels`` list is empty, it contains duplicate entries or some of
        the labels present in either of the input arrays are not accounted for
        by the ``labels`` list.

    Returns
    -------
    confusion_matrix : numpy.ndarray
        A confusion matrix.
    """
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth vector has to be a '
                                  '1-dimensional numpy array.')
    if not fuav.is_1d_array(predictions):
        raise IncorrectShapeError('The predictions vector has to be a '
                                  '1-dimensional numpy array.')
    if ground_truth.shape[0] != predictions.shape[0]:
        raise IncorrectShapeError('Both the ground truth and the predictions '
                                  'vectors have to have the same length.')

    all_values = np.concatenate([ground_truth, predictions])
    if labels is None:
        ordering = np.sort(np.unique(all_values)).tolist()
    elif isinstance(labels, list):
        if not labels:
            raise ValueError('The labels list cannot be empty.')
        labels_set = set(labels)
        if len(labels_set) != len(labels):
            raise ValueError('The labels list contains duplicates.')

        extra_labels = labels_set.difference(all_values)
        if extra_labels:
            warnings.warn(
                'Some of the given labels are not present in either of the '
                'input arrays: {}.'.format(extra_labels), UserWarning)

        unaccounted_labels = set(all_values).difference(labels_set)
        if unaccounted_labels:
            raise ValueError('The following labels are present in the input '
                             'arrays but were not given in the labels '
                             'parameter: {}.'.format(unaccounted_labels))

        ordering = labels
    else:
        raise TypeError('The labels parameter has to be either a list or '
                        'None.')

    confusion_matrix_list = []
    for pred in ordering:
        pdt = predictions == pred
        row = [np.logical_and(pdt, ground_truth == i).sum() for i in ordering]
        confusion_matrix_list.append(row)

    confusion_matrix = np.array(confusion_matrix_list)
    return confusion_matrix
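A self-contained example; note the orientation, with predictions in rows and
ground truth in columns:

import numpy as np

ground_truth = np.array(['a', 'a', 'b'])
predictions = np.array(['a', 'b', 'b'])
cm = get_confusion_matrix(ground_truth, predictions)
# Rows: predicted 'a', 'b'; columns: true 'a', 'b'.
# array([[1, 0],
#        [1, 1]])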
Example 9
def _validate_input(ice_pdp_array: np.ndarray,
                    feature_linespace: np.ndarray,
                    class_index: int,
                    feature_name: Union[None, str],
                    class_name: Union[None, str],
                    plot_axis: Union[None, plt.Axes],
                    test_partial_dependence: bool = False) -> bool:
    """
    Validates input parameters for ICE and PD plotting functions.

    Validates input parameters for
    :func:`fatf.vis.feature_influence.plot_individual_conditional_expectation`
    and :func:`fatf.vis.feature_influence.plot_partial_dependence` functions.

    Parameters
    ----------
    ice_pdp_array : numpy.ndarray
        An array that contains ICE or PD calculations.
    feature_linespace : numpy.ndarray
        An array that contains the values for which the selected feature was
        sampled.
    class_index : integer
        The index of the class for which the plot will be created.
    feature_name : string or None
        The name of the feature for which ICE or PD was originally calculated.
    class_name : string or None
        The name of the class that ``class_index`` parameter points to.
    plot_axis : matplotlib.pyplot.Axes or None
        A matplotlib axis object to plot on top of.
    test_partial_dependence : boolean
        Whether to treat the input array as PD or ICE calculation result.

    Raises
    ------
    IncorrectShapeError
        The ICE or the PD array has a wrong number of dimensions (3 and 2
        respectively). The feature linespace array has a wrong number of
        dimensions -- 1 is expected.
    IndexError
        The class index is invalid for the input array.
    TypeError
        The class index is not an integer; the feature name is not a string or
        a ``None``; the class name is not a string or a ``None``; the plot axis
        is not a matplotlib.pyplot.Axes type object or a ``None``.
    ValueError
        The input array is structured or is not numerical. The linespace array
        is structured, not numerical or its length does not agree with the
        number of steps in the input array.

    Returns
    -------
    input_is_valid : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-arguments,too-many-branches
    input_is_valid = False

    assert isinstance(test_partial_dependence, bool), \
        'test_partial_dependence is not a boolean.'

    if fuav.is_structured_array(ice_pdp_array):
        raise ValueError('The input array cannot be a structured array.')
    if not fuav.is_numerical_array(ice_pdp_array):
        raise ValueError('The input array has to be a numerical array.')

    if test_partial_dependence:
        if len(ice_pdp_array.shape) != 2:
            raise IncorrectShapeError('plot_partial_dependence expects a '
                                      '2-dimensional array of shape (n_steps, '
                                      'n_classes).')
    else:
        if len(ice_pdp_array.shape) != 3:
            raise IncorrectShapeError(
                'plot_individual_conditional_expectation expects a '
                '3-dimensional array of shape (n_samples, n_steps, '
                'n_classes).')

    if fuav.is_structured_array(feature_linespace):
        raise ValueError('The linespace array cannot be a structured array.')
    if not fuav.is_1d_array(feature_linespace):
        raise IncorrectShapeError('The linespace array has to be a '
                                  '1-dimensional array of shape (n_steps, ).')
    if not fuav.is_numerical_array(feature_linespace):
        raise ValueError('The linespace array has to be numerical.')
    if feature_linespace.shape[0] != ice_pdp_array.shape[-2]:
        raise ValueError('The length of the linespace array ({}) does not '
                         'agree with the number of linespace steps ({}) in '
                         'the input array.'.format(feature_linespace.shape[0],
                                                   ice_pdp_array.shape[-2]))

    # Is the index valid for the array
    if not isinstance(class_index, int):
        raise TypeError('Class index has to be an integer.')
    if class_index < 0 or class_index >= ice_pdp_array.shape[-1]:
        raise IndexError('Class index {} is not a valid index for the '
                         'input array. There are only {} classes '
                         'available.'.format(class_index,
                                             ice_pdp_array.shape[-1]))

    if feature_name is not None and not isinstance(feature_name, str):
        raise TypeError('The feature name has to be either None or a string.')

    if class_name is not None and not isinstance(class_name, str):
        raise TypeError('The class name has to be either None or a string.')

    if plot_axis is not None and not isinstance(plot_axis, plt.Axes):
        raise TypeError('The plot axis has to be either None or a matplotlib.'
                        'pyplot.Axes type object.')

    input_is_valid = True
    return input_is_valid
Example 10
def individual_conditional_expectation(
        dataset: np.ndarray,
        model: object,
        feature_index: Union[int, str],
        treat_as_categorical: Optional[bool] = None,
        steps_number: Optional[int] = None,
        include_rows: Optional[Union[int, List[int]]] = None,
        exclude_rows: Optional[Union[int, List[int]]] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculates Individual Conditional Expectation for a selected feature.

    Based on the provided dataset and model this function computes Individual
    Conditional Expectation (ICE) of a selected feature for all target classes.
    If the ``treat_as_categorical`` parameter is not provided, the function
    will infer the type of the selected feature and compute the appropriate
    ICE.
    Otherwise, the user can specify whether the selected feature should be
    treated as a categorical or numerical feature. If the selected feature is
    numerical, you can specify the number of samples between this feature's
    minimum and maximum value for which the input model will be evaluated.
    By default this value is set to 100.

    Finally, it is possible to filter the rows of the input dataset that will
    be used to calculate ICE with the ``include_rows`` and ``exclude_rows``
    parameters. If ``include_rows`` is specified, ICE will only be calculated
    for these rows. If both include and exclude parameters are given, ICE will
    be computed for their set difference. If only the exclude parameter is
    specified, these rows will be subtracted from the whole dataset.

    This approach is an implementation of a method introduced by
    [GOLDSTEIN2015PEEKING]_. It is intended to be used with probabilistic
    models, therefore the input model must have a ``predict_proba`` method.

    .. [GOLDSTEIN2015PEEKING] Goldstein, A., Kapelner, A., Bleich, J. and
       Pitkin, E., 2015. Peeking inside the black box: Visualizing statistical
       learning with plots of individual conditional expectation. Journal of
       Computational and Graphical Statistics, 24(1), pp.44-65.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset based on which ICE will be computed.
    model : object
        A fitted model whose predictions will be used to calculate ICE. (Please
        see :class:`fatf.utils.models.models.Model` class documentation for the
        expected model object specification.)
    feature_index : Union[integer, string]
        An index of the feature column in the input dataset for which ICE will
        be computed.
    treat_as_categorical : boolean, optional (default=None)
        Whether to treat the selected feature as categorical or numerical.
    steps_number : integer, optional (default=None, i.e. 100)
        The number of evenly spaced samples between the minimum and the maximum
        value of the selected feature for which the model's prediction will be
        evaluated. (This parameter applies only to numerical features.)
    include_rows : Union[int, List[int]], optional (default=None)
        Indices of rows that will be included in the ICE calculation. If this
        parameter is specified, ICE will only be calculated for the selected
        rows. If additionally ``exclude_rows`` is specified the selected rows
        will be a set difference between the two. This parameter can either be
        a *list* of indices or a single index (integer).
    exclude_rows : Union[int, List[int]], optional (default=None)
        The indices of rows to be excluded from the ICE calculation. If this
        parameter is specified and ``include_rows`` is not, these indices will
        be excluded from all of the rows. If both include and exclude
        parameters are specified, the rows included in the ICE calculation will
        be a set difference of the two. This parameter can either be a *list*
        of indices or a single index (integer).

    Warns
    -----
    UserWarning
        The feature is treated as categorical but the number of steps parameter
        is provided (not ``None``). In this case the ``steps_number`` parameter
        is ignored. Also, the user is warned when the selected feature is
        detected to be categorical (textual) while the user indicated that it
        is numerical.

    Raises
    ------
    IncompatibleModelError
        The model does not have required functionality -- it needs to be able
        to output probabilities via ``predict_proba`` method.
    IncorrectShapeError
        The input dataset is not a 2-dimensional numpy array.
    IndexError
        Provided feature (column) index is invalid for the input dataset.
    TypeError
        ``treat_as_categorical`` is not ``None`` or boolean. The
        ``steps_number`` parameter is not ``None`` or integer. Either
        ``include_rows`` or ``exclude_rows`` parameter is not ``None``, an
        integer or a list of integers.
    ValueError
        The input dataset must only contain base types (textual and numerical
        values). One of the ``include_rows`` or ``exclude_rows`` indices is not
        valid for the input dataset. The ``steps_number`` is smaller than 2.

    Returns
    -------
    ice : numpy.ndarray
        An array of Individual Conditional Expectations for all of the selected
        dataset rows and the feature (dataset column) of choice. It is of
        shape (n_samples, steps_number, n_classes), where n_samples is the
        number of rows selected from the dataset for the ICE computation,
        steps_number is the number of generated samples for the selected
        feature and n_classes is the number of classes in the target of the
        dataset. The numbers in this array represent the probability of every
        class for every selected data point when the selected feature is fixed
        to one of the values in the generated feature linespace (see below).
    feature_linespace : numpy.ndarray
        A one-dimensional array -- (steps_number, ) -- with the values for
        which the selected feature was substituted when the dataset was
        evaluated with the specified model.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    assert _input_is_valid(dataset, model, feature_index, treat_as_categorical,
                           steps_number), 'Input must be valid.'

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        column = dataset[feature_index]
    else:
        column = dataset[:, feature_index]
    assert fuav.is_1d_array(column), 'Column must be a 1-dimensional array.'

    if fuav.is_numerical_array(column):
        is_categorical_column = False
    elif fuav.is_textual_array(column):
        is_categorical_column = True
    else:
        assert False, 'Must be an array of a base type.'  # pragma: nocover

    # If needed, infer the column type.
    if treat_as_categorical is None:
        treat_as_categorical = is_categorical_column
    elif not treat_as_categorical and is_categorical_column:
        message = ('Selected feature is categorical (string-base elements), '
                   'however the treat_as_categorical was set to False. Such '
                   'a combination is not possible. The feature will be '
                   'treated as categorical.')
        warnings.warn(message, category=UserWarning)
        treat_as_categorical = True
        steps_number = None

    if treat_as_categorical and steps_number is not None:
        warnings.warn(
            'The steps_number parameter will be ignored as the feature is '
            'being treated as categorical.',
            category=UserWarning)

    # If needed, get the default steps number.
    if not treat_as_categorical and steps_number is None:
        steps_number = 100

    rows_number = dataset.shape[0]
    include_r = _filter_rows(include_rows, exclude_rows, rows_number)
    filtered_dataset = dataset[include_r]

    sampled_data, feature_linespace = _interpolate_array(
        filtered_dataset, feature_index, treat_as_categorical, steps_number)

    ice = [
        model.predict_proba(data_slice)  # type: ignore
        for data_slice in sampled_data
    ]
    ice = np.stack(ice, axis=0)

    return ice, feature_linespace
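A hedged end-to-end sketch; scikit-learn is used here only as a convenient
source of a fitted probabilistic model -- any object exposing a
``predict_proba`` method should work:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
# Assumption: a scikit-learn classifier satisfies the expected model API.
model = LogisticRegression(max_iter=1000).fit(iris.data, iris.target)

ice, linespace = individual_conditional_expectation(
    iris.data, model, feature_index=2, steps_number=50)
assert ice.shape == (iris.data.shape[0], 50, 3)
assert linespace.shape == (50, )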
Example 11
def _validate_input(dataset: np.ndarray,
                    ground_truth: Optional[np.ndarray] = None,
                    categorical_indices: Optional[List[Index]] = None,
                    int_to_float: bool = True) -> bool:
    """
    Validates the input parameters of an arbitrary augmentation class.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be used for sampling.
    ground_truth : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with labels for the supplied dataset.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    int_to_float : boolean, optional (default=True)
        If ``True``, all of the integer dtype columns in the ``dataset`` will
        be generalised to ``numpy.float64`` type. Otherwise, integer type
        columns will remain integer and floating point type columns will remain
        floating point.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The
        ``ground_truth`` array is not a 1-dimensional numpy array. The number
        of ground truth annotations is different from the number of rows in
        the data array.
    IndexError
        Some of the column indices given in the ``categorical_indices``
        parameter are not valid for the input ``dataset``.
    TypeError
        The ``categorical_indices`` parameter is neither a list nor ``None``.
        The ``dataset`` or the ``ground_truth`` array (if not ``None``) are not
        of base (numerical and/or string) type. The ``int_to_float`` parameter
        is not a boolean.

    Returns
    -------
    is_valid : boolean
        ``True`` if input is valid, ``False`` otherwise.
    """
    is_valid = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    if ground_truth is not None:
        if not fuav.is_1d_array(ground_truth):
            raise IncorrectShapeError('The ground_truth array must be '
                                      '1-dimensional. (Or None if it is not '
                                      'required.)')
        if not fuav.is_base_array(ground_truth):
            raise TypeError('The ground_truth array must be of a base type.')
        if ground_truth.shape[0] != dataset.shape[0]:
            raise IncorrectShapeError('The number of labels in the '
                                      'ground_truth array is not equal to the '
                                      'number of data points in the dataset '
                                      'array.')

    if categorical_indices is not None:
        if isinstance(categorical_indices, list):
            invalid_indices = fuat.get_invalid_indices(
                dataset, np.asarray(categorical_indices))
            if invalid_indices.size:
                raise IndexError('The following indices are invalid for the '
                                 'input dataset: {}.'.format(invalid_indices))
        else:
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')

    if not isinstance(int_to_float, bool):
        raise TypeError('The int_to_float parameter has to be a boolean.')

    is_valid = True
    return is_valid
Example 12
def apply_to_column_grouping(
        labels: np.ndarray, predictions: np.ndarray,
        row_grouping: List[List[int]], fnc: Callable[[np.ndarray, np.ndarray],
                                                     float]) -> List[float]:
    """
    Applies a function to the specified groups of labels and predictions.

    This function allows applying a metric to a particular data grouping. The
    two main applications are group-based fairness and performance evaluation.

    Parameters
    ----------
    labels : numpy.ndarray
        A ground truth numpy array.
    predictions : numpy.ndarray
        A predictions numpy array.
    row_grouping : List[List[integer]]
        A list of lists representing row indices of the ground truth and
        prediction arrays resulting in their grouping.
    fnc : Callable[[numpy.ndarray, numpy.ndarray], number]
        A function (metric) that will be applied to all of the groups defined
        by the ``row_grouping`` parameter.

    Raises
    ------
    AttributeError
        The ``fnc`` parameter does not require two input parameters.
    IncorrectShapeError
        The ``labels`` or ``predictions`` parameter is not a 1-dimensional
        numpy array. The ``labels`` and ``predictions`` arrays are not of the
        same length.
    TypeError
        The ``row_grouping`` parameter is not a list. One of the elements of
        the ``row_grouping`` is not a list. Some of the elements in the inner
        list of the ``row_grouping`` list are not integers. The ``fnc``
        parameter is not a callable (function).
    ValueError
        The ``row_grouping`` parameter is an empty list. Some of the values in
        the ``row_grouping`` list are duplicated.

    Returns
    -------
    applied : List[numbers]
        A list with the ``fnc`` function result for every group defined by the
        ``row_grouping`` parameter.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_1d_array(labels):
        raise IncorrectShapeError('The labels array should be 1-dimensional.')
    if not fuav.is_1d_array(predictions):
        raise IncorrectShapeError('The predictions array should be '
                                  '1-dimensional.')
    if labels.shape[0] != predictions.shape[0]:
        raise IncorrectShapeError('The labels and predictions arrays should '
                                  'be of the same length.')

    if isinstance(row_grouping, list):
        if not row_grouping:
            raise ValueError('The row_grouping parameter cannot be an empty '
                             'list.')
        duplicated_indices = set()  # type: Set[int]
        for i in row_grouping:
            if not isinstance(i, list):
                raise TypeError('All of the elements of the row_grouping list '
                                'have to be lists.')
            if not i:
                raise ValueError('All of the elements of the row_grouping '
                                 'list must be non-empty lists.')
            for j in i:
                if not isinstance(j, int):
                    raise TypeError('All of the elements of the inner lists '
                                    'in the row_grouping have to be integers.')
            if duplicated_indices.intersection(i):
                raise ValueError('Some of the values in the row_grouping are '
                                 'duplicated.')
            duplicated_indices = duplicated_indices.union(i)
    else:
        raise TypeError('The row_grouping parameter has to be a list.')

    if not callable(fnc):
        raise TypeError('The fnc parameter is not callable (a function).')
    required_param_n = 0
    params = inspect.signature(fnc).parameters
    for param in params:
        if params[param].default is params[param].empty:
            required_param_n += 1
    if required_param_n != 2:
        raise AttributeError('Provided function (fnc) does not require 2 '
                             'input parameters. The first required parameter '
                             'should be ground truth labels and the second '
                             'one predictions.')

    applied = [fnc(labels[grp], predictions[grp]) for grp in row_grouping]

    return applied
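A self-contained example computing per-group accuracy (the metric is any
callable with exactly two required parameters: ground truth and predictions):

import numpy as np

labels = np.array([1, 0, 1, 1])
predictions = np.array([1, 0, 0, 1])
row_grouping = [[0, 1], [2, 3]]

def accuracy(ground_truth, predicted):
    return (ground_truth == predicted).mean()

apply_to_column_grouping(labels, predictions, row_grouping, accuracy)
# -> [1.0, 0.5]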
Example 13
def _validate_input_lasso_path(dataset: np.ndarray, target: np.ndarray,
                               weights: Union[np.ndarray, None],
                               features_number: Union[int, None],
                               features_percentage: int) -> bool:
    """
    Validates the input parameters of the ``lasso_path`` function.

    For the input parameter description, warnings and exceptions please see
    the documentation of the
    :func:`fatf.utils.data.feature_selection.sklearn.lasso_path` function.

    Returns
    -------
    input_is_valid : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-branches
    input_is_valid = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input data set must be a 2-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(dataset):
        raise TypeError('The input data set must be purely numerical. (The '
                        'lasso path feature selection is based on '
                        'sklearn.linear_model.lars_path function.)')

    if not fuav.is_1d_array(target):
        raise IncorrectShapeError('The target array must be a 1-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(target):
        raise TypeError('The target array must be numerical since this '
                        'feature selection method is based on Lasso '
                        'regression.')
    if target.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of labels in the target array '
                                  'must agree with the number of samples in '
                                  'the data set.')

    if weights is not None:
        if not fuav.is_1d_array(weights):
            raise IncorrectShapeError('The weights array must be '
                                      '1-dimensional.')
        if not fuav.is_numerical_array(weights):
            raise TypeError('The weights array must be purely numerical.')
        if weights.shape[0] != dataset.shape[0]:
            raise IncorrectShapeError('The number of weights in the weights '
                                      'array must be the same as the number '
                                      'of samples in the input data set.')

    if features_number is not None:
        if not isinstance(features_number, int):
            raise TypeError('The features_number parameter must be an '
                            'integer.')
        if features_number < 1:
            raise ValueError('The features_number parameter must be a '
                             'positive integer.')

    if not isinstance(features_percentage, int):
        raise TypeError('The features_percentage parameter must be an '
                        'integer.')
    if features_percentage < 0 or features_percentage > 100:
        raise ValueError('The features_percentage parameter must be between '
                         '0 and 100 (inclusive).')

    input_is_valid = True
    return input_is_valid
Example 14
def test_lasso_path(caplog):
    """
    Tests the :func:`fatf.utils.data.feature_selection.sklearn.lasso_path`
    function.
    """
    no_lasso_log = ('The lasso path feature selection could not pick any '
                    'feature subset. All of the features were selected.')
    less_lasso_log = ('The lasso path feature selection could not pick {} '
                      'features. Only {} were selected.')

    assert len(caplog.records) == 0
    fatf.setup_random_seed()
    assert len(caplog.records) == 2
    assert caplog.records[0].levelname == 'INFO'
    assert caplog.records[0].getMessage().startswith('Seeding RNGs ')
    assert caplog.records[1].levelname == 'INFO'
    assert caplog.records[1].getMessage() == 'Seeding RNGs with 42.'

    # Weights and no-weights
    weights = np.ones((NUMERICAL_NP_ARRAY.shape[0], ))
    # Classic array -- weights
    features = fudfs.lasso_path(NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET,
                                weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 1]))
    # Structured array -- no-weights
    features = fudfs.lasso_path(
        NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_number=2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array(['a', 'b']))
    #
    # Selecting exactly 4 features -- no need for Lasso
    features = fudfs.lasso_path(NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET,
                                weights, 4)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 1, 2, 3]))
    # Selecting more than 4 features
    with pytest.warns(UserWarning) as warning:
        features = fudfs.lasso_path(NUMERICAL_STRUCT_ARRAY,
                                    NUMERICAL_NP_ARRAY_TARGET, weights, 5)
    assert len(warning) == 1
    assert str(warning[0].message) == FEATURE_INDICES_WARNING
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array(['a', 'b', 'c', 'd']))
    #
    # No features number -- just percentage
    features = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=50)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 1]))
    # No features number -- just percentage -- too small no features selected
    assert len(caplog.records) == 2
    features = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=24)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0]))
    assert len(caplog.records) == 3
    assert caplog.records[2].levelname == 'WARNING'
    assert caplog.records[2].getMessage() == FEATURE_PERCENTAGE_LOG

    # Weights too small so no path is found -- returns all features
    weights = np.array([1, 1, 100, 1, 1, 1]) * 1e-20
    assert len(caplog.records) == 3
    features = fudfs.lasso_path(NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET,
                                weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 1, 2, 3]))
    assert len(caplog.records) == 4
    assert caplog.records[3].levelname == 'WARNING'
    assert caplog.records[3].getMessage() == no_lasso_log

    # Another selection
    weights = np.array([1, 1, 100, 1, 1, 1])
    features = fudfs.lasso_path(NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET,
                                weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 2]))
    features = fudfs.lasso_path(NUMERICAL_STRUCT_ARRAY,
                                NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array(['a', 'c']))

    # Lasso with no possibility of reducing the number of features
    assert len(caplog.records) == 4
    features = fudfs.lasso_path(
        np.array([[1, 2, 3], [2, 2, 3], [3, 2, 3], [4, 2, 3]]),
        np.array([1, 2, 3, 4]),
        features_number=2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0]))
    assert len(caplog.records) == 5
    assert caplog.records[4].levelname == 'WARNING'
    assert caplog.records[4].getMessage() == less_lasso_log.format(2, 1)
Example 15
def _validate_input_local_fidelity(
        dataset: np.ndarray, data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Union[int, None],
        explained_feature_indices: Union[List[IndexType], None],
        fidelity_radius_percentage: int, samples_number: int) -> bool:
    """
    Validates the input parameters for the ``local_fidelity_score`` function.

    This function validates the input parameters of the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function. The description of its input parameters, errors and exceptions
    can be found therein.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
    is_input_ok = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- numbers '
                        'and/or strings.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data_row must either be a '
                                  '1-dimensional numpy array or a numpy '
                                  'void object for structured data rows.')

    are_similar = fuav.are_similar_dtype_arrays(dataset, np.array([data_row]))
    if not are_similar:
        raise TypeError('The dtype of the data_row is too different from '
                        'the dtype of the dataset array.')

    # If the dataset is structured and the data_row has a different
    # number of features this will be caught by the above dtype check.
    # For classic numpy arrays this has to be done separately.
    if not fuav.is_structured_array(dataset):
        if dataset.shape[1] != data_row.shape[0]:
            raise IncorrectShapeError('The data_row must contain the same '
                                      'number of features as the dataset.')

    if callable(global_predictive_function):
        global_params_n = fuv.get_required_parameters_number(
            global_predictive_function)
        if global_params_n != 1:
            raise IncompatibleModelError(
                'The global predictive function must have exactly *one* '
                'required parameter to work with this metric.')
    else:
        raise TypeError('The global_predictive_function should be a Python '
                        'callable, e.g., a Python function.')

    if callable(local_predictive_function):
        local_params_n = fuv.get_required_parameters_number(
            local_predictive_function)
        if local_params_n != 1:
            raise IncompatibleModelError(
                'The local predictive function must have exactly *one* '
                'required parameter to work with this metric.')
    else:
        raise TypeError('The local_predictive_function should be a Python '
                        'callable, e.g., a Python function.')

    if callable(metric_function):
        if fuv.get_required_parameters_number(metric_function) != 2:
            raise TypeError('The metric_function must take exactly *two* '
                            'required parameters.')
    else:
        raise TypeError('The metric_function should be a Python callable, '
                        'e.g., a Python function.')

    # Explained class index
    global_prediction = global_predictive_function(dataset[:1])
    assert not fuav.is_structured_array(global_prediction), 'Must be plain.'
    assert global_prediction.shape[0] == 1, 'Just 1 data point was predicted.'
    if fuav.is_2d_array(global_prediction):  # A probabilistic model.
        if explained_class_index is not None:
            if isinstance(explained_class_index, int):
                if (explained_class_index >= global_prediction.shape[1]
                        or explained_class_index < 0):
                    raise ValueError('The explained_class_index parameter is '
                                     'negative or larger than the number of '
                                     'classes output by the global '
                                     'probabilistic model.')
            else:
                raise TypeError('For probabilistic global models, i.e., '
                                'global predictive functions, the '
                                'explained_class_index parameter has to be an '
                                'integer or None.')
    elif fuav.is_1d_array(global_prediction):
        if explained_class_index is not None:
            warnings.warn(
                'The explained_class_index parameter is not None and will be '
                'ignored since the global model is not probabilistic.',
                UserWarning)
    else:
        assert False, ('Global predictor must output a 1- or 2-dimensional '
                       'numpy array.')  # pragma: nocover

    if explained_feature_indices is not None:
        if isinstance(explained_feature_indices, list):
            invalid_indices = fuat.get_invalid_indices(
                dataset, np.asarray(explained_feature_indices))
            if invalid_indices.size:
                raise IndexError(
                    'The following column indices are invalid for the input '
                    'dataset: {}.'.format(invalid_indices))
        else:
            raise TypeError('The explained_feature_indices parameter must be '
                            'a Python list or None.')

    if isinstance(fidelity_radius_percentage, int):
        if fidelity_radius_percentage <= 0 or fidelity_radius_percentage > 100:
            raise ValueError('The fidelity_radius_percentage must be an '
                             'integer between 1 and 100.')
    else:
        raise TypeError('The fidelity_radius_percentage must be an integer '
                        'between 1 and 100.')

    if isinstance(samples_number, int):
        if samples_number < 1:
            raise ValueError('The samples_number must be a positive integer.')
    else:
        raise TypeError('The samples_number must be an integer.')

    is_input_ok = True
    return is_input_ok
Example 16
    def occlude_segments_vectorised(
            self,
            vectorised_segments_subset: np.ndarray,
            image: Optional[np.ndarray] = None,
            colour: Optional[Union[str, int, RGBcolour]] = None) -> np.ndarray:
        """
        Generates images with selected subsets of segments occluded.

        The segments to be occluded are provided as boolean vectors;
        either a 1-D numpy array of length equal to the number of segments
        to produce a single occluded image, or a 2-D array where each row
        represents a separate occlusion pattern.
        In this format the n-th element or column corresponds to the segment
        with id n+1;
        1 indicates that the segment should be preserved and 0 that it should
        be occluded.

        The occlusion is applied on top of the image used to initialise this
        class; alternatively, an external ``image`` of the same type and
        dimensions can be supplied.
        If a colouring strategy different to the one of the class is desired,
        it can be specified via the ``colour`` parameter.

        Parameters
        ----------
        vectorised_segments_subset : numpy.ndarray
            A 1-D boolean occlusion vector of length equal to the number of
            segments or a 2-D boolean matrix of shape (number of occlusion
            images to generate, number of segments).
        image : numpy.ndarray, optional (default=None)
            If provided, this ``image`` will be occluded instead of the one
            used to initialise this class.
        colour : string, integer, tuple(integer, integer, integer), \
optional (default=None)
            A colour specifier.
            By default (``colour=None``) the colouring strategy of the class is
            used.
            See the documentation of the
            :func:`fatf.utils.data.occlusion.Occlusion.set_colouring_strategy`
            method for more details.

        Raises
        ------
        IncorrectShapeError
            The ``vectorised_segments_subset`` numpy array is neither 1- nor
            2-dimensional.
            The number of elements in ``vectorised_segments_subset`` (when it
            is 1-D) does not correspond to the number of segments.
            The number of columns in ``vectorised_segments_subset`` (when it is
            2-D) does not correspond to the number of segments.
            The input ``image`` is neither a 2- nor 3-dimensional numpy array.
            The height, width or the number of channels in the ``image``
            array does not agree with the same parameters of the class image.
        TypeError
            The ``vectorised_segments_subset`` numpy array is not boolean.

        Returns
        -------
        image_occluded : numpy.ndarray
            A numpy array holding the image(s) with the selected subset(s) of
            segments occluded.
        """
        # pylint: disable=too-many-branches
        if image is None:
            canvas = self.image
        else:
            assert (  # yapf: disable
                fuds._validate_image_array(  # pylint: disable=protected-access
                    image, 'image')), 'Invalid image.'
            if image.shape != self.image.shape:
                raise IncorrectShapeError(
                    'The width, height or number of channels of the input '
                    'image does not agree with the same parameters of the '
                    'original image.')
            canvas = image

        if colour is None:
            colouring_strategy = self._colouring_strategy
        else:
            colouring_strategy = self._generate_colouring_strategy(colour)

        if fuav.is_structured_array(vectorised_segments_subset):
            raise TypeError('The vector representation of segments cannot be '
                            'a structured numpy array.')
        if not fuav.is_numerical_array(vectorised_segments_subset):
            raise TypeError('The vector representation of segments should be '
                            'a numerical numpy array.')
        if fuav.is_1d_array(vectorised_segments_subset):
            if vectorised_segments_subset.shape[0] != self.segments_number:
                raise IncorrectShapeError(
                    ('The number of elements ({}) in the vector '
                     'representation of segments should correspond to the '
                     'unique number of segments ({}).').format(
                         vectorised_segments_subset.shape[0],
                         self.segments_number))
            samples = 1
            vectorised_segments_subset = np.asarray(
                [vectorised_segments_subset])
        elif fuav.is_2d_array(vectorised_segments_subset):
            if vectorised_segments_subset.shape[1] != self.segments_number:
                raise IncorrectShapeError(
                    ('The number of columns ({}) in the vector representation '
                     'of segments should correspond to the unique number of '
                     'segments ({}).').format(
                         vectorised_segments_subset.shape[1],
                         self.segments_number))
            samples = vectorised_segments_subset.shape[0]
        else:
            raise IncorrectShapeError(
                'The vector representation of segments should be a 1- or '
                '2-dimensional numpy array.')
        _unique_entries = set(np.unique(vectorised_segments_subset).astype(
            int)).difference((0, 1))  # yapf: disable
        if _unique_entries:
            raise TypeError('The vector representation of segments should be '
                            'a binary numpy array.')

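        # Tile the canvas once per occlusion vector so that each mask is
        # applied to its own copy of the image.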
        image_occluded = np.repeat(canvas[np.newaxis, :], samples, axis=0)
        for i, vec in enumerate(vectorised_segments_subset):
            # Get ids of segments to be occluded (0s) from a vector form
            # 1 is added as segments are numbered from 1, not 0
            segments_subset = np.where(vec == 0)[0] + 1
            occlusion_mask = fuds.get_segment_mask(segments_subset.tolist(),
                                                   self.segments)
            image_occluded[i, occlusion_mask] = colouring_strategy(
                occlusion_mask)
        if samples == 1:
            image_occluded = image_occluded[0]

        return image_occluded
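
A minimal usage sketch of the vectorised occlusion above (the
constructor argument order and the method name
``occlude_segments_vectorised`` are assumptions; only the ``Occlusion``
class itself is referenced in the docstring):

import numpy as np

from fatf.utils.data.occlusion import Occlusion

# A toy 4x4 RGB image with four 2x2 segments numbered 1--4.
image = np.zeros((4, 4, 3), dtype=np.uint8)
segments = np.array([[1, 1, 2, 2],
                     [1, 1, 2, 2],
                     [3, 3, 4, 4],
                     [3, 3, 4, 4]])

occluder = Occlusion(segments, image)  # assumed argument order

# A 1-D mask occludes the segments marked with 0 -- here 2 and 3.
occluded = occluder.occlude_segments_vectorised(
    np.array([1, 0, 0, 1], dtype=bool))

# A 2-D mask yields one occluded image per row.
occluded_pair = occluder.occlude_segments_vectorised(
    np.array([[1, 0, 0, 1],
              [0, 1, 1, 0]], dtype=bool))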
Example n. 17
    def __init__(self,
                 data: np.ndarray,
                 local_explanation: bool = True,
                 model: object = None,
                 **kwargs: Any) -> None:
        """
        Initialises a tabular LIME wrapper.
        """
        # pylint: disable=too-many-branches,too-many-statements

        warnings.warn(
            'The LIME wrapper will be deprecated in FAT Forensics version '
            '0.0.3. Please consider using the TabularBlimeyLime explainer '
            'class implemented in the fatf.transparency.predictions.'
            'surrogate_explainers module instead. Alternatively, you may '
            'consider building a custom surrogate explainer using the '
            'functionality implemented in FAT Forensics -- see the *Tabular '
            'Surrogates* how-to guide for more details.', FutureWarning)

        valid_params = self._INIT_PARAMS.union(self._EXPLAIN_INSTANCE_PARAMS)
        invalid_params = set(kwargs.keys()).difference(valid_params)
        if invalid_params:
            raise AttributeError('The following named parameters are not '
                                 'valid: {}.'.format(invalid_params))

        # Split parameters
        init_params = {
            key: kwargs[key]
            for key in kwargs if key in self._INIT_PARAMS
        }
        explain_params = {
            key: kwargs[key]
            for key in kwargs if key in self._EXPLAIN_INSTANCE_PARAMS
        }

        # Check data
        if not fuav.is_2d_array(data):
            raise IncorrectShapeError('The data parameter must be a '
                                      '2-dimensional numpy array.')
        if not fuav.is_numerical_array(data):
            raise ValueError('LIME does not support non-numerical data '
                             'arrays.')

        # Honour native local explanation keyword
        local_explanation_keyword = 'sample_around_instance'
        if local_explanation_keyword not in init_params:
            init_params[local_explanation_keyword] = local_explanation

        # Sort out a structured data array
        if fuav.is_structured_array(data):
            categorical_indices_keyword = 'categorical_features'
            categorical_indices = init_params.get(categorical_indices_keyword,
                                                  None)

            if categorical_indices is not None:
                if isinstance(categorical_indices, list):
                    categorical_indices = np.array(categorical_indices)
                elif isinstance(categorical_indices, np.ndarray):
                    pass
                else:
                    raise TypeError('The {} parameter either has to be a '
                                    'list, a numpy array or None.'.format(
                                        categorical_indices_keyword))

                if not fuav.is_1d_array(categorical_indices):
                    raise IncorrectShapeError(
                        '{} array/list is not '
                        '1-dimensional.'.format(categorical_indices_keyword))
                if not fuav.is_textual_array(categorical_indices):
                    raise ValueError('Since {} is an array of indices for '
                                     'a structured array, all of its elements '
                                     'should be strings.'.format(
                                         categorical_indices_keyword))

                # Check categorical indices
                if not fuat.are_indices_valid(data, categorical_indices):
                    raise ValueError(
                        'Indices given in the {} parameter '
                        'are not valid for the input data '
                        'array.'.format(categorical_indices_keyword))
                init_params[categorical_indices_keyword] = np.array(
                    [data.dtype.names.index(y) for y in categorical_indices])

            data = fuat.as_unstructured(data)

        # Get a LIME tabular explainer
        self.mode = init_params.get('mode', 'classification')
        if self.mode not in ['classification', 'regression']:
            raise ValueError("The mode must be either 'classification' or "
                             "'regression'. '{}' given.".format(self.mode))

        self.tabular_explainer = lime.lime_tabular.LimeTabularExplainer(
            data, **init_params)

        # Check the model
        self.model = model
        self.model_is_probabilistic = False
        if model is not None:
            if fumv.check_model_functionality(
                    model, require_probabilities=True, suppress_warning=True):
                self.model_is_probabilistic = True
            elif fumv.check_model_functionality(
                    model, require_probabilities=False, suppress_warning=True):
                self.model_is_probabilistic = False
                logger.warning('The model can only be used for LIME in a '
                               'regressor mode.')
            else:
                raise IncompatibleModelError('LIME requires a model object to '
                                             'have a fit method and '
                                             'optionally a predict_proba '
                                             'method.')

        # Check the predictive function and memorise parameters that may be
        # useful for explaining an instance
        pred_fn_name = 'predict_fn'
        if pred_fn_name in explain_params:
            prediction_function = explain_params[pred_fn_name]
            # Make sure that it is a function
            if not callable(prediction_function):
                raise TypeError('The {} parameter is not callable -- it has '
                                'to be a function.'.format(pred_fn_name))

            # Warn the user if both a model and a function are provided
            if self.model is not None:
                warnings.warn(
                    'Since both a model and a predictive function are '
                    'provided, only the latter will be used.', UserWarning)

        self.explain_instance_params = explain_params
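
A hedged construction sketch for the wrapper above (the
``fatf.transparency.lime.Lime`` import path is an assumption; any
scikit-learn classifier with a ``predict_proba`` method passes the
model check):

import numpy as np
from sklearn.linear_model import LogisticRegression

from fatf.transparency.lime import Lime  # assumed import path

data = np.random.random((16, 4))
labels = np.array([0, 1] * 8)
clf = LogisticRegression().fit(data, labels)

# Keyword arguments are split between the LimeTabularExplainer
# initialiser and its explain_instance method; any keyword outside
# these two sets raises an AttributeError.
explainer = Lime(data, model=clf, mode='classification')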
Example n. 18
def test_highest_weights(caplog):
    """
    Tests :func:`fatf.utils.data.feature_choice.sklearn.highest_weights`.
    """
    assert len(caplog.records) == 0
    fatf.setup_random_seed()
    assert len(caplog.records) == 2
    assert caplog.records[0].levelname == 'INFO'
    assert caplog.records[0].getMessage().startswith('Seeding RNGs ')
    assert caplog.records[1].levelname == 'INFO'
    assert caplog.records[1].getMessage() == 'Seeding RNGs with 42.'

    # Weights and no-weights
    weights = np.ones((NUMERICAL_NP_ARRAY.shape[0], ))
    # Classic array -- weights
    features = fudfs.highest_weights(NUMERICAL_NP_ARRAY,
                                     NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([1, 2]))
    # Structured array -- no-weights
    features = fudfs.highest_weights(
        NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_number=2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array(['b', 'c']))
    #
    # Selecting exactly 4 features -- no need for Lasso
    features = fudfs.highest_weights(NUMERICAL_NP_ARRAY,
                                     NUMERICAL_NP_ARRAY_TARGET, weights, 4)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 1, 2, 3]))
    # Selecting more than 4 features
    with pytest.warns(UserWarning) as warning:
        features = fudfs.highest_weights(NUMERICAL_STRUCT_ARRAY,
                                         NUMERICAL_NP_ARRAY_TARGET, weights, 5)
    assert len(warning) == 1
    assert str(warning[0].message) == FEATURE_INDICES_WARNING
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array(['a', 'b', 'c', 'd']))
    #
    # No features number -- just percentage
    features = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=50)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([1, 2]))
    # No features number -- just percentage -- too small, one feature forced
    assert len(caplog.records) == 2
    features = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=24)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([2]))
    assert len(caplog.records) == 3
    assert caplog.records[2].levelname == 'WARNING'
    assert caplog.records[2].getMessage() == FEATURE_PERCENTAGE_LOG

    # Small weights
    weights = np.array([1, 1, 100, 1, 1, 1]) * 1e-20
    features = fudfs.highest_weights(NUMERICAL_NP_ARRAY,
                                     NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 1]))

    # Another selection
    weights = np.array([100, 1, 1, 1, 1, 1])
    features = fudfs.highest_weights(NUMERICAL_NP_ARRAY,
                                     NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([2, 3]))
    features = fudfs.highest_weights(NUMERICAL_STRUCT_ARRAY,
                                     NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array(['c', 'd']))

    # Custom data
    features = fudfs.highest_weights(
        np.array([[1, 2, 3], [2, 2, 3], [3, 2, 3], [4, 2, 3]]),
        np.array([1, 2, 3, 4]),
        features_number=2)
    assert fuav.is_1d_array(features)
    assert np.array_equal(features, np.array([0, 2]))
    assert len(caplog.records) == 3
Example n. 19
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Fits the model.

        Parameters
        ----------
        X : numpy.ndarray
            The KNN training data.
        y : numpy.ndarray
            The KNN training labels.

        Raises
        ------
        IncorrectShapeError
            Either the ``X`` array is not 2-dimensional, the ``y`` array is not
            1-dimensional, the number of rows in ``X`` is not the same as the
            number of elements in ``y`` or the ``X`` array has 0 rows or 0
            columns.
        PrefittedModelError
            Trying to fit the model when it has already been fitted. Usually
            raised when calling the ``fit`` method for the second time without
            clearing the model first.
        TypeError
            Trying to fit a KNN predictor in a regressor mode with a
            non-numerical target variable.
        """
        if self._is_fitted:
            raise PrefittedModelError('This model has already been fitted.')
        if not fuav.is_2d_array(X):
            raise IncorrectShapeError('The training data must be a 2-'
                                      'dimensional array.')
        if not fuav.is_1d_array(y):
            raise IncorrectShapeError('The training data labels must be a 1-'
                                      'dimensional array.')
        if X.shape[0] == 0:
            raise IncorrectShapeError('The data array has to have at least '
                                      'one data point.')
        # If the array is structured the fuav.is_2d_array function takes care
        # of checking whether there is at least one column
        if not fuav.is_structured_array(X) and X.shape[1] == 0:
            raise IncorrectShapeError('The data array has to have at least '
                                      'one feature.')
        if X.shape[0] != y.shape[0]:
            raise IncorrectShapeError('The number of samples in X must be the '
                                      'same as the number of labels in y.')
        if not self._is_classifier and not fuav.is_numerical_array(y):
            raise TypeError('Regressor can only be fitted for a numerical '
                            'target vector.')

        numerical_indices, categorical_indices = fuat.indices_by_type(X)
        self._numerical_indices = numerical_indices
        self._categorical_indices = categorical_indices

        self._is_structured = fuav.is_structured_array(X)
        self._X = X
        self._y = y

        if self._is_classifier:
            unique_y, unique_y_counts = np.unique(self._y, return_counts=True)
            # Order labels lexicographically.
            unique_y_sort_index = np.argsort(unique_y)
            self._unique_y = unique_y[unique_y_sort_index]
            self._unique_y_counts = unique_y_counts[unique_y_sort_index]

            # Find all labels sharing the highest count; ties are broken
            # lexicographically when choosing the majority label.
            top_y_index = self._unique_y_counts == np.max(
                self._unique_y_counts)
            top_y_unique_sorted = np.sort(self._unique_y[top_y_index])
            self._majority_label = top_y_unique_sorted[0]

            self._unique_y_probabilities = (self._unique_y_counts /
                                            self._y.shape[0])
        else:
            self._majority_label = self._y.mean()
            self._unique_y = np.ndarray((0, ))
            self._unique_y_counts = np.ndarray((0, ))
            self._unique_y_probabilities = np.ndarray((0, ))

        self._X_n = self._X.shape[0]
        self._is_fitted = True
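
A short fitting sketch (the ``fatf.utils.models.KNN`` import path and
the ``k`` parameter are assumptions):

import numpy as np

from fatf.utils.models import KNN  # assumed import path

X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array(['a', 'b', 'a', 'b'])

knn = KNN(k=3)
knn.fit(X, y)
# Calling fit again without clearing the model first raises
# PrefittedModelError.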
Example n. 20
def counterfactual_fairness_check(unfair_counterfactuals: Optional[
    np.ndarray] = None,
                                  distances: Optional[np.ndarray] = None,
                                  threshold: Optional[float] = None) -> bool:
    """
    Checks for counterfactual fairness using counterfactual fairness arrays.

    There are two different approaches to evaluate counterfactual fairness.
    The first one takes the ``distances`` to the counterfactual examples and
    checks whether any of them exceeds a certain ``threshold``, in which case
    a given instance is considered to be treated unfairly. Alternatively, by
    using the ``unfair_counterfactuals`` array, this function checks whether
    there are any unfair counterfactual instances. If all of the input
    parameters are given, **the distance-based approach takes precedence**.

    Parameters
    ----------
    unfair_counterfactuals : numpy.ndarray, optional (default=None)
        A 2-dimensional numpy array with counterfactual examples that expose
        unfairness of a prediction.
    distances : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with distances to the counterfactual
        examples.
    threshold : number, optional (default=None)
        A numerical threshold above which a counterfactual instance is too far,
        therefore it is considered to be an exemplar of individual unfairness.

    Raises
    ------
    IncorrectShapeError
        The ``unfair_counterfactuals`` parameter is not a 2-dimensional array.
        The ``distances`` parameter is not a 1-dimensional array.
    RuntimeError
        Neither of the required input parameters was given: either
        ``unfair_counterfactuals`` or both ``distances`` and ``threshold``.
    TypeError
        The ``threshold`` parameter is not a number.
    ValueError
        The ``distances`` array is not purely numerical.

    Returns
    -------
    counterfactually_unfair : boolean
        ``True`` if there are any counterfactually unfair instances, ``False``
        otherwise.
    """
    if distances is not None and threshold is not None:
        if not fuav.is_1d_array(distances):
            raise IncorrectShapeError('The distances parameter has to be a '
                                      '1-dimensional array.')
        if not fuav.is_numerical_array(distances):
            raise ValueError('The distances array has to be purely numerical.')
        if not isinstance(threshold, Number):
            raise TypeError('The threshold parameter has to be a number.')

        counterfactually_unfair = (distances > threshold).any()
    elif unfair_counterfactuals is not None:
        if not fuav.is_2d_array(unfair_counterfactuals):
            raise IncorrectShapeError('The unfair counterfactuals parameter '
                                      'has to be a 2-dimensional numpy array.')
        counterfactually_unfair = bool(unfair_counterfactuals.size)
    else:
        raise RuntimeError('Either of the two is required to run this '
                           'function: unfair_counterfactuals parameter or '
                           'both distances and threshold parameters.')

    return counterfactually_unfair
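
Both code paths can be exercised in a few lines (the import path is an
assumption; the behaviour follows directly from the function above):

import numpy as np

from fatf.fairness.predictions.measures import (  # assumed import path
    counterfactual_fairness_check)

# Distance-based check: any counterfactual further away than the
# threshold marks the prediction as counterfactually unfair.
distances = np.array([0.5, 2.0, 0.1])
assert counterfactual_fairness_check(distances=distances, threshold=1.0)

# Array-based check: a non-empty array of unfair counterfactual
# instances evaluates to True.
unfair = np.array([[0, 1], [1, 0]])
assert counterfactual_fairness_check(unfair_counterfactuals=unfair)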
Example n. 21
def systemic_bias(dataset: np.ndarray, ground_truth: np.ndarray,
                  protected_features: List[Index]) -> np.ndarray:
    """
    Checks for systemic bias in a dataset.

    This function checks whether there exist pairs of data points that share
    the same unprotected features but differ in the protected ones. If the
    labels (ground truth) of two such data points differ, the pair is flagged
    as biased. This relation is represented as a square, boolean numpy array
    indicating for every pair of data points whether systemic bias exists
    (``True``).

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be evaluated for systemic bias.
    ground_truth : numpy.ndarray
        The labels corresponding to the dataset.
    protected_features : List[column index]
        A list of column indices in the dataset that hold protected attributes.

    Raises
    ------
    IncorrectShapeError
        The dataset is not a 2-dimensional numpy array, the ground truth is not
        a 1-dimensional numpy array or the number of rows in the dataset is not
        equal to the number of elements in the ground truth array.
    IndexError
        Some of the column indices given in the ``protected_features`` list are
        not valid for the input dataset.
    TypeError
        The ``protected_features`` parameter is not a list.
    ValueError
        There are duplicate values in the protected feature indices list.

    Returns
    -------
    systemic_bias_matrix : numpy.ndarray
        A square, diagonally symmetric, boolean numpy array that indicates
        which pairs of data points share the same unprotected features but
        differ in the protected features and the ground truth annotation.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The dataset should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth should be a 1-dimensional '
                                  'numpy array.')
    if ground_truth.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of rows in the dataset and the '
                                  'ground truth should be equal.')
    if isinstance(protected_features, list):
        pfa = np.asarray(protected_features)
        if not fuat.are_indices_valid(dataset, pfa):
            iid = np.sort(fuat.get_invalid_indices(dataset, pfa)).tolist()
            raise IndexError('The following protected feature indices are not '
                             'valid for the dataset array: {}.'.format(iid))
        if len(set(protected_features)) != len(protected_features):
            raise ValueError('Some of the protected indices are duplicated.')
    else:
        raise TypeError('The protected_features parameter should be a list.')

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        unprotected_features_array = recfn.drop_fields(dataset,
                                                       protected_features)
        if unprotected_features_array is None:
            unprotected_features_array = np.ones((dataset.shape[0], ),
                                                 dtype=[('ones', int)])
    else:
        unprotected_features_array = np.delete(dataset,
                                               protected_features,
                                               axis=1)
        if not unprotected_features_array.size:
            unprotected_features_array = np.ones((dataset.shape[0], 1))

    assert unprotected_features_array.shape[0] == dataset.shape[0], \
        'Must share rows number.'

    systemic_bias_columns = []
    for i in range(unprotected_features_array.shape[0]):
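        # Rows of a structured array can be compared with ``==`` directly,
        # whereas rows of a classic array must be compared with
        # numpy.array_equal applied along the second axis.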
        if is_structured:
            equal_unprotected = (
                unprotected_features_array == unprotected_features_array[i])
        else:
            equal_unprotected = np.apply_along_axis(
                np.array_equal, 1, unprotected_features_array,
                unprotected_features_array[i, :])

        equal_unprotected_indices = np.where(equal_unprotected)

        # Check whether the ground truth is different for these rows
        equal_unprotected[equal_unprotected_indices] = (
            ground_truth[i] != ground_truth[equal_unprotected_indices])
        systemic_bias_columns.append(equal_unprotected)

    systemic_bias_matrix = np.stack(systemic_bias_columns, axis=1)
    assert np.array_equal(systemic_bias_matrix, systemic_bias_matrix.T), \
        'The matrix has to be diagonally symmetric.'
    assert not np.diagonal(systemic_bias_matrix).any(), \
        'Same elements cannot be systemically biased.'
    return systemic_bias_matrix
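
A worked micro-example (the import path is an assumption; the pair
semantics follow the function above):

import numpy as np

from fatf.fairness.data.measures import systemic_bias  # assumed path

# Rows 0 and 1 share the unprotected feature (column 0) but differ in
# the protected one (column 1) and in their labels.
dataset = np.array([[1, 0], [1, 1], [2, 0]])
ground_truth = np.array([0, 1, 0])

matrix = systemic_bias(dataset, ground_truth, protected_features=[1])
assert matrix[0, 1] and matrix[1, 0]  # the systemically biased pair
assert not np.diagonal(matrix).any()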
Example n. 22
    def __init__(self,
                 clf: sklearn.base.BaseEstimator,
                 feature_names: Optional[List[str]] = None,
                 class_names: Optional[List[str]] = None) -> None:
        """
        Initialises the ``SKLearnExplainer`` class.
        """
        # Validate the input
        assert _validate_input(clf, feature_names,
                               class_names), 'Invalid init parameters.'
        self.clf = clf
        self.feature_names = feature_names
        self.class_names = class_names

        # Check whether the model is of the right type and is fitted
        assert self._validate_kind_fitted(), 'Unfitted or wrong type model.'

        # Classifier or regressor
        self.is_classifier = self._is_classifier()
        assert isinstance(self.is_classifier, bool), 'Has to be boolean.'

        # The number of features (number of columns in a data array) expected
        # by the classifier
        self.features_number = self._get_features_number()
        if self.features_number is not None:
            assert isinstance(self.features_number, int), 'Wrong type.'

        # Get the list of classes that the predictive model can output
        self.classes_array = self._get_classes_array()
        if self.classes_array is not None:
            assert isinstance(self.classes_array, np.ndarray), 'Bad type.'
            assert fuav.is_1d_array(self.classes_array), 'Must be 1-D array.'
            assert (fuav.is_numerical_array(self.classes_array)
                    or fuav.is_textual_array(self.classes_array)), 'Bad type.'

        # A regressor must not have class names
        if not self.is_classifier:
            assert self.classes_array is None and self.class_names is None, \
                "Regressor's class_names and classes_array must both be None."

        # Validate feature names length
        if self.feature_names is None:
            if self.features_number is not None:
                logger.info('Generating missing feature names from the number '
                            'of features using "feature %d" pattern.')
                self.feature_names = [
                    'feature {}'.format(i) for i in range(self.features_number)
                ]
        else:
            if self.features_number is None:
                warnings.warn(
                    'Cannot validate the length of feature names list since '
                    'the _get_features_number method '
                    'returned None.', UserWarning)
            else:
                if len(self.feature_names) != self.features_number:
                    raise ValueError('The length of the feature_names list '
                                     'is different than the number of '
                                     'features extracted from the classifier.')

        # Validate class names length
        if self.class_names is None:
            if self.classes_array is not None:
                logger.info('Generating missing class names from the array of '
                            'classes output by the classifier using '
                            '"class %s" pattern.')
                self.class_names = [
                    'class {}'.format(i) for i in self.classes_array
                ]
        else:
            if self.classes_array is None:
                warnings.warn(
                    'Cannot validate the length of class names list since the '
                    '_get_classes_array method returned None.', UserWarning)
            else:
                if self.classes_array.shape[0] != len(self.class_names):
                    raise ValueError('The length of the class_names list is '
                                     'different than the length of the '
                                     'classes array extracted from the '
                                     'classifier.')
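
The initialiser above relies on four hooks that a concrete subclass
must provide; a minimal illustrative subclass might look as follows
(the hook names come from the code above, while the import path and
the scikit-learn specifics are assumptions):

import sklearn.linear_model

from fatf.transparency.sklearn.tools import (  # assumed import path
    SKLearnExplainer)


class LogisticRegressionExplainer(SKLearnExplainer):
    """An illustrative explainer for a fitted logistic regression."""

    def _validate_kind_fitted(self):
        # Accept only fitted LogisticRegression models.
        return (isinstance(self.clf,
                           sklearn.linear_model.LogisticRegression)
                and hasattr(self.clf, 'coef_'))

    def _is_classifier(self):
        return True

    def _get_features_number(self):
        return self.clf.coef_.shape[1]

    def _get_classes_array(self):
        return self.clf.classes_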
Example n. 23
def group_by_column(
    dataset: np.ndarray,
    column_index: Index,
    groupings: Optional[List[Union[float, Tuple[str]]]] = None,
    numerical_bins_number: int = 5,
    treat_as_categorical: Optional[bool] = None
) -> Tuple[List[List[int]], List[str]]:
    """
    Groups row indices of an array based on value grouping of a chosen column.

    If the selected column is numerical, by default the values are grouped
    into 5 bins equally distributed between the minimum and the maximum value
    of the column. The number of bins can be changed with the
    ``numerical_bins_number`` parameter if desired. Alternatively, the exact
    bin boundaries can be given via the ``groupings`` parameter.

    For categorical columns, the default binning is one bin for every unique
    value in the selected column. This behaviour can be changed by providing
    the ``groupings`` parameter, where multiple values can be selected to
    create one bin.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be used for grouping the row indices.
    column_index : Union[string, integer]
        A column index (a string for structured numpy arrays or an integer for
        unstructured arrays) of the column based on which the row indices will
        be partitioned.
    groupings : List[Union[number, Tuple[string]]], optional (default=None)
        A list of user-specified groupings for the selected column. The default
        grouping for categorical (textual) columns is splitting them by all the
        unique values therein. The numerical columns are, by default, binned
        into 5 bins (see the ``numerical_bins_number`` parameter) uniformly
        distributed between the minimum and the maximum value of the column.
        To introduce a custom binning for a categorical column, the
        ``groupings`` parameter should be a list of tuples, where every tuple
        represents a single group. For example, a column with the following
        unique values ``['a', 'b', 'c', 'd']`` can be split into two groups:
        ``['a', 'd']`` and ``['b', 'c']`` by providing the
        ``[('a', 'd'), ('b', 'c')]`` grouping. For numerical columns a custom
        grouping should be introduced as a list of bucket boundaries. Every
        bucket includes all the values that are **less than or equal** to the
        specified bucket boundary and greater than the previous boundary, if
        one is given.
    numerical_bins_number : integer, optional (default=5)
        The number of bins used for default binning of numerical columns.
    treat_as_categorical : boolean, optional (default=None)
        Whether the selected column should be treated as a categorical or
        numerical feature. If set to ``None``, the type of the column will be
        inferred from the data therein. If set to ``False``, the column will
        be treated as numerical unless it is string-based, in which case a
        warning will be emitted and the column will be treated as categorical
        despite this setting. Finally, if set to ``True``, the column will be
        treated as categorical.

    Warns
    -----
    UserWarning
        When grouping is done on a categorical column, a warning is emitted
        if some of the values in that column are not accounted for, i.e. they
        are not included in the ``groupings`` parameter. Also, if some of the
        rows are not included in any of the groupings, a warning is shown.
        Missing row indices may be a result of some values being not-a-number
        in a numerical column or of some unique values being absent from the
        grouping of a categorical column. Finally, a warning is emitted when
        the ``treat_as_categorical`` parameter is set to ``False`` but the
        selected feature is string-based (i.e. categorical), hence cannot be
        treated as numerical.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not 2-dimensional.
    IndexError
        The supplied ``column_index`` is not valid for the input ``dataset``.
    TypeError
        The column index is neither a string nor an integer. The numerical bins
        number is not an integer. The ``groupings`` parameter is neither a
        list nor ``None``. One of the grouping bin boundaries (for a numerical
        feature column) is not a number. One of the groupings (for a
        categorical feature column) is not a tuple. The
        ``treat_as_categorical`` parameter is neither a boolean nor ``None``.
    ValueError
        The input ``dataset`` is not of a base type. The numerical bins number
        is less than 2. The ``groupings`` list is empty. The numbers in the
        ``groupings`` parameter are not monotonically increasing (for a
        numerical column). There are duplicate values shared among tuples in
        the ``groupings`` parameter or one of the values does not appear in the
        selected column (for a categorical column).

    Returns
    -------
    indices_per_bin : List[List[integer]]
        A list of lists, each inner list holding the row indices of a
        particular group.
    bin_names : List[string]
        A list holding a description of each group.
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input array should be 2-dimensional.')

    if not fuav.is_base_array(dataset):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    # Check index validity
    if isinstance(column_index, (str, int)):
        if not fuat.are_indices_valid(dataset, np.array([column_index])):
            raise IndexError('*{}* is not a valid column index for the input '
                             'dataset.'.format(column_index))
    else:
        raise TypeError('The column index can either be a string or an '
                        'integer.')

    # Check the number of numerical bins
    if isinstance(numerical_bins_number, int):
        if numerical_bins_number < 2:
            raise ValueError('The numerical_bins_number needs to be at least '
                             '2.')
    else:
        raise TypeError('The numerical_bins_number parameter has to be an '
                        'integer.')

    # Check treat_as_categorical
    if treat_as_categorical is not None:
        if not isinstance(treat_as_categorical, bool):
            raise TypeError('The treat_as_categorical parameter has to be a '
                            'boolean.')

    if fuav.is_structured_array(dataset):
        column = dataset[column_index]
    else:
        column = dataset[:, column_index]
    assert fuav.is_1d_array(column), 'This must be a 1D numpy array.'

    # Get a list of all the row indices
    all_row_indices = set(range(column.shape[0]))

    indices_per_bin = []
    bin_names = []

    is_numerical_column = fuav.is_numerical_array(column)
    is_categorical_column = fuav.is_textual_array(column)
    assert is_numerical_column is not is_categorical_column, \
        'The column must be a base array.'

    # Sort out numerical/categorical column treatment
    if treat_as_categorical is None:
        go_numerical = is_numerical_column
    else:
        if treat_as_categorical:
            go_numerical = False
        else:  # Treat as numerical
            if is_numerical_column:
                go_numerical = True
            else:  # Is not numerical
                warnings.warn(
                    'Selected feature is categorical, therefore cannot be '
                    'treated as numerical. The feature will be treated as '
                    'categorical despite the treat_as_categorical parameter '
                    'set to False.', UserWarning)
                go_numerical = False

    if go_numerical:
        if groupings is None:
            # Get default bins
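            # numpy.linspace with endpoint=False splits the column range into
            # numerical_bins_number equal-width intervals; dropping the first
            # boundary ([1:]) keeps only the inner edges -- values below the
            # first edge and above the last are captured by the 'x <= edge'
            # and 'edge < x' bins constructed below.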
            bins = np.linspace(column.min(),
                               column.max(),
                               num=numerical_bins_number,
                               endpoint=False)[1:].tolist()
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A numerical grouping list has to contain at '
                                 'least one element.')

            # Every element in the groupings list must be a number
            for i, number in enumerate(groupings):
                if not isinstance(number, Number):
                    raise TypeError('For a numerical column all of the '
                                    'grouping items must be numbers. *{}* '
                                    'is not a number.'.format(number))
                if i != 0:
                    if number <= groupings[i - 1]:
                        raise ValueError('The numbers in the groupings list '
                                         'have to be monotonically '
                                         'increasing.')
            bins = groupings
        else:
            raise TypeError('Since a numerical column was chosen the grouping '
                            'must be a list of bin boundaries or None.')

        lower_edge = 'x <= {}'
        middle = '{} < x <= {}'
        upper_edge = '{} < x'

        indices_seen_so_far = set()  # type: Set[int]

        for i, edge in enumerate(bins):
            if i == 0:
                indices = np.where(column <= edge)[0].tolist()

                indices_per_bin.append(indices)
                bin_names.append(lower_edge.format(edge))
            else:
                edge_lower = bins[i - 1]

                indices_l = set(np.where(column <= edge)[0].tolist())
                indices_u = set(np.where(column > edge_lower)[0].tolist())
                indices = list(indices_l.intersection(indices_u))

                indices_per_bin.append(indices)
                bin_names.append(middle.format(edge_lower, edge))

            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

        assert bins, 'If bins is empty, i and edge will not be defined.'
        # pylint: disable=undefined-loop-variable
        indices = np.where(column > edge)[0].tolist()

        indices_per_bin.append(indices)
        bin_names.append(upper_edge.format(edge))

        assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
        indices_seen_so_far = indices_seen_so_far.union(indices)
    else:
        unique_elements = np.sort(np.unique(column)).tolist()

        if groupings is None:
            bins = [(i, ) for i in unique_elements]
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A categorical grouping list has to contain '
                                 'at least one element.')

            values_seen_so_far = set()  # type: Set[str]

            # Every element in the groupings list must be a valid tuple
            for value_tuple in groupings:
                if not isinstance(value_tuple, tuple):
                    raise TypeError('For a categorical column all of the '
                                    'grouping items must be tuples. *{}* '
                                    'is not a tuple.'.format(value_tuple))
                for value in value_tuple:
                    if value not in unique_elements:
                        raise ValueError('*{}* value is not present in the '
                                         'selected column.'.format(value))

                if values_seen_so_far.intersection(value_tuple):
                    raise ValueError('Some values are duplicated across '
                                     'tuples.')
                values_seen_so_far = values_seen_so_far.union(value_tuple)

            unaccounted_values = set(unique_elements).difference(
                values_seen_so_far)
            if unaccounted_values:
                warnings.warn(
                    'The following values in the selected column were not '
                    'accounted for in the grouping '
                    'tuples:\n{}.'.format(unaccounted_values), UserWarning)

            bins = [tuple(sorted(i)) for i in groupings]  # type: ignore
            bins = sorted(bins)
        else:
            raise TypeError('Since a categorical column was chosen the '
                            'grouping must be a list of tuples representing '
                            'categorical values grouping or None for the '
                            'default grouping.')

        indices_seen_so_far = set()

        for bin_values in bins:
            indices = set()
            for value in bin_values:
                vid = np.where(column == value)[0].tolist()
                indices = indices.union(vid)

            indices_per_bin.append(list(indices))
            bin_names.append('{}'.format(bin_values))

            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

    # Validate that all of the row indices were accounted for
    missed_indices = all_row_indices.difference(indices_seen_so_far)
    if missed_indices:
        warnings.warn(
            'The following row indices could not be accounted for:\n{}.\nFor '
            'a numerical column there may have been some numpy.nan therein. '
            'For a categorical column some of the column values were probably '
            'not specified in the grouping, in which case there should be a '
            'separate user warning.'.format(missed_indices), UserWarning)

    return indices_per_bin, bin_names
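
A usage sketch for the numerical path (the import path is an
assumption; the bin names follow the patterns used above):

import numpy as np

from fatf.utils.data.tools import group_by_column  # assumed import path

dataset = np.array([[0.1, 7.0], [0.4, 7.0], [0.5, 7.0], [0.9, 7.0]])

# Custom bin boundaries for column 0 give three bins:
# x <= 0.3, 0.3 < x <= 0.6 and 0.6 < x.
indices, names = group_by_column(dataset, 0, groupings=[0.3, 0.6])
assert indices == [[0], [1, 2], [3]]
assert names == ['x <= 0.3', '0.3 < x <= 0.6', '0.6 < x']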