Example 1
def hamming_array_distance(X: np.ndarray, Y: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Calculates the Hamming distance matrix between rows in ``X`` and ``Y``.

    Both ``X`` and ``Y`` have to be 2-dimensional textual numpy arrays of the
    same width.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array -- has to be 2-dimensional and purely textual.
    Y : numpy.ndarray
        A numpy array -- has to be 2-dimensional and purely textual.
    **kwargs : boolean
        Keyword arguments that are passed to the
        :func:`fatf.utils.distances.hamming_distance_base` function responsible
        for calculating the Hamming distance.

    Raises
    ------
    IncorrectShapeError
        Either ``X`` or ``Y`` is not 2-dimensional or ``X`` and ``Y`` do not
        have the same number of columns.
    ValueError
        Either of the input arrays is not purely textual.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Hamming distances between rows in ``X`` and ``Y``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')

    if not fuav.is_textual_array(X):
        raise ValueError('The X array should be textual.')
    if not fuav.is_textual_array(Y):
        raise ValueError('The Y array should be textual.')

    # Transform the arrays to unstructured
    X_array = fuat.as_unstructured(X)  # pylint: disable=invalid-name
    Y_array = fuat.as_unstructured(Y)  # pylint: disable=invalid-name

    # Compare shapes
    if X_array.shape[1] != Y_array.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should be the same as the number of '
                                  'columns in the Y array.')

    distance_matrix = np.apply_along_axis(hamming_point_distance, 1, X_array,
                                          Y_array, **kwargs)
    return distance_matrix
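For intuition, the distance matrix this function produces can be sketched with plain numpy broadcasting (an illustrative snippet, not part of fatf; the exact values returned by fatf may additionally depend on the keyword arguments forwarded to its base distance function):

import numpy as np

X = np.array([['a', 'b'], ['a', 'c']])
Y = np.array([['a', 'b'], ['x', 'c']])

# Compare every row of X against every row of Y and count differing
# positions -- a (rows of X) x (rows of Y) Hamming distance matrix.
hamming = (X[:, None, :] != Y[None, :, :]).sum(axis=-1)
print(hamming)  # [[0 2]
                #  [1 1]]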
Example 2
def binary_array_distance(X: np.ndarray, Y: np.ndarray,
                          **kwargs: bool) -> np.ndarray:
    """
    Calculates the binary distance matrix between rows in ``X`` and ``Y``.

    Both ``X`` and ``Y`` have to be 2-dimensional numpy arrays of the same
    width.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array -- has to be 2-dimensional.
    Y : numpy.ndarray
        A numpy array -- has to be 2-dimensional.
    **kwargs : boolean
        Keyword arguments that are passed to the
        :func:`fatf.utils.distances.binary_distance` function responsible for
        calculating the binary distance.

    Raises
    ------
    IncorrectShapeError
        Either ``X`` or ``Y`` is not 2-dimensional or ``X`` and ``Y`` do not
        have the same number of columns.
    ValueError
        Either of the input arrays is not of a base dtype. (See the
        :func:`fatf.utils.array.validation.is_base_array` function description
        for an explanation of a base dtype.)

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of binary distances between rows in ``X`` and ``Y``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')

    # Transform the arrays to unstructured
    X_array = fuat.as_unstructured(X)
    Y_array = fuat.as_unstructured(Y)

    # Compare shapes
    if X_array.shape[1] != Y_array.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should be the same as the number of '
                                  'columns in the Y array.')

    distance_matrix = np.apply_along_axis(binary_point_distance, 1, X_array,
                                          Y_array, **kwargs)
    return distance_matrix
Example 3
def euclidean_array_distance(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    Calculates the Euclidean distance matrix between rows in ``X`` and ``Y``.

    Both ``X`` and ``Y`` have to be 2-dimensional numerical numpy arrays of the
    same width.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array -- has to be 2-dimensional and purely numerical.
    Y : numpy.ndarray
        A numpy array -- has to be 2-dimensional and purely numerical.

    Raises
    ------
    IncorrectShapeError
        Either ``X`` or ``Y`` is not 2-dimensional or ``X`` and ``Y`` do not
        have the same number of columns.
    ValueError
        Either of the input arrays is not purely numerical.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Euclidean distances between rows in ``X`` and ``Y``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')

    if not fuav.is_numerical_array(X):
        raise ValueError('The X array should be purely numerical.')
    if not fuav.is_numerical_array(Y):
        raise ValueError('The Y array should be purely numerical.')

    # Transform the arrays to unstructured
    Y_array = fuat.as_unstructured(Y)  # pylint: disable=invalid-name
    X_array = fuat.as_unstructured(X)  # pylint: disable=invalid-name

    # Compare shapes
    if Y_array.shape[1] != X_array.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should be the same as the number of '
                                  'columns in the Y array.')

    distance_matrix = np.apply_along_axis(euclidean_point_distance, 1, X_array,
                                          Y_array)

    return distance_matrix
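The same matrix can be reproduced with a one-line numpy broadcast (an illustrative sketch, not fatf code):

import numpy as np

X = np.array([[0., 0.], [3., 4.]])
Y = np.array([[0., 0.], [0., 1.]])

# Row-wise differences are broadcast into a (rows of X, rows of Y,
# features) cube; the l-2 norm then collapses the feature axis.
distance_matrix = np.linalg.norm(X[:, None, :] - Y[None, :, :], axis=-1)
print(distance_matrix)  # [[0.         1.        ]
                        #  [5.         4.24264069]]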
Example 4
def hamming_point_distance(y: Union[np.ndarray, np.void], X: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Calculates the Hamming distance between ``y`` and every row of ``X``.

    ``y`` has to be a 1-dimensional textual numpy array or a row of a
    structured numpy array (i.e. numpy's void) and ``X`` has to be a
    2-dimensional textual numpy array. The length of ``y`` has to be the same
    as the width of ``X``.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        A numpy array (has to be 1-dimensional and purely textual) used to
        calculate the distances from.
    X : numpy.ndarray
        A numpy array (has to be 2-dimensional and purely textual) to which
        rows the distances are calculated.
    **kwargs : boolean
        Keyword arguments that are passed to the
        :func:`fatf.utils.distances.hamming_distance_base` function responsible
        for calculating the Hamming distance.

    Raises
    ------
    IncorrectShapeError
        Either ``y`` is not 1-dimensional or ``X`` is not 2-dimensional or the
        length of ``y`` is not equal to the number of columns in ``X``.
    ValueError
        Either of the input arrays is not purely textual.

    Returns
    -------
    distances : numpy.ndarray
        An array of Hamming distances between ``y`` and every row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # Transform the arrays to unstructured
    y_array = fuat.as_unstructured(y)
    X_array = fuat.as_unstructured(X)  # pylint: disable=invalid-name

    if not fuav.is_textual_array(y_array):
        raise ValueError('The y array should be textual.')
    if not fuav.is_textual_array(X_array):
        raise ValueError('The X array should be textual.')

    # Compare shapes
    if y_array.shape[0] != X_array.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should be the same as the number of '
                                  'elements in the y array.')

    distances = np.apply_along_axis(hamming_distance, 1, X_array, y_array,
                                    **kwargs)
    return distances
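The point-to-rows variant reduces to a single broadcast comparison (an illustrative sketch, not fatf code):

import numpy as np

y = np.array(['a', 'b'])
X = np.array([['a', 'b'], ['a', 'c'], ['x', 'y']])

# y is broadcast against every row of X; summing the mismatches yields one
# Hamming distance per row.
distances = (X != y).sum(axis=1)
print(distances)  # [0 1 2]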
Example 5
def _validate_data_header(X: np.ndarray, y: np.ndarray, n_samples: int,
                          n_features: int, y_names: np.ndarray) -> bool:
    """
    Checks if the read-in data is consistent with the csv header.

    For details on valid header formatting see the
    :func:`fatf.utils.datasets.load_data` documentation.

    Parameters
    ----------
    X : numpy.ndarray
        Array read in from ``numpy.genfromtxt``.
    y : numpy.ndarray
        Target variable indicating which class each sample in ``X`` belongs to.
    n_samples : integer
        Number of samples expected in ``X`` and ``y``.
    n_features : integer
        Number of features expected in ``X``.
    y_names : numpy.ndarray
        Unique class names of the target variable ``y``.

    Raises
    ------
    ValueError
        The number of samples in ``X`` and ``y`` or the number of features in
        the dataset ``X`` is not consistent with the header. Also, raised when
        the number of unique classes in ``y`` is not consistent with the
        header.

    Returns
    -------
    is_consistent : boolean
        True if the header is consistent with the data, False otherwise.
    """
    # pylint: disable=invalid-name
    assert fuav.is_2d_array(X), 'X has to be a 2-dimensional array.'
    assert fuav.is_1d_array(y), 'y has to be a 1-dimensional array.'
    assert fuav.is_1d_array(y_names), 'y_names must be a 1-dimensional array.'

    is_consistent = False
    if X.shape[0] != n_samples:
        raise ValueError('The number of samples in the dataset is not '
                         'consistent with the header.')
    # Use len(X[0]) in case X is a structured array.
    if len(X[0]) != n_features:
        raise ValueError('The number of features in the dataset is not '
                         'consistent with the header.')
    if y.shape[0] != n_samples:
        raise ValueError('The number of labels (target variables) is not '
                         'consistent with the header.')
    if y_names.shape[0]:
        if y_names.shape[0] != np.unique(y).shape[0]:
            raise ValueError('The number of classes is not consistent with '
                             'the header.')

    is_consistent = True
    return is_consistent
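As a rough illustration of the convention this validator enforces (an assumed example -- the authoritative format is described in the :func:`fatf.utils.datasets.load_data` documentation), the csv header declares the number of samples, the number of features and the unique class names:

example_csv = ('3,2,yes,no\n'   # header: n_samples, n_features, class names
               '25,1,yes\n'     # 2 feature columns followed by the label
               '55,0,no\n'
               '41,1,yes\n')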
Example 6
def structured_to_unstructured(
        structured_array: np.ndarray,
        **kwargs: Optional[np.dtype]) -> np.ndarray:  # pragma: no cover
    """
    Calls either the local or numpy's ``structured_to_unstructured`` function.

    numpy 1.16.0 introduced the
    :func:`numpy.lib.recfunctions.structured_to_unstructured` function. To
    ensure backwards compatibility down to numpy 1.9.0 this package implements
    its own version of this function
    (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`).
    This function calls the latter if a numpy version below 1.16.0 is
    installed. However, if numpy 1.16.0 or above is detected, numpy's
    implementation is used instead.

    For the description of ``structured_to_unstructured`` functionality either
    refer to the corresponding numpy
    (:func:`numpy.lib.recfunctions.structured_to_unstructured`) or local
    (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`)
    documentation.

    .. warning:: Since this function either calls a local implementation or a
       builtin numpy function there may be some inconsistencies in its
       behaviour. One that we are aware of is conversion of arrays that contain
       ``'V'`` -- raw data (void), ``'O'`` -- (Python) objects, ``'M'`` --
       datetime or ``'m'`` -- timedelta dtypes. These types are not supported
       by the local implementation, however some of them are supported by the
       numpy built-in, e.g. the ``'V'`` type.

    Parameters
    ----------
    structured_array : numpy.ndarray
        A structured numpy array to be converted into a plain numpy array.
    **kwargs : Optional[numpy.dtype]
        Named parameters that are passed to the appropriate structured to
        unstructured array converter. These parameters are ignored when calling
        the local implementation
        (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`).

    Returns
    -------
    classic_array : numpy.ndarray
        A classic numpy array representation of the ``structured_array`` with
        the most generic type out of the input array's dtypes.
    """
    # pylint: disable=no-member
    if _LOCAL_STRUCTURED_TO_UNSTRUCTURED:
        classic_array = fatf_structured_to_unstructured(structured_array)
    else:
        classic_array = recfn.structured_to_unstructured(
            structured_array, **kwargs)
        if (fuav.is_2d_array(structured_array)
                and fuav.is_1d_array(classic_array)):
            classic_array = classic_array.reshape(
                (structured_array.shape[0], 1))
    return classic_array
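A quick sketch of what the conversion does, using numpy's built-in implementation (available from numpy 1.16.0 onwards):

import numpy as np
from numpy.lib import recfunctions as recfn

structured = np.array([(1, 2.5), (3, 4.0)],
                      dtype=[('a', int), ('b', float)])

# Named fields are collapsed into plain columns; the result takes the most
# generic dtype of the input fields (float64 here).
plain = recfn.structured_to_unstructured(structured)
print(plain)  # [[1.  2.5]
              #  [3.  4. ]]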
Example 7
def euclidean_point_distance(y: Union[np.ndarray, np.void],
                             X: np.ndarray) -> np.ndarray:
    """
    Calculates the Euclidean distance between ``y`` and every row of ``X``.

    ``y`` has to be a 1-dimensional numerical numpy array or a row of a
    structured numpy array (i.e. numpy's void) and ``X`` has to be a
    2-dimensional numerical numpy array. The length of ``y`` has to be the same
    as the width of ``X``.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        A numpy array (has to be 1-dimensional and purely numerical) used to
        calculate distances from.
    X : numpy.ndarray
        A numpy array (has to be 2-dimensional and purely numerical) to which
        rows distances are calculated.

    Raises
    ------
    IncorrectShapeError
        Either ``y`` is not 1-dimensional or ``X`` is not 2-dimensional or the
        length of ``y`` is not equal to the number of columns in ``X``.
    ValueError
        Either of the input arrays is not purely numerical.

    Returns
    -------
    distances : numpy.ndarray
        An array of Euclidean distances between ``y`` and every row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # Transform the arrays to unstructured
    y_array = fuat.as_unstructured(y)
    X_array = fuat.as_unstructured(X)  # pylint: disable=invalid-name

    if not fuav.is_numerical_array(y_array):
        raise ValueError('The y array should be purely numerical.')
    if not fuav.is_numerical_array(X_array):
        raise ValueError('The X array should be purely numerical.')

    # Compare shapes
    if y_array.shape[0] != X_array.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should be the same as the number of '
                                  'elements in the y array.')

    distances = np.apply_along_axis(euclidean_distance, 1, X_array, y_array)
    return distances
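In plain numpy the same point-to-rows computation is a single broadcast subtraction followed by a row-wise norm (an illustrative sketch, not fatf code):

import numpy as np

y = np.array([0., 0.])
X = np.array([[3., 4.], [0., 1.]])

# Subtracting y broadcasts over the rows of X.
distances = np.linalg.norm(X - y, axis=1)
print(distances)  # [5. 1.]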
Example 8
def _validate_get_distance(
        data_array: np.ndarray,
        distance_function: Callable[[np.ndarray, np.ndarray], float]) -> bool:
    """
    Validates ``data_array`` and ``distance_function`` parameters.

    Parameters
    ----------
    data_array : numpy.ndarray
        A 2-dimensional numpy array.
    distance_function : Callable[[numpy.ndarray, numpy.ndarray], number]
        A Python function that takes as an input two 1-dimensional numpy arrays
        of equal length and outputs a number representing a distance between
        them.

    Raises
    ------
    AttributeError
        The distance function does not require exactly two parameters.
    IncorrectShapeError
        The data array is not a 2-dimensional numpy array.
    TypeError
        The data array is not of a base type (numbers and/or strings). The
        distance function is not a Python callable (function).

    Returns
    -------
    is_valid : boolean
        ``True`` if the parameters are valid, ``False`` otherwise.
    """
    is_valid = False

    if not fuav.is_2d_array(data_array):
        raise IncorrectShapeError('The data_array has to be a 2-dimensional '
                                  '(structured or unstructured) numpy array.')
    if not fuav.is_base_array(data_array):
        raise TypeError('The data_array has to be of a base type (strings '
                        'and/or numbers).')

    if callable(distance_function):
        required_param_n = 0
        params = inspect.signature(distance_function).parameters
        for param in params:
            if params[param].default is params[param].empty:
                required_param_n += 1
        if required_param_n != 2:
            raise AttributeError('The distance function must require exactly '
                                 '2 parameters. Given function requires {} '
                                 'parameters.'.format(required_param_n))
    else:
        raise TypeError('The distance function should be a Python callable '
                        '(function).')

    is_valid = True
    return is_valid
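The required-parameter count comes from inspecting the callable's signature; the same check can be reproduced in isolation (an illustrative sketch with a made-up distance function):

import inspect

def my_distance(a, b, normalise=False):
    return 0.0

params = inspect.signature(my_distance).parameters
required = sum(1 for p in params.values() if p.default is p.empty)
print(required)  # 2 -- 'normalise' has a default, so it is not required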
Example 9
def validate_binary_matrix(binary_array: np.ndarray,
                           name: Optional[str] = None) -> bool:
    """
    Validates a binary, square and symmetric numpy array.

    Parameters
    ----------
    binary_array : numpy.ndarray
        A square (equal number of rows and columns), boolean and diagonally
        symmetric numpy array.
    name : string, optional (default=None)
        The name of the matrix, used to customise the exception messages.

    Raises
    ------
    IncorrectShapeError
        The matrix is not 2-dimensional or square.
    TypeError
        The matrix is not of boolean type.
    ValueError
        The matrix is a structured numpy array or is not diagonally symmetric.

    Returns
    -------
    is_valid : boolean
        ``True`` if the matrix is valid, ``False`` otherwise.
    """
    if name is None:
        name = ''
    else:
        assert isinstance(name, str), 'The name parameter has to be string.'
        name = name.strip()
        name = '{} '.format(name) if name else name
    is_valid = False

    if not fuav.is_2d_array(binary_array):
        raise IncorrectShapeError('The {}matrix has to be '
                                  '2-dimensional.'.format(name))
    if fuav.is_structured_array(binary_array):
        raise ValueError('The {}matrix cannot be a structured numpy '
                         'array.'.format(name))
    if binary_array.dtype != bool:
        raise TypeError('The {}matrix has to be of boolean '
                        'type.'.format(name))
    if binary_array.shape[0] != binary_array.shape[1]:
        raise IncorrectShapeError('The {}matrix has to be '
                                  'square.'.format(name))
    if (not np.array_equal(binary_array, binary_array.T)
            or np.diagonal(binary_array).any()):
        raise ValueError('The {}matrix has to be diagonally '
                         'symmetric.'.format(name))

    is_valid = True
    return is_valid
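An example of a matrix that passes every one of these checks (an illustrative sketch):

import numpy as np

adjacency = np.array([[False, True],
                      [True, False]])

# Square, boolean, unstructured, diagonally symmetric and with an
# all-False diagonal.
assert adjacency.dtype == bool
assert np.array_equal(adjacency, adjacency.T)
assert not np.diagonal(adjacency).any()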
Example 10
def _input_is_valid(dataset: np.ndarray,
                    model: object,
                    feature_index: Union[int, str],
                    treat_as_categorical: Optional[bool],
                    steps_number: Optional[int]) -> bool:  # yapf: disable
    """
    Validates input parameters of Individual Conditional Expectation function.

    For the input parameter description, warnings and exceptions please see the
    documentation of the :func:`fatf.transparency.model.feature_influence.
    individual_conditional_expectation` function.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    is_input_ok = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a 2-dimensional '
                                  'array.')

    if not fuav.is_base_array(dataset):
        raise ValueError('The input dataset must only contain base types '
                         '(textual and numerical).')

    if not fumv.check_model_functionality(model, require_probabilities=True):
        raise IncompatibleModelError('This functionality requires the model '
                                     'to be capable of outputting '
                                     'probabilities via predict_proba method.')

    if not fuat.are_indices_valid(dataset, np.array([feature_index])):
        raise IndexError('Provided feature index is not valid for the input '
                         'dataset.')

    if isinstance(steps_number, int):
        if steps_number < 2:
            raise ValueError('steps_number has to be at least 2.')
    elif steps_number is None:
        pass
    else:
        raise TypeError('steps_number parameter has to either be None or an '
                        'integer.')

    if (not isinstance(treat_as_categorical, bool)
            and treat_as_categorical is not None):
        raise TypeError('treat_as_categorical has to either be None or a '
                        'boolean.')

    is_input_ok = True
    return is_input_ok
Example 11
def _validate_input_drm(dataset: np.ndarray, data_row: Union[np.ndarray,
                                                             np.void]) -> bool:
    """
    Validates :func:`fatf.utils.data.transformation.dataset_row_masking` input.

    This function checks if ``dataset`` is a 2-dimensional array and if
    ``data_row`` is a 1-dimensional array of the same length as the number of
    columns in the ``dataset``. It also checks if they have valid and
    compatible dtypes.

    For the description of input parameters, and warnings and exceptions raised
    by this function, please see the documentation of the
    :func:`fatf.utils.data.transformation.dataset_row_masking` function.

    Returns
    -------
    is_valid : boolean
        ``True`` if input is valid, ``False`` otherwise.
    """
    is_valid = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- text, '
                        'numbers or mixture of the two.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data row must either be a '
                                  '1-dimensional numpy array or a numpy void '
                                  'object for structured rows.')

    # For structured arrays the dtype check also checks the number of columns
    are_similar = fuav.are_similar_dtype_arrays(dataset,
                                                np.array([data_row]),
                                                strict_comparison=False)
    if not are_similar:
        raise TypeError('The dtype of the data row is too different from the '
                        'dtype of the dataset provided.')

    # Since the dtypes agree, both the row and the dataset have to be either
    # structured or plain
    if not fuav.is_structured_array(dataset):
        if dataset.shape[1] != data_row.shape[0]:
            raise IncorrectShapeError('The data row must contain the same '
                                      'number of elements as the number of '
                                      'columns in the provided dataset.')

    is_valid = True
    return is_valid
Example 12
def _validate_input(dataset: np.ndarray, explain_instance: Callable,
                    sample_size: int, explanations_number: int) -> bool:
    """
    Validates input for submodular pick.

    For the input parameters description, warnings and exceptions please see
    the documentation of the :func:`fatf.transparency.models.submodular_pick`
    function.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    is_valid = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError(
            'The input data set must be a 2-dimensional array.')
    if not fuav.is_base_array(dataset):
        raise ValueError('The input data set must only contain base types '
                         '(strings and numbers).')

    if not isinstance(sample_size, int):
        raise TypeError('sample_size must be an integer.')
    if sample_size < 0:
        raise ValueError('sample_size must be a non-negative integer.')

    if not (explanations_number is None
            or isinstance(explanations_number, int)):
        raise TypeError('explanations_number must be either None or an '
                        'integer.')
    if explanations_number is not None and explanations_number < 0:
        raise ValueError('explanations_number must be a non-negative integer.')

    if (sample_size and explanations_number
            and sample_size < explanations_number):
        raise ValueError('The number of explanations cannot be larger than '
                         'the number of samples.')

    if callable(explain_instance):
        params_n = fuv.get_required_parameters_number(explain_instance)
        if params_n != 1:
            raise RuntimeError('The explain_instance callable must accept '
                               'exactly one required parameter.')
    else:
        raise TypeError('The explain_instance should be a Python callable '
                        '(function or method).')

    is_valid = True
    return is_valid
Example 13
def get_invalid_indices(array: np.ndarray, indices: np.ndarray) -> np.ndarray:
    """
    Returns a numpy array with column indices that the input array is missing.

    Parameters
    ----------
    array : numpy.ndarray
        A 2-dimensional array to be checked.
    indices : numpy.ndarray
        A 1-dimensional array of indices corresponding to columns in the input
        array.

    Raises
    ------
    TypeError
        Either of the input arrays is not a numpy array-like object.
    IncorrectShapeError
        The input array is not 2-dimensional or the indices array is not
        1-dimensional.

    Returns
    -------
    invalid_indices : numpy.ndarray
        A **sorted** array of indices that were not found in the input array.
    """
    if not (isinstance(array, np.ndarray) and isinstance(indices, np.ndarray)):
        raise TypeError('Input arrays should be numpy array-like objects.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_1d_array(indices):
        raise IncorrectShapeError('The indices array should be 1-dimensional.')

    if fuav.is_structured_array(array):
        array_indices = set(array.dtype.names)
    else:
        array_indices = set(range(array.shape[1]))

    # Alternatively use numpy's np.isin (which supersedes np.in1d):
    # invalid_indices = indices[np.isin(indices, array_indices, invert=True)]
    # or np.setdiff1d: invalid_indices = np.setdiff1d(indices, array_indices)
    invalid_indices = set(indices.tolist()) - array_indices
    return np.sort(list(invalid_indices))
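For a plain 2-dimensional array the ``np.setdiff1d`` alternative mentioned in the comment behaves identically (an illustrative sketch):

import numpy as np

array_indices = np.arange(3)      # valid columns of a 3-column array
indices = np.array([0, 4])

invalid = np.setdiff1d(indices, array_indices)
print(invalid)  # [4] -- np.setdiff1d returns sorted unique values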
Example 14
    def _randomise_patch(self, mask: np.ndarray) -> np.ndarray:
        """
        Generates a random colour for each segment selected to be occluded
        by the ``mask``.

        Parameters
        ----------
        mask : numpy.ndarray
            A boolean numpy array of the same shape as ``segments``, indicating
            the pixels (``True``) for which a random colour patch should be
            generated.

        Returns
        -------
        randomise_patch : numpy.ndarray
            A numpy array of (number of pixels to be occluded X number of
            colour channels) dimensions holding random colour patches for the
            segments selected to be occluded.
        """
        assert fuav.is_2d_array(mask), 'Mask must be a 2-D numpy array.'
        assert mask.shape == self.segments.shape, 'Mask must be segments-like.'
        assert mask.dtype.kind == 'b', 'Mask must be binary.'

        randomise_patch = self.image.copy()
        unique_segments = np.unique(self.segments[mask])
        for id_ in unique_segments:
            segment_mask = (self.segments == id_)
            if self.is_rgb:
                segment_colour = (
                    random.randint(0, 255),
                    random.randint(0, 255),
                    random.randint(0, 255)
                )  # type: SegmentColour # yapf: disable
            else:
                if self.is_bnw:
                    segment_colour = random.choice([0, 255])
                else:
                    segment_colour = random.randint(0, 255)
            randomise_patch[segment_mask] = segment_colour
        randomise_patch = randomise_patch[mask]
        return randomise_patch
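The boolean-mask mechanics used above, shown in isolation (an illustrative sketch):

import numpy as np

image = np.zeros((2, 2, 3), dtype=np.uint8)
mask = np.array([[True, False],
                 [False, True]])

# Assigning a colour tuple through a boolean mask broadcasts it over every
# selected pixel; indexing with the same mask then flattens the selection
# into a (selected pixels, channels) array.
image[mask] = (255, 0, 0)
patch = image[mask]
print(patch.shape)  # (2, 3)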
Example 15
def are_indices_valid(array: np.array, indices: np.array) -> bool:
    """
    Checks whether all the input ``indices`` are valid for the input ``array``.

    Parameters
    ----------
    array : numpy.array
        The 2-dimensional array to be checked.
    indices : numpy.array
        1-dimensional array of column indices.

    Raises
    ------
    TypeError
        Either of the input arrays is not a numpy array-like object.
    IncorrectShapeError
        The input array is not 2-dimensional or the indices array is not
        1-dimensional.

    Returns
    -------
    is_valid : boolean
        A Boolean variable that indicates whether the input column indices are
        valid indices for the input array.
    """
    if not (isinstance(array, np.ndarray) and isinstance(indices, np.ndarray)):
        raise TypeError('Input arrays should be numpy array-like objects.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_1d_array(indices):
        raise IncorrectShapeError('The indices array should be 1-dimensional.')

    invalid_indices = get_invalid_indices(array, indices)
    assert fuav.is_1d_array(invalid_indices), 'This should be a 1-d array.'

    is_valid = not bool(invalid_indices.shape[0])
    return is_valid
Example 16
def _validate_input(dataset: np.ndarray,
                    ground_truth: Optional[np.ndarray] = None,
                    categorical_indices: Optional[List[Index]] = None,
                    int_to_float: bool = True) -> bool:
    """
    Validates the input parameters of an arbitrary augmentation class.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be used for sampling.
    ground_truth : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with labels for the supplied dataset.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    int_to_float : boolean, optional (default=True)
        If ``True``, all of the integer dtype columns in the ``dataset`` will
        be generalised to ``numpy.float64`` type. Otherwise, integer type
        columns will remain integer and floating point type columns will remain
        floating point.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The
        ``ground_truth`` array is not a 1-dimensional numpy array. The number
        of ground truth annotations is different from the number of rows in
        the data array.
    IndexError
        Some of the column indices given in the ``categorical_indices``
        parameter are not valid for the input ``dataset``.
    TypeError
        The ``categorical_indices`` parameter is neither a list nor ``None``.
        The ``dataset`` or the ``ground_truth`` array (if not ``None``) are not
        of base (numerical and/or string) type. The ``int_to_float`` parameter
        is not a boolean.

    Returns
    -------
    is_valid : boolean
        ``True`` if input is valid, ``False`` otherwise.
    """
    is_valid = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    if ground_truth is not None:
        if not fuav.is_1d_array(ground_truth):
            raise IncorrectShapeError('The ground_truth array must be '
                                      '1-dimensional. (Or None if it is not '
                                      'required.)')
        if not fuav.is_base_array(ground_truth):
            raise TypeError('The ground_truth array must be of a base type.')
        if ground_truth.shape[0] != dataset.shape[0]:
            raise IncorrectShapeError('The number of labels in the '
                                      'ground_truth array is not equal to the '
                                      'number of data points in the dataset '
                                      'array.')

    if categorical_indices is not None:
        if isinstance(categorical_indices, list):
            invalid_indices = fuat.get_invalid_indices(
                dataset, np.asarray(categorical_indices))
            if invalid_indices.size:
                raise IndexError('The following indices are invalid for the '
                                 'input dataset: {}.'.format(invalid_indices))
        else:
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')

    if not isinstance(int_to_float, bool):
        raise TypeError('The int_to_float parameter has to be a boolean.')

    is_valid = True
    return is_valid
Example 17
def systemic_bias(dataset: np.ndarray, ground_truth: np.ndarray,
                  protected_features: List[Index]) -> np.ndarray:
    """
    Checks for systemic bias in a dataset.

    This function checks whether there exist data points that share the same
    unprotected features but differ in the protected features. For every such
    pair the labels (ground truth) are compared and, if they differ, the pair
    is flagged as biased. This dependency is represented as a boolean, square
    numpy array that shows whether systemic bias exists (``True``) for any
    pair of data points.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be evaluated for systemic bias.
    ground_truth : numpy.ndarray
        The labels corresponding to the dataset.
    protected_features : List[column index]
        A list of column indices in the dataset that hold protected attributes.

    Raises
    ------
    IncorrectShapeError
        The dataset is not a 2-dimensional numpy array, the ground truth is not
        a 1-dimensional numpy array or the number of rows in the dataset is not
        equal to the number of elements in the ground truth array.
    IndexError
        Some of the column indices given in the ``protected_features`` list are
        not valid for the input dataset.
    TypeError
        The ``protected_features`` parameter is not a list.
    ValueError
        There are duplicate values in the protected feature indices list.

    Returns
    -------
    systemic_bias_matrix : numpy.ndarray
        A square, diagonally symmetric and boolean numpy array that indicates
        which pairs of data points share the same unprotected features but
        differ in the protected features and the ground truth annotation.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The dataset should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth should be a 1-dimensional '
                                  'numpy array.')
    if ground_truth.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of rows in the dataset and the '
                                  'ground truth should be equal.')
    if isinstance(protected_features, list):
        pfa = np.asarray(protected_features)
        if not fuat.are_indices_valid(dataset, pfa):
            iid = np.sort(fuat.get_invalid_indices(dataset, pfa)).tolist()
            raise IndexError('The following protected feature indices are not '
                             'valid for the dataset array: {}.'.format(iid))
        if len(set(protected_features)) != len(protected_features):
            raise ValueError('Some of the protected indices are duplicated.')
    else:
        raise TypeError('The protected_features parameter should be a list.')

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        unprotected_features_array = recfn.drop_fields(dataset,
                                                       protected_features)
        if unprotected_features_array is None:
            unprotected_features_array = np.ones((dataset.shape[0], ),
                                                 dtype=[('ones', int)])
    else:
        unprotected_features_array = np.delete(dataset,
                                               protected_features,
                                               axis=1)
        if not unprotected_features_array.size:
            unprotected_features_array = np.ones((dataset.shape[0], 1))

    assert unprotected_features_array.shape[0] == dataset.shape[0], \
        'Must share rows number.'

    systemic_bias_columns = []
    for i in range(unprotected_features_array.shape[0]):
        if is_structured:
            equal_unprotected = (
                unprotected_features_array == unprotected_features_array[i])
        else:
            equal_unprotected = np.apply_along_axis(
                np.array_equal, 1, unprotected_features_array,
                unprotected_features_array[i, :])

        equal_unprotected_indices = np.where(equal_unprotected)

        # Check whether the ground truth is different for these rows
        equal_unprotected[equal_unprotected_indices] = (
            ground_truth[i] != ground_truth[equal_unprotected_indices])
        systemic_bias_columns.append(equal_unprotected)

    systemic_bias_matrix = np.stack(systemic_bias_columns, axis=1)
    assert np.array_equal(systemic_bias_matrix, systemic_bias_matrix.T), \
        'The matrix has to be diagonally symmetric.'
    assert not np.diagonal(systemic_bias_matrix).any(), \
        'Same elements cannot be systemically biased.'
    return systemic_bias_matrix
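The pairwise check at the heart of this function can be sketched in plain numpy for a single unprotected feature (an illustrative simplification, not the fatf implementation):

import numpy as np

# Feature 0 is unprotected; feature 1 is protected.
dataset = np.array([[30, 0],
                    [30, 1],
                    [45, 0]])
ground_truth = np.array([0, 1, 0])

# Pairs that share the unprotected feature but carry different labels are
# flagged as systemically biased.
same_unprotected = dataset[:, 0][:, None] == dataset[:, 0][None, :]
different_label = ground_truth[:, None] != ground_truth[None, :]
print(same_unprotected & different_label)
# [[False  True False]
#  [ True False False]
#  [False False False]]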
Example 18
def batch_data(data: np.ndarray,
               batch_size: int = 50,
               transformation_fn: Callable = None) -> np.ndarray:
    """
    Slices ``data`` into batches and yields them sequentially.

    .. versionadded:: 0.1.1

    Since some data may be too large to fit into memory as whole,
    this function slices them into batches and yields them sequentially.
    If desired, each batch can be processed by ``transformation_fn``
    prior to returning it.

    Parameters
    ----------
    data : numpy.ndarray
        A two dimensional numpy array (either classic or structured) to be
        sliced into batches.
    batch_size : integer, optional (default=50)
        The size (number of rows) of each batch.
    transformation_fn : callable, optional (default=None)
        A callable object to apply to each batch before returning it.
        It must have exactly one required parameter.

    Raises
    ------
    IncorrectShapeError
        The ``data`` array is not 2-dimensional.
    RuntimeError
        The transformation function does not have exactly one required
        parameter.
    TypeError
        The ``batch_size`` is not an integer or the ``transformation_fn`` is
        not a callable object.
    ValueError
        The ``batch_size`` is smaller than 1.

    Yields
    ------
    slice : numpy.ndarray
        A slice of data.
    """
    if not fuav.is_2d_array(data):
        raise IncorrectShapeError('The data array must be 2-dimensional.')
    if fuav.is_structured_array(data):
        slice_fn = lambda d, a, b: d[a:b]  # noqa: E731
    else:
        slice_fn = lambda d, a, b: d[a:b, :]  # noqa: E731

    if not isinstance(batch_size, int):
        raise TypeError('The batch size must be an integer.')
    if batch_size < 1:
        raise ValueError('The batch size must be larger than 0.')

    if transformation_fn is None:
        transformation_fn = lambda slice: slice  # noqa: E731
    else:
        if not callable(transformation_fn):
            raise TypeError(
                'The transformation function must be a callable object.')
        required_params = fuv.get_required_parameters_number(transformation_fn)
        if required_params != 1:
            raise RuntimeError(
                'The transformation function must have only one required '
                'parameter; now it has {}.'.format(required_params))

    n_rows = data.shape[0]

    def _batch_data():
        for i_start in np.arange(0, n_rows, batch_size):
            i_end = np.min([i_start + batch_size, n_rows])
            data_slice_ = slice_fn(data, i_start, i_end)
            data_slice = transformation_fn(data_slice_)
            yield data_slice

    return _batch_data()
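Assuming the ``batch_data`` function above is in scope, consuming the generator it returns could look like this (an illustrative sketch):

import numpy as np

data = np.arange(10).reshape(5, 2)
for batch in batch_data(data, batch_size=2):
    print(batch.shape)
# (2, 2)
# (2, 2)
# (1, 2)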
Example 19
def local_fidelity_score(
        dataset: np.ndarray,
        data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Optional[int] = None,
        explained_feature_indices: Optional[List[IndexType]] = None,
        fidelity_radius_percentage: int = 5,
        samples_number: int = 50) -> float:
    """
    Computes local fidelity between a global and a local (surrogate) model.

    .. versionadded:: 0.0.2

    For a selected data point (``data_row``), it samples uniformly around it
    within a hypersphere whose radius corresponds to a percentage -- defined
    with the ``fidelity_radius_percentage`` parameter -- of the maximum l-2
    distance between the specified data point and all the instances in the
    ``dataset``. (This sampling is based on
    :class:`fatf.utils.data.augmentation.LocalSphere` data augmenter.)

    .. warning:: A ``dataset`` with categorical features.

       This surrogate evaluation metric should **not** be used when the
       ``dataset`` contains *categorical features* (even when they are encoded,
       e.g., one-hot encoding) since the l-2 distance computed on mixed true
       numerical and (numerically-encoded) categorical features causes the
       local sample (computed with the
       :class:`fatf.utils.data.augmentation.LocalSphere` data augmenter) to
       be ill-defined. Feature scaling could possibly be used to overcome this
       issue, however we leave such consideration up to the user.

    The global and local predictive functions can be either: a probabilistic
    predictor, a (multi-class) classifier or a regressor.

    +-------+---------------------------------------------------------------+
    |       | Global Model                                                  |
    +-------+--------+-------------------------+--------------------+-------+
    | Local |        | |prob|                  | |clf|              | |reg| |
    | Model +--------+-------------------------+--------------------+-------+
    |       | |prob| | OK, e.g., KL-divergence | OK, e.g., log-loss | |imp| |
    |       +--------+-------------------------+--------------------+-------+
    |       | |clf|  | OK (via thresholding)   | OK                 | |imp| |
    |       +--------+-------------------------+--------------------+-------+
    |       | |reg|  | OK for a single class   | |imp|              | OK    |
    +-------+--------+-------------------------+--------------------+-------+

    .. |prob| replace:: **probabilistic**
    .. |clf| replace:: **classifier**
    .. |reg| replace:: **regressor**
    .. |imp| replace:: Not possible

    If the ``global_predictive_function`` outputs **probabilities**, the
    following should be considered for different types of a local model:

    * The local model is **probabilistic** as well:

      + a native probabilistic evaluation metric, such as the
        `Kullback–Leibler divergence`_, can be used; or
      + a thresholding can be applied or a top prediction can be chosen for
        both the local and the global probabilistic prediction and a classic
        classification performance metric can be used.

    * The local model is a **classifier** -- the probabilistic output of the
      global model has to be thresholded or the top prediction needs to be
      selected and a classic classification performance metric can be used.
    * The local model is a **regressor** -- this is only possible if the
      regressor is fitted for the probabilistic output of one of the classes.
      In this case any of the standard regression evaluation measures can be
      used.

    If the ``global_predictive_function`` is a **classifier**, the
    following should be considered for different types of a local model:

    * The local model is **probabilistic**:

      + a native performance metric, like log-loss_, can be used; or
      + the probabilistic output of the local predictor can be thresholded or
        the top label selected and compare using standard classification
        performance metrics.

    * The local model is a **classifier** as well -- any standard (multi-class)
      classification performance metric can be used.
    * Having a local **regressor** is not possible in this case.

    Finally, if the ``global_predictive_function`` is a **regressor**, the
    local model can **only** be a regressor as well, in which case any standard
    regression evaluation metric can be used.

    If the problem being modelled is multi-class (for probabilistic models and
    classifiers), the local model can either be fitted to the original
    multi-class problem or as one-vs-the-rest for a selected class. In the
    latter case, when the global model is probabilistic, the
    ``explained_class_index`` parameter may be used to specify the class
    (column index) that the ``data_row`` belongs to (according to the global
    model) -- in this case only the selected column with probabilities will be
    passed to the local fidelity score (``metric_function``) function.

    .. note:: Why train the local model as one-vs-the-rest?

       When the local model is trained in the same output domain as the global
       model, the explanations extracted from this local model apply to all of
       the possible classes, which for some types of local models renders them
       uninformative. For example, consider training a decision tree locally
       and using the feature importance it provides. In this case we know
       which features are important in this local space but we cannot attribute
       these importances to any of the possible classes. However, a different
       type of explanation extracted from the same tree, for example, the
       logical conditions extracted from a root-to-leaf path that the selected
       ``data_row`` falls into, can be perfectly reasonable.

       If, on the other hand, the local model is trained as one-vs-the-rest,
       where the "one" class is often set to be the class of the selected
       ``data_row``, any type of explanation can be attributed to the
       selected class. In this case feature importances extracted from the
       local model can be attributed to the selected class in the specified
       neighbourhood. This mode of training the local model is required when
       the global model is probabilistic and the local one is a regressor, and
       optional for all the other combinations of the two.

       The consequence of training the local model as one-vs-the-rest is the
       need to train a separate local model for every class desired to be
       explained. For some local models and explanation types this is a
       requirement. For example, when the local model is a linear regression
       (trained on probabilities of a selected class) the only possible
       explanation is feature importance, which is meaningless in other cases.

       In general, when evaluating the quality of a local surrogate, the most
       truthful measure would be the one achieved when the local model is
       trained on the same set of target classes. A good quality of a local
       one-vs-the-rest model with respect to the global model should be
       treated with caution as it only indicates that the local model excels
       at this task and may not be a good approximation of the global decision
       process at all. Comparing the quality of two local models where one is
       multi-class and the other one-vs-the-rest is relatively complex and
       should be done with caution (the former local model has a more difficult
       task to solve).

    Examples of how to define the ``metric_function`` can be found in the
    *Examples* section down below. This local fidelity evaluation is inspired
    by the local fidelity method introduced in [LAUGEL2018SPHERES]_.

    .. _`Kullback–Leibler divergence`: https://en.wikipedia.org/wiki/
       Kullback–Leibler_divergence
    .. _log-loss: https://scikit-learn.org/stable/modules/
       model_evaluation.html#log-loss
    .. [LAUGEL2018SPHERES] Laugel, T., Renard, X., Lesot, M. J., Marsala,
       C., & Detyniecki, M. (2018). Defining locality for surrogates in
       post-hoc interpretablity. Workshop on Human Interpretability for
       Machine Learning (WHI) -- International Conference on Machine Learning,
       2018.

    Examples
    --------
    The metric function should be adjusted to the type of the global and local
    predictors (and the use of the ``explained_class_index`` parameter).

    >>> import numpy as np
    >>> data = np.array([[0, 1], [1, 1], [1, 0]])
    >>> targets = np.array(['a', 'b', 'c'])

    Let us assume that the global model is probabilistic, the local model is
    a regressor and we are explaining class ``'b'`` with index ``1``.
    (The index of the class is based on the lexicographical ordering of all the
    unique target values.)

    >>> explained_class_index = 1

    >>> import fatf.utils.models.models as fatf_models
    >>> global_model = fatf_models.KNN(k=1)
    >>> global_model.fit(data, targets)

    >>> probabilities = global_model.predict_proba(data)
    >>> selected_class_probabilities = probabilities[:, explained_class_index]

    >>> local_model = fatf_models.KNN(k=1, mode='regressor')
    >>> local_model.fit(data, selected_class_probabilities)

    One way to evaluate the performance of our local (surrogate) model in this
    scenario is the *Mean Squared Error*:

    >>> def mse(global_predictions, local_predictions):
    ...     mse = np.square(global_predictions - local_predictions)
    ...     mse = mse.mean()
    ...     return mse

    >>> import fatf.utils.transparency.surrogate_evaluation as surrogate_eval
    >>> mse_fidelity_score = surrogate_eval.local_fidelity_score(
    ...     data, data[0], global_model.predict_proba, local_model.predict,
    ...     mse, explained_class_index=explained_class_index)
    >>> mse_fidelity_score
    0.0

    Alternatively, if ``scikit-learn`` is available, an ROC can be computed,
    in which case the probabilities of the selected class need to be
    thresholded:

    >>> import sklearn.metrics
    >>> def roc(global_predictions, local_predictions):
    ...     global_predictions[global_predictions >= .5] = 1
    ...     global_predictions[global_predictions < .5] = 0
    ...     global_predictions = global_predictions.astype(int)
    ...
    ...     roc = sklearn.metrics.roc_auc_score(global_predictions,
    ...                                         local_predictions)
    ...     return roc

    >>> roc_fidelity_score = surrogate_eval.local_fidelity_score(
    ...     data, data[1], global_model.predict_proba, local_model.predict,
    ...     roc, explained_class_index=explained_class_index)
    >>> roc_fidelity_score
    1.0

    If both models are classifiers trained with the same set of target classes,

    >>> local_classifier = fatf_models.KNN(k=1)
    >>> local_classifier.fit(data, targets)

    a simple *accuracy* (implemented in FAT Forensics) can be used:

    >>> import fatf.utils.metrics.metrics as fatf_metrics
    >>> import fatf.utils.metrics.tools as fatf_metrics_tools
    >>> def accuracy(global_predictions, local_predictions):
    ...     confusion_matrix = fatf_metrics_tools.get_confusion_matrix(
    ...         local_predictions, global_predictions, labels=['a', 'b', 'c'])
    ...     accuracy = fatf_metrics.accuracy(confusion_matrix)
    ...     return accuracy

    >>> accuracy_fidelity_score = surrogate_eval.local_fidelity_score(
    ...     data, data[2], global_model.predict, local_classifier.predict,
    ...     accuracy)
    >>> accuracy_fidelity_score
    1.0

    (Note ``global_model.predict`` instead of ``global_model.predict_proba``.)

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset used to initialise the data
        sampler.
    data_row : Union[numpy.ndarray, numpy.void]
        A data point around which local fidelity is evaluated.
    global_predictive_function : Callable[[np.ndarray], np.ndarray]
        A Python callable (e.g., a function) that is responsible for predicting
        data points in the global model. This function can either be
        *probabilistic*, i.e., return a 2-dimensional numpy array with
        probabilities for every possible target class; a *regressor* (returning
        a 1-dimensional regression values array) or a *classifier* (returning a
        1-dimensional class prediction array). Regardless of the type it
        **must** allow only **one required parameter** -- a 2-dimensional data
        array to be predicted.
    local_predictive_function : Callable[[np.ndarray], np.ndarray]
        A Python callable (e.g., a function) that is responsible for predicting
        data points in the local (surrogate) model. For more details about the
        allowed function types please see the description of the
        ``global_predictive_function`` parameter.
    metric_function : Callable[[numpy.ndarray, numpy.ndarray], float]
        A Python callable (e.g., a function) that computes a (performance)
        metric between the predictions of the global model
        (``global_predictive_function``) and the predictions of the local
        (surrogate) model (``local_predictive_function``). The passed callable
        object **must** take exactly **two required parameters**: the first one
        being predictions of the global model and the latter predictions of the
        local model, and return a number (float) representing performance
        comparison of the two. This callable object has to be adjusted to the
        types of global and local predictive functions.
    explained_class_index : integer, optional (default=None)
        If the global model (``global_predictive_function``) is probabilistic,
        this parameter allows to select a single column of probabilities for a
        selected class to be passed to the ``metric_function``. This parameter
        is useful when the local (surrogate) model is a regressor predicting
        probabilities of this chosen class (the class being explained).
    explained_feature_indices : List[IndexType], optional (default=None)
        If the local (surrogate) model was trained on a subset of the features,
        this parameter allows to indicate which features should be used when
        predicting the generated data with the local model. If ``None``, all of
        the features will be used.
    fidelity_radius_percentage : integer, optional (default=5)
        The locality of the fidelity measure is enforced by limiting the
        distance from the selected ``data_row`` to the generated data used
        for the fidelity metric evaluation. This radius (of a hyper-sphere
        around the selected ``data_row``), within which the evaluation data
        are sampled, is defined as a percentage of the largest L2 (Euclidean)
        distance between any two data points in the input ``dataset``.
    samples_number : integer, optional (default=50)
        The number of samples to be generated when computing the local fidelity
        score.

    Warns
    -----
    UserWarning
        If the ``explained_class_index`` parameter is specified for a global
        model that is not probabilistic, it is ignored and the user is warned
        about it.

    Raises
    ------
    IncompatibleModelError
        The ``global_predictive_function`` or the ``local_predictive_function``
        does not require **exactly one** parameter.
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The input
        ``data_row`` is not 1-dimensional, i.e., neither a 1-dimensional
        numpy array nor a numpy void object for structured rows. The number
        of columns
        (features) in the ``data_row`` is different to the number of columns in
        the input ``dataset``.
    IndexError
        Some of the ``explained_feature_indices`` are invalid column indices
        for the input ``dataset``.
    TypeError
        The input ``dataset`` is not of a base type. The dtype of the
        ``data_row`` is too different from the dtype of the ``dataset``.
        The ``global_predictive_function`` or the ``local_predictive_function``
        is not a Python callable. The ``metric_function`` is not a Python
        callable or it does not require **exactly** two parameters.
        The ``explained_class_index`` is neither ``None`` nor an integer.
        The ``explained_feature_indices`` is neither ``None`` nor a Python
        list. The ``fidelity_radius_percentage`` is not an integer. The
        ``samples_number`` is not an integer.
    ValueError
        The ``explained_class_index`` is a negative integer or out of bounds
        for the number of classes output by the global probabilistic model
        (``global_predictive_function``). The ``fidelity_radius_percentage``
        is smaller than 1 or larger than 100. The ``samples_number`` is smaller
        than 1.

    Returns
    -------
    fidelity_score : float
        A metric of "closeness" between the global and the local predictive
        function predictions calculated using the ``metric_function`` on the
        sampled data.
    """
    # pylint: disable=too-many-arguments
    assert _validate_input_local_fidelity(
        dataset, data_row, global_predictive_function,
        local_predictive_function, metric_function, explained_class_index,
        explained_feature_indices, fidelity_radius_percentage,
        samples_number), 'Input is invalid.'

    augmentor = fuda.LocalSphere(dataset, int_to_float=False)
    sampled_data = augmentor.sample(data_row, fidelity_radius_percentage,
                                    samples_number)

    global_predictions = global_predictive_function(sampled_data)
    assert not fuav.is_structured_array(global_predictions), 'Is structured.'
    if explained_class_index is not None:
        assert fuav.is_2d_array(global_predictions), '2-D probabilities array.'
        global_predictions = global_predictions[:, explained_class_index]

    if explained_feature_indices is None:
        local_data = sampled_data
    else:
        if fuav.is_structured_array(sampled_data):
            local_data = sampled_data[explained_feature_indices]
        else:
            local_data = sampled_data[:, explained_feature_indices]
    local_predictions = local_predictive_function(local_data)

    fidelity_score = metric_function(global_predictions, local_predictions)

    return fidelity_score
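
Below is a hedged sketch (not part of the original API) of a
``metric_function`` suited to pairing a probabilistic global model with a
regression surrogate: once ``explained_class_index`` selects a single
probability column, both prediction arrays are 1-dimensional and numerical,
so mean squared error is a natural comparison. The names ``data``,
``global_model`` and ``local_regressor`` in the commented usage are
hypothetical.

import numpy as np


def mse_fidelity(global_predictions: np.ndarray,
                 local_predictions: np.ndarray) -> float:
    # Mean squared error between the global model's probabilities of the
    # explained class and the surrogate regressor's predicted values.
    return float(np.mean((global_predictions - local_predictions) ** 2))


# Hypothetical usage:
# score = local_fidelity_score(
#     data, data[2], global_model.predict_proba, local_regressor.predict,
#     mse_fidelity, explained_class_index=0)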
Example No. 20
def counterfactual_fairness_check(unfair_counterfactuals: Optional[
    np.ndarray] = None,
                                  distances: Optional[np.ndarray] = None,
                                  threshold: Optional[float] = None) -> bool:
    """
    Checks for counterfactual fairness using counterfactual fairness arrays.

    There are two different approaches to evaluating counterfactual fairness.
    The first one is to take the ``distances`` to the counterfactual examples
    and see whether any of them exceed a certain ``threshold``, in which case
    a given instance is considered to be treated unfairly. Alternatively, by
    using the ``unfair_counterfactuals`` array, this function checks whether
    there are any unfair counterfactual instances. If all of the input
    parameters are given, **the distance-based approach takes precedence**.

    Parameters
    ----------
    unfair_counterfactuals : numpy.ndarray, optional (default=None)
        A 2-dimensional numpy array with counterfactual examples that expose
        unfairness of a prediction.
    distances : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with distances to the counterfactual
        examples.
    threshold : number, optional (default=None)
        A numerical threshold above which a counterfactual instance is too
        far away and is therefore considered to be an exemplar of individual
        unfairness.

    Raises
    ------
    IncorrectShapeError
        The ``unfair_counterfactuals`` parameter is not a 2-dimensional array.
        The ``distances`` parameter is not a 1-dimensional array.
    RuntimeError
        The required input parameters were not given: either
        ``unfair_counterfactuals`` or both ``distances`` and ``threshold``
        must be provided.
    TypeError
        The ``threshold`` parameter is not a number.
    ValueError
        The ``distances`` array is not purely numerical.

    Returns
    -------
    counterfactually_unfair : boolean
        ``True`` if there are any counterfactually unfair instances, ``False``
        otherwise.
    """
    if distances is not None and threshold is not None:
        if not fuav.is_1d_array(distances):
            raise IncorrectShapeError('The distances parameter has to be a '
                                      '1-dimensional array.')
        if not fuav.is_numerical_array(distances):
            raise ValueError('The distances array has to be purely numerical.')
        if not isinstance(threshold, Number):
            raise TypeError('The threshold parameter has to be a number.')

        counterfactually_unfair = (distances > threshold).any()
    elif unfair_counterfactuals is not None:
        if not fuav.is_2d_array(unfair_counterfactuals):
            raise IncorrectShapeError('The unfair counterfactuals parameter '
                                      'has to be a 2-dimensional numpy array.')
        counterfactually_unfair = bool(unfair_counterfactuals.size)
    else:
        raise RuntimeError('Either of the two is required to run this '
                           'function: unfair_counterfactuals parameter or '
                           'both distances and threshold parameters.')

    return counterfactually_unfair
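
A hedged usage sketch of the two modes described above; both arrays are made
up for illustration.

import numpy as np

# Distance-based mode: any counterfactual farther away than the threshold
# flags the instance as treated unfairly (1.7 > 1.0 here).
distances = np.array([0.2, 0.5, 1.7])
assert counterfactual_fairness_check(distances=distances, threshold=1.0)

# Array-based mode: a non-empty array of unfair counterfactuals suffices.
unfair_cfs = np.array([[1.0, 2.0, 3.0]])
assert counterfactual_fairness_check(unfair_counterfactuals=unfair_cfs)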
Example No. 21
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Fits the model.

        Parameters
        ----------
        X : numpy.ndarray
            The KNN training data.
        y : numpy.ndarray
            The KNN training labels.

        Raises
        ------
        IncorrectShapeError
            Either the ``X`` array is not 2-dimensional, the ``y`` array is not
            1-dimensional, the number of rows in ``X`` is not the same as the
            number of elements in ``y`` or the ``X`` array has 0 rows or 0
            columns.
        PrefittedModelError
            Trying to fit the model when it has already been fitted. Usually
            raised when calling the ``fit`` method for the second time without
            clearing the model first.
        TypeError
            Trying to fit a KNN predictor in a regressor mode with
            non-numerical target variable.
        """
        if self._is_fitted:
            raise PrefittedModelError('This model has already been fitted.')
        if not fuav.is_2d_array(X):
            raise IncorrectShapeError('The training data must be a 2-'
                                      'dimensional array.')
        if not fuav.is_1d_array(y):
            raise IncorrectShapeError('The training data labels must be a 1-'
                                      'dimensional array.')
        if X.shape[0] == 0:
            raise IncorrectShapeError('The data array has to have at least '
                                      'one data point.')
        # If the array is structured the fuav.is_2d_array function takes care
        # of checking whether there is at least one column
        if not fuav.is_structured_array(X) and X.shape[1] == 0:
            raise IncorrectShapeError('The data array has to have at least '
                                      'one feature.')
        if X.shape[0] != y.shape[0]:
            raise IncorrectShapeError('The number of samples in X must be the '
                                      'same as the number of labels in y.')
        if not self._is_classifier and not fuav.is_numerical_array(y):
            raise TypeError('Regressor can only be fitted for a numerical '
                            'target vector.')

        numerical_indices, categorical_indices = fuat.indices_by_type(X)
        self._numerical_indices = numerical_indices
        self._categorical_indices = categorical_indices

        self._is_structured = fuav.is_structured_array(X)
        self._X = X
        self._y = y

        if self._is_classifier:
            unique_y, unique_y_counts = np.unique(self._y, return_counts=True)
            # Order labels lexicographically.
            unique_y_sort_index = np.argsort(unique_y)
            self._unique_y = unique_y[unique_y_sort_index]
            self._unique_y_counts = unique_y_counts[unique_y_sort_index]

            # Mask of the labels that share the maximum count.
            top_y_index = self._unique_y_counts == np.max(
                self._unique_y_counts)
            top_y_unique_sorted = np.sort(self._unique_y[top_y_index])
            self._majority_label = top_y_unique_sorted[0]

            self._unique_y_probabilities = (self._unique_y_counts /
                                            self._y.shape[0])
        else:
            self._majority_label = self._y.mean()
            self._unique_y = np.ndarray((0, ))
            self._unique_y_counts = np.ndarray((0, ))
            self._unique_y_probabilities = np.ndarray((0, ))

        self._X_n = self._X.shape[0]
        self._is_fitted = True
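
A minimal fitting sketch, assuming this method belongs to the KNN model
shipped with fatf; the import path and the ``k`` constructor parameter are
assumptions, not confirmed by this excerpt.

import numpy as np

from fatf.utils.models import KNN  # hypothetical import path

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
y = np.array(['a', 'b', 'a'])

model = KNN(k=2)  # the `k` parameter is an assumption
model.fit(X, y)
# Calling model.fit(X, y) again would raise PrefittedModelError.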
Example No. 22
def indices_by_type(array: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Identifies indices of columns with numerical and non-numerical values.

    Checks whether a numpy array is purely numerical or a structured array
    and returns two numpy arrays: the first one with indices of numerical
    columns and the second one with indices of non-numerical columns.

    Parameters
    ----------
    array : numpy.ndarray
        A numpy array to be checked (it has to be a 2-dimensional array).

    Raises
    ------
    TypeError
        The input array is not a numpy array-like object.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.
    IncorrectShapeError
        The input array is not 2-dimensional.

    Returns
    -------
    numerical_indices : numpy.ndarray
        A numpy array containing indices of the numerical columns of the input
        array.
    non_numerical_indices : numpy.ndarray
        A numpy array containing indices of the non-numerical columns of the
        input array.
    """
    if not isinstance(array, np.ndarray):
        raise TypeError('The input should be a numpy array-like.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_base_array(array):
        raise ValueError('indices_by_type only supports input arrays that '
                         'hold base numpy types, i.e. numerical and '
                         'string-like -- numpy void and object-like types are '
                         'not allowed.')

    if fuav.is_structured_array(array):
        assert len(array.dtype) > 1, 'This should be a 2D array.'
        numerical_indices_list = []
        non_numerical_indices_list = []

        for column_name in array.dtype.names:
            column_dtype = array.dtype[column_name]
            if fuav.is_numerical_dtype(column_dtype):
                numerical_indices_list.append(column_name)
            else:
                non_numerical_indices_list.append(column_name)

        numerical_indices = np.array(numerical_indices_list)
        non_numerical_indices = np.array(non_numerical_indices_list)
    else:
        if fuav.is_numerical_array(array):
            numerical_indices = np.array(range(array.shape[1]))
            non_numerical_indices = np.empty((0, ), dtype='i8')
        else:
            numerical_indices = np.empty((0, ), dtype='i8')
            non_numerical_indices = np.array(range(array.shape[1]))

    return numerical_indices, non_numerical_indices
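
A hedged illustration of the two input types this function handles; the
expected outputs are shown as comments.

import numpy as np

# Classic numerical array: every column index is numerical.
plain = np.array([[1, 2], [3, 4]])
numerical, non_numerical = indices_by_type(plain)
# numerical -> array([0, 1]); non_numerical -> array([], dtype=int64)

# Structured array: the indices are returned as column names.
structured = np.array([(1.0, 'x'), (2.0, 'y')],
                      dtype=[('a', 'f8'), ('b', 'U1')])
numerical, non_numerical = indices_by_type(structured)
# numerical -> array(['a'], ...); non_numerical -> array(['b'], ...)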
Example No. 23
def _validate_input_local_fidelity(
        dataset: np.ndarray, data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Union[int, None],
        explained_feature_indices: Union[List[IndexType], None],
        fidelity_radius_percentage: int, samples_number: int) -> bool:
    """
    Validates the input parameters for the ``local_fidelity_score`` function.

    This function validates the input parameters of the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function. The description of this function's input parameters, errors and
    exceptions can be found therein.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
    is_input_ok = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- numbers '
                        'and/or strings.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data_row must either be a '
                                  '1-dimensional numpy array or a numpy '
                                  'void object for structured data rows.')

    are_similar = fuav.are_similar_dtype_arrays(dataset, np.array([data_row]))
    if not are_similar:
        raise TypeError('The dtype of the data_row is too different from '
                        'the dtype of the dataset array.')

    # If the dataset is structured and the data_row has a different
    # number of features this will be caught by the above dtype check.
    # For classic numpy arrays this has to be done separately.
    if not fuav.is_structured_array(dataset):
        if dataset.shape[1] != data_row.shape[0]:
            raise IncorrectShapeError('The data_row must contain the same '
                                      'number of features as the dataset.')

    if callable(global_predictive_function):
        global_params_n = fuv.get_required_parameters_number(
            global_predictive_function)
        if global_params_n != 1:
            raise IncompatibleModelError(
                'The global predictive function must have exactly *one* '
                'required parameter to work with this metric.')
    else:
        raise TypeError('The global_predictive_function should be a Python '
                        'callable, e.g., a Python function.')

    if callable(local_predictive_function):
        local_params_n = fuv.get_required_parameters_number(
            local_predictive_function)
        if local_params_n != 1:
            raise IncompatibleModelError(
                'The local predictive function must have exactly *one* '
                'required parameter to work with this metric.')
    else:
        raise TypeError('The local_predictive_function should be a Python '
                        'callable, e.g., a Python function.')

    if callable(metric_function):
        if fuv.get_required_parameters_number(metric_function) != 2:
            raise TypeError('The metric_function must take exactly *two* '
                            'required parameters.')
    else:
        raise TypeError('The metric_function should be a Python callable, '
                        'e.g., a Python function.')

    # Explained class index
    global_prediction = global_predictive_function(dataset[:1])
    assert not fuav.is_structured_array(global_prediction), 'Must be plain.'
    assert global_prediction.shape[0] == 1, 'Just 1 data point was predicted.'
    if fuav.is_2d_array(global_prediction):  # A probabilistic model.
        if explained_class_index is not None:
            if isinstance(explained_class_index, int):
                if (explained_class_index >= global_prediction.shape[1]
                        or explained_class_index < 0):
                    raise ValueError('The explained_class_index parameter is '
                                     'negative or larger than the number of '
                                     'classes output by the global '
                                     'probabilistic model.')
            else:
                raise TypeError('For probabilistic global models, i.e., '
                                'global predictive functions, the '
                                'explained_class_index parameter has to be an '
                                'integer or None.')
    elif fuav.is_1d_array(global_prediction):
        if explained_class_index is not None:
            warnings.warn(
                'The explained_class_index parameter is not None and will be '
                'ignored since the global model is not probabilistic.',
                UserWarning)
    else:
        assert False, ('Global predictor must output a 1- or 2-dimensional '
                       'numpy array.')  # pragma: nocover

    if explained_feature_indices is not None:
        if isinstance(explained_feature_indices, list):
            invalid_indices = fuat.get_invalid_indices(
                dataset, np.asarray(explained_feature_indices))
            if invalid_indices.size:
                raise IndexError(
                    'The following column indices are invalid for the input '
                    'dataset: {}.'.format(invalid_indices))
        else:
            raise TypeError('The explained_feature_indices parameter must be '
                            'a Python list or None.')

    if isinstance(fidelity_radius_percentage, int):
        if fidelity_radius_percentage <= 0 or fidelity_radius_percentage > 100:
            raise ValueError('The fidelity_radius_percentage must be an '
                             'integer between 1 and 100.')
    else:
        raise TypeError('The fidelity_radius_percentage must be an integer '
                        'between 1 and 100.')

    if isinstance(samples_number, int):
        if samples_number < 1:
            raise ValueError('The samples_number must be a positive integer.')
    else:
        raise TypeError('The samples_number must be an integer.')

    is_input_ok = True
    return is_input_ok
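
The callable checks above delegate to ``fuv.get_required_parameters_number``;
a minimal sketch of the presumed idea -- counting parameters without default
values -- follows (a simplification that ignores ``*args``-style parameters).

import inspect


def required_parameters_number(func) -> int:
    # Count the parameters that carry no default value.
    parameters = inspect.signature(func).parameters.values()
    return sum(1 for p in parameters if p.default is p.empty)


def predictor(X): pass  # one required parameter -> a valid predictor


def metric(a, b, norm=True): pass  # two required params -> a valid metric


assert required_parameters_number(predictor) == 1
assert required_parameters_number(metric) == 2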
Example No. 24
    def _get_distances(self, X: np.ndarray) -> np.ndarray:
        """
        Gets distances for a mixture of numerical and categorical features.

        For numerical columns the distance is calculated as the Euclidean
        distance. For categorical columns (i.e. non-numerical, e.g. strings)
        the distance is 0 when the value matches and 1 otherwise.

        Parameters
        ----------
        X : numpy.ndarray
            A data array for which distances to the training data will be
            calculated.

        Raises
        ------
        AssertionError
            Raised when the model is not fitted, X is not a 2-dimensional
            array or X's dtype is different than training data's dtype. It is
            also raised when the distances matrix is not 2-dimensional.

        Returns
        -------
        distances : numpy.ndarray
            An array of distances between X and the training data.
        """
        # pylint: disable=invalid-name
        assert self._is_fitted, 'Cannot calculate distances on unfitted model.'
        assert fuav.is_2d_array(X), 'X must be a 2-dimensional array.'
        assert fuav.are_similar_dtype_arrays(X, self._X), \
            'X must have the same dtype as the training data.'

        distances_shape = (self._X.shape[0], X.shape[0])
        categorical_distances = np.zeros(distances_shape)
        numerical_distances = np.zeros(distances_shape)

        if self._is_structured:
            if self._categorical_indices.size:
                categorical_distances = fud.binary_array_distance(
                    self._X[self._categorical_indices],
                    X[self._categorical_indices])
            if self._numerical_indices.size:
                numerical_distances = fud.euclidean_array_distance(
                    self._X[self._numerical_indices],
                    X[self._numerical_indices])
        else:
            if self._categorical_indices.size:
                categorical_distances = fud.binary_array_distance(
                    self._X[:, self._categorical_indices],
                    X[:, self._categorical_indices])
            if self._numerical_indices.size:
                numerical_distances = fud.euclidean_array_distance(
                    self._X[:, self._numerical_indices],
                    X[:, self._numerical_indices])

        assert categorical_distances.shape == numerical_distances.shape, \
            'Different number of point-wise distances for these feature types.'
        distances = categorical_distances + numerical_distances
        assert fuav.is_2d_array(distances), 'Distances matrix must be 2D.'

        return distances
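
A self-contained, hedged sketch of the mixed-distance computation performed
above (the real code delegates to ``fud.binary_array_distance`` and
``fud.euclidean_array_distance``): Euclidean distance on the numerical
columns plus a 0/1 mismatch distance on the categorical ones, yielding a
(training rows, query rows) matrix.

import numpy as np

train_num = np.array([[0.0, 0.0], [3.0, 4.0]])
train_cat = np.array([['a'], ['b']])
query_num = np.array([[0.0, 0.0]])
query_cat = np.array([['b']])

# Euclidean distance between every training row and every query row.
numerical = np.linalg.norm(
    train_num[:, None, :] - query_num[None, :, :], axis=2)
# Binary distance: 1 for every mismatching categorical column.
categorical = (train_cat[:, None, :] != query_cat[None, :, :]).sum(axis=2)

distances = numerical + categorical
# distances -> array([[1.], [5.]])  (0+1 and 5+0)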
Example No. 25
def _validate_input_dc(data_set: np.ndarray,
                       categorical_indices: Union[None, List[Index]],
                       neighbours: int,
                       distance_function: Union[None, DistanceFunction],
                       normalise_scores: bool) -> bool:
    """
    Validates ``DensityCheck`` class initialiser's input parameters.

    Parameters
    ----------
    data_set : numpy.ndarray
        A 2-dimensional numpy array (either classic or structured) of a base
        type.
    categorical_indices : Union[None, List[column index]]
        Either ``None`` or a list of column indices to be treated as
        categorical.
    neighbours : integer
        The number of closest neighbours to be considered.
    distance_function : Union[None, Callable[[data row, data row], number]]
        Either ``None`` or a Python function that calculates a distance between
        two data points. This function takes as an input two 1-dimensional
        numpy arrays (for classic numpy arrays) or numpy voids (for structured
        numpy arrays) of equal length and outputs a number representing a
        distance between them. **The distance function is assumed to return the
        same distance regardless of the order in which the input parameters are
        given.**
    normalise_scores : boolean
        A boolean parameter indicating whether to normalise the scores
        (``True``) or not (``False``).

    Raises
    ------
    AttributeError
        The distance function does not require exactly 2 non-optional
        parameters.
    IncorrectShapeError
        The ``data_set`` array is not 2-dimensional.
    IndexError
        Some of the provided categorical column indices are invalid for the
        ``data_set`` array.
    TypeError
        The ``data_set`` array is not of a base type (strings and/or numbers).
        The ``neighbours`` parameter is not an integer. The
        ``distance_function`` is neither ``None`` nor Python callable (a
        function). The ``normalise_scores`` parameter is not a boolean. The
        ``categorical_indices`` parameter is not a Python list.
    ValueError
        The ``neighbours`` parameter is smaller than 1 or larger than the
        number of instances (rows) in the ``data_set`` array.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-branches
    is_valid = False

    if not fuav.is_2d_array(data_set):
        raise IncorrectShapeError('The data set should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(data_set):
        raise TypeError('The data set is not of a base type (numbers and/or '
                        'strings).')

    if categorical_indices is not None:
        if isinstance(categorical_indices, list):
            invalid_indices = fuat.get_invalid_indices(
                data_set, np.asarray(categorical_indices)).tolist()
            if invalid_indices:
                raise IndexError('The following indices are invalid for the '
                                 'input data set: {}.'.format(invalid_indices))
        else:
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')

    if isinstance(neighbours, int):
        if neighbours < 1 or neighbours > data_set.shape[0]:
            raise ValueError('The neighbours number parameter has to be '
                             'between 1 and the number of data points (rows) '
                             'in the data set array.')
    else:
        raise TypeError('The neighbours number parameter has to be an '
                        'integer.')

    if distance_function is not None:
        if callable(distance_function):
            required_param_n = 0
            params = inspect.signature(distance_function).parameters
            for param in params:
                if params[param].default is params[param].empty:
                    required_param_n += 1
            if required_param_n != 2:
                raise AttributeError('The distance function must require '
                                     'exactly 2 parameters. Given function '
                                     'requires {} '
                                     'parameters.'.format(required_param_n))
        else:
            raise TypeError('The distance function should be a Python '
                            'callable (function).')

    if not isinstance(normalise_scores, bool):
        raise TypeError('The normalise scores parameter should be a boolean.')

    is_valid = True
    return is_valid
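
For illustration, a distance function that satisfies this validator's
contract -- exactly two required parameters and symmetric in its arguments --
assuming classic (unstructured) numerical data rows.

import numpy as np


def manhattan_distance(row_a: np.ndarray, row_b: np.ndarray) -> float:
    # Symmetric by construction; takes exactly two required parameters.
    return float(np.abs(row_a - row_b).sum())


assert manhattan_distance(np.array([0, 0]), np.array([1, 2])) == 3.0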
Example No. 26
    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predicts labels of new instances with the fitted model.

        Parameters
        ----------
        X : numpy.ndarray
            The data for which labels will be predicted.

        Raises
        ------
        IncorrectShapeError
            X is not a 2-dimensional array, it has 0 rows or it has a different
            number of columns than the training data.
        UnfittedModelError
            Raised when trying to predict data when the model has not been
            fitted yet. Try using the ``fit`` method to fit the model first.
        ValueError
            X has a different dtype than the data used to fit the model.

        Returns
        -------
        predictions : numpy.ndarray
            Predicted class labels for each data point.
        """
        # pylint: disable=too-many-locals,too-many-branches
        if not self._is_fitted:
            raise UnfittedModelError('This model has not been fitted yet.')
        if not fuav.is_2d_array(X):
            raise IncorrectShapeError('X must be a 2-dimensional array. If '
                                      'you want to predict a single data '
                                      'point please format it as a single row '
                                      'in a 2-dimensional array.')
        if not fuav.are_similar_dtype_arrays(X, self._X):
            raise ValueError('X must have the same dtype as the training '
                             'data.')
        if not X.shape[0]:
            raise IncorrectShapeError('X must have at least one row.')
        # No need to check for columns in a structured array -> this is handled
        # by the dtype checker.
        if not fuav.is_structured_array(X):
            if X.shape[1] != self._X.shape[1]:
                raise IncorrectShapeError(('X must have the same number of '
                                           'columns as the training data '
                                           '({}).').format(self._X.shape[1]))

        predictions = np.empty((X.shape[0], ))

        if self._k < self._X_n:
            distances = self._get_distances(X)
            # If there are 3 nearest neighbours within distances 1, 2 and 2 and
            # k is set to 2, then argpartition will always take the first
            # within distance 2.
            knn = np.argpartition(distances, self._k, axis=0)
            predictions = []
            for column in knn.T:
                close_labels = self._y[column[:self._k]]
                if self._is_classifier:
                    values, counts = np.unique(close_labels,
                                               return_counts=True)
                    # If there is a tie in the counts take into consideration
                    # the overall label count in the training data to resolve
                    # it.
                    top_label_index = counts == counts.max()
                    top_label_unique_sorted = np.sort(values[top_label_index])
                    assert len(top_label_unique_sorted.shape) == 1, \
                        'This should be a flat array.'
                    if top_label_unique_sorted.shape[0] > 1:
                        # Resolve the tie.
                        # Get the counts of these labels in the training data.
                        labels_filter = np.array(self._unique_y.shape[0] *
                                                 [False])
                        for top_prediction in top_label_unique_sorted:
                            unique_y_filter = self._unique_y == top_prediction
                            np.logical_or(labels_filter,
                                          unique_y_filter,
                                          out=labels_filter)
                        g_top_label = self._unique_y[labels_filter]
                        g_top_label_counts = (
                            self._unique_y_counts[labels_filter])

                        # If any of the global labels tie on count as well,
                        # the lexicographically smallest one wins below.
                        g_top_label_index = g_top_label_counts == np.max(
                            g_top_label_counts)
                        g_top_label_sorted = np.sort(
                            g_top_label[g_top_label_index])

                        prediction = g_top_label_sorted[0]
                    else:
                        prediction = top_label_unique_sorted[0]
                else:
                    prediction = close_labels.mean()

                predictions.append(prediction)
            predictions = np.array(predictions)
        else:
            predictions = np.array(X.shape[0] * [self._majority_label])

        return predictions
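
A small sketch of the ``np.argpartition`` behaviour that the neighbour
selection above relies on: the first ``k`` rows index the ``k`` smallest
distances in each column, with ties resolved by whichever index the
partition happens to keep first.

import numpy as np

distances = np.array([[2.0], [1.0], [2.0]])  # 3 training points, 1 query
k = 2
knn = np.argpartition(distances, k, axis=0)
# The first k rows of knn index the k smallest distances per column;
# their mutual order is unspecified.
nearest = knn[:k, 0]  # e.g. array([1, 0]) or array([1, 2])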
Example No. 27
def validate_confusion_matrix(confusion_matrix: np.ndarray,
                              label_index: Optional[int] = None) -> bool:
    """
    Validates a confusion matrix.

    This function checks whether the ``confusion_matrix`` is 2-dimensional,
    square, unstructured and of integer kind.

    If the ``label_index`` parameter is given, it is checked to be a valid
    index for the given confusion matrix.

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        A confusion matrix to be validated.
    label_index : integer, optional (default=None)
        An index whose validity will be checked for the confusion matrix (if
        not ``None``).

    Raises
    ------
    IncorrectShapeError
        The confusion matrix is not a 2-dimensional numpy array, it is not
        square (equal width and height) or it is smaller than 2x2.
    IndexError
        The ``label_index`` (if given) is not valid for the confusion matrix.
    TypeError
        The confusion matrix is not of an integer kind (e.g. ``int``,
        ``numpy.int32``, ``numpy.int64``). The ``label_index`` is not an
        integer.
    ValueError
        The confusion matrix is a structured numpy array.

    Returns
    -------
    is_valid : boolean
        ``True`` if the confusion matrix is valid, ``False`` otherwise.
    """
    is_valid = False

    if not fuav.is_2d_array(confusion_matrix):
        raise IncorrectShapeError('The confusion matrix has to be a '
                                  '2-dimensional numpy array.')
    if fuav.is_structured_array(confusion_matrix):
        raise ValueError('The confusion matrix cannot be a structured numpy '
                         'array.')
    if confusion_matrix.shape[0] != confusion_matrix.shape[1]:
        raise IncorrectShapeError('The confusion matrix has to be a square '
                                  '(equal width and height) numpy array.')
    if confusion_matrix.shape[1] < 2:
        raise IncorrectShapeError('The confusion matrix needs to be at least '
                                  '2x2.')
    if confusion_matrix.dtype.kind != 'i':
        raise TypeError('The confusion matrix has to be of integer kind.')

    if label_index is not None:
        if not isinstance(label_index, int):
            raise TypeError('The label index has to be an integer.')
        if label_index < 0 or label_index >= confusion_matrix.shape[0]:
            msg = ('The label index {} is not a valid index for the confusion '
                   'matrix of shape {}x{}.')
            msg = msg.format(label_index, confusion_matrix.shape[0],
                             confusion_matrix.shape[1])
            raise IndexError(msg)

    is_valid = True
    return is_valid
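
A hedged usage sketch of this validator with a well-formed confusion matrix.

import numpy as np

cm = np.array([[5, 1], [2, 7]])  # 2-dimensional, square, integer kind
assert validate_confusion_matrix(cm)
assert validate_confusion_matrix(cm, label_index=1)

# A float matrix would fail the integer-kind check:
# validate_confusion_matrix(cm.astype(float))  # raises TypeError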
Example No. 28
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Calculates label probabilities for new instances with the fitted model.

        Parameters
        ----------
        X : numpy.ndarray
            The data for which label probabilities will be predicted.

        Raises
        ------
        IncorrectShapeError
            X is not a 2-dimensional array, it has 0 rows or it has a different
            number of columns than the training data.
        UnfittedModelError
            Raised when trying to predict data when the model has not been
            fitted yet. Try using the ``fit`` method to fit the model first.
        RuntimeError
            Raised when trying to use this method when the predictor is
            initialised as a regressor.
        ValueError
            X has a different dtype than the data used to fit the model.

        Returns
        -------
        probabilities : numpy.ndarray
            Probabilities of each instance belonging to every class. The
            labels in the returned array are ordered lexicographically.
        """
        if not self._is_classifier:
            raise RuntimeError('This functionality is not available for a '
                               'regressor.')

        if not self._is_fitted:
            raise UnfittedModelError('This model has not been fitted yet.')
        if not fuav.is_2d_array(X):
            raise IncorrectShapeError('X must be a 2-dimensional array. If '
                                      'you want to predict a single data '
                                      'point please format it as a single row '
                                      'in a 2-dimensional array.')
        if not fuav.are_similar_dtype_arrays(X, self._X):
            raise ValueError('X must have the same dtype as the training '
                             'data.')
        if not X.shape[0]:
            raise IncorrectShapeError('X must have at least one row.')
        # No need to check for columns in a structured array -> this is handled
        # by the dtype checker.
        if not fuav.is_structured_array(X):
            if X.shape[1] != self._X.shape[1]:
                raise IncorrectShapeError(('X must have the same number of '
                                           'columns as the training data '
                                           '({}).').format(self._X.shape[1]))

        probabilities = np.empty((X.shape[0], self._unique_y.shape[0]))

        if self._k < self._X_n:
            distances = self._get_distances(X)
            knn = np.argpartition(distances, self._k, axis=0)
            probabilities = []
            for column in knn.T:
                close_labels = self._y[column[:self._k]]
                values, counts = np.unique(close_labels, return_counts=True)
                total_counts = np.sum(counts)
                probs = np.zeros((self._unique_y.shape[0], ))
                for i in range(values.shape[0]):
                    ind = np.where(self._unique_y == values[i])[0]
                    probs[ind] = counts[i] / total_counts
                probabilities.append(probs)
            probabilities = np.array(probabilities)
        else:
            probabilities = np.tile(self._unique_y_probabilities,
                                    (X.shape[0], 1))
        return probabilities
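
A worked, self-contained instance of the per-query probability computation
used in the loop above, assuming three classes and k=3 neighbours.

import numpy as np

unique_y = np.array(['a', 'b', 'c'])      # lexicographically ordered classes
close_labels = np.array(['b', 'b', 'a'])  # labels of the k=3 nearest points

values, counts = np.unique(close_labels, return_counts=True)
probs = np.zeros((unique_y.shape[0], ))
for value, count in zip(values, counts):
    probs[np.where(unique_y == value)[0]] = count / counts.sum()
# probs -> array([0.33333333, 0.66666667, 0.])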
Example No. 29
def _validate_input_lasso_path(dataset: np.ndarray, target: np.ndarray,
                               weights: Union[np.ndarray, None],
                               features_number: Union[int, None],
                               features_percentage: int) -> bool:
    """
    Validates the input parameters of the ``lasso_path`` function.

    For the input parameter description, warnings and exceptions please see
    the documentation of the
    :func:`fatf.utils.data.feature_selection.sklearn.lasso_path` function.

    Returns
    -------
    input_is_valid : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-branches
    input_is_valid = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input data set must be a 2-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(dataset):
        raise TypeError('The input data set must be purely numerical. (The '
                        'lasso path feature selection is based on '
                        'sklearn.linear_model.lars_path function.)')

    if not fuav.is_1d_array(target):
        raise IncorrectShapeError('The target array must be a 1-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(target):
        raise TypeError('The target array must be numerical since this '
                        'feature selection method is based on Lasso '
                        'regression.')
    if target.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of labels in the target array '
                                  'must agree with the number of samples in '
                                  'the data set.')

    if weights is not None:
        if not fuav.is_1d_array(weights):
            raise IncorrectShapeError('The weights array must be '
                                      '1-dimensional.')
        if not fuav.is_numerical_array(weights):
            raise TypeError('The weights array must be purely numerical.')
        if weights.shape[0] != dataset.shape[0]:
            raise IncorrectShapeError('The number of weights in the weights '
                                      'array must be the same as the number '
                                      'of samples in the input data set.')

    if features_number is not None:
        if not isinstance(features_number, int):
            raise TypeError('The features_number parameter must be an '
                            'integer.')
        if features_number < 1:
            raise ValueError('The features_number parameter must be a '
                             'positive integer.')

    if not isinstance(features_percentage, int):
        raise TypeError('The features_percentage parameter must be an '
                        'integer.')
    if features_percentage < 0 or features_percentage > 100:
        raise ValueError('The features_percentage parameter must be between '
                         '0 and 100 (inclusive).')

    input_is_valid = True
    return input_is_valid
Example No. 30
    def occlude_segments_vectorised(
            self,
            vectorised_segments_subset: np.ndarray,
            image: Optional[np.ndarray] = None,
            colour: Optional[Union[str, int, RGBcolour]] = None) -> np.ndarray:
        """
        Generates multiple images with selected subsets of segments occluded.

        The segments to be occluded are provided as boolean vectors;
        either a 1-D numpy array of length equal to the number of segments
        to produce a single occluded image, or a 2-D array where each row
        represents a separate occlusion pattern.
        In this format the n-th element (or column) corresponds to the
        segment with id n+1;
        1 indicates that the segment should be preserved and 0 that it should
        be occluded.

        The occlusion is applied on top of the image used to initialise this
        class; alternatively, an external ``image`` of the same type and
        dimensions can be supplied.
        If a colouring strategy different to the one of the class is desired,
        it can be specified via the ``colour`` parameter.

        Parameters
        ----------
        vectorised_segments_subset : numpy.ndarray
            A 1-D boolean occlusion vector of length equal to the number of
            segments, or a 2-D boolean matrix of shape (number of occlusion
            images to generate, number of segments).
        image : numpy.ndarray, optional (default=None)
            If provided, this ``image`` will be occluded instead of the one
            used to initialise this class.
        colour : string, integer, tuple(integer, integer, integer), \
optional (default=None)
            A colour specifier.
            By default (``colour=None``) the colouring strategy of the class is
            used.
            See the documentation of the
            :func:`fatf.utils.data.occlusion.Occlusion.set_colouring_strategy`
            method for more details.

        Raises
        ------
        IncorrectShapeError
            The ``vectorised_segments_subset`` numpy array is neither 1- nor
            2-dimensional.
            The number of elements in ``vectorised_segments_subset`` (when it
            is 1-D) does not correspond to the number of segments.
            The number of columns in ``vectorised_segments_subset`` (when it is
            2-D) does not correspond to the number of segments.
            The input ``image`` is neither a 2- nor 3-dimensional numpy array.
            The height, width or number of channels in the ``image`` array
            does not agree with the same parameters of the class image.
        TypeError
            The ``vectorised_segments_subset`` numpy array is not boolean.

        Returns
        -------
        image_occluded : numpy.ndarray
            A numpy array holding the image(s) with the selected subset(s) of
            segments occluded.
        """
        # pylint: disable=too-many-branches
        if image is None:
            canvas = self.image
        else:
            assert (  # yapf: disable
                fuds._validate_image_array(  # pylint: disable=protected-access
                    image, 'image')), 'Invalid image.'
            if image.shape != self.image.shape:
                raise IncorrectShapeError(
                    'The width, height or number of channels of the input '
                    'image does not agree with the same parameters of the '
                    'original image.')
            canvas = image

        if colour is None:
            colouring_strategy = self._colouring_strategy
        else:
            colouring_strategy = self._generate_colouring_strategy(colour)

        if fuav.is_structured_array(vectorised_segments_subset):
            raise TypeError('The vector representation of segments cannot be '
                            'a structured numpy array.')
        if not fuav.is_numerical_array(vectorised_segments_subset):
            raise TypeError('The vector representation of segments should be '
                            'a numerical numpy array.')
        if fuav.is_1d_array(vectorised_segments_subset):
            if vectorised_segments_subset.shape[0] != self.segments_number:
                raise IncorrectShapeError(
                    ('The number of elements ({}) in the vector '
                     'representation of segments should correspond to the '
                     'unique number of segments ({}).').format(
                         vectorised_segments_subset.shape[0],
                         self.segments_number))
            samples = 1
            vectorised_segments_subset = np.asarray(
                [vectorised_segments_subset])
        elif fuav.is_2d_array(vectorised_segments_subset):
            if vectorised_segments_subset.shape[1] != self.segments_number:
                raise IncorrectShapeError(
                    ('The number of columns ({}) in the vector representation '
                     'of segments should correspond to the unique number of '
                     'segments ({}).').format(
                         vectorised_segments_subset.shape[1],
                         self.segments_number))
            samples = vectorised_segments_subset.shape[0]
        else:
            raise IncorrectShapeError(
                'The vector representation of segments should be a 1- or '
                '2-dimensional numpy array.')
        _unique_entries = set(np.unique(vectorised_segments_subset).astype(
            int)).difference((0, 1))  # yapf: disable
        if _unique_entries:
            raise TypeError('The vector representation of segments should be '
                            'a binary numpy array.')

        image_occluded = np.repeat(canvas[np.newaxis, :], samples, axis=0)
        for i, vec in enumerate(vectorised_segments_subset):
            # Get ids of segments to be occluded (0s) from a vector form
            # 1 is added as segments are numbered from 1, not 0
            segments_subset = np.where(vec == 0)[0] + 1
            occlusion_mask = fuds.get_segment_mask(segments_subset.tolist(),
                                                   self.segments)
            image_occluded[i, occlusion_mask] = colouring_strategy(
                occlusion_mask)
        if samples == 1:
            image_occluded = image_occluded[0]

        return image_occluded
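
A hedged usage sketch; the module path follows the cross-reference in the
docstring above, but the ``Occlusion`` constructor arguments are assumptions
not confirmed by this excerpt.

import numpy as np

from fatf.utils.data.occlusion import Occlusion

image = np.zeros((4, 4, 3), dtype=np.uint8)
segments = np.array([[1, 1, 2, 2]] * 4)  # two vertical segments

occluder = Occlusion(image, segments)  # constructor signature is assumed
# Preserve segment 1, occlude segment 2 (1 -> preserve, 0 -> occlude):
single = occluder.occlude_segments_vectorised(np.array([1, 0]))
# Two occlusion patterns at once yield a (2, 4, 4, 3) array of images:
batch = occluder.occlude_segments_vectorised(np.array([[1, 0], [0, 1]]))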