コード例 #1
0
def hamming_distance(x: Union[np.ndarray, np.void],
                     y: Union[np.ndarray, np.void],
                     **kwargs: bool) -> Union[int, float]:
    """
    Calculates the Hamming distance between two 1-dimensional textual arrays.

    Either input may be a 1D numpy array or a single row of a structured
    numpy array (numpy's void). Elements found at the same position in both
    arrays are compared with the
    :func:`fatf.utils.distances.hamming_distance_base` function and the
    per-position results are summed up.

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first array; it has to be 1-dimensional and textual.
    y : Union[numpy.ndarray, numpy.void]
        The second array; it has to be 1-dimensional and textual.
    **kwargs : boolean
        Keyword arguments forwarded unchanged to every call of the
        :func:`fatf.utils.distances.hamming_distance_base` function.

    Raises
    ------
    IncorrectShapeError
        One of the inputs is not 1-dimensional or the two inputs differ in
        length.
    ValueError
        One of the inputs is not a textual array.

    Returns
    -------
    distance : Union[integer, float]
        The sum of the element-wise Hamming distances.
    """
    # pylint: disable=invalid-name
    shape_checks = ((x, 'The x array should be 1-dimensional.'),
                    (y, 'The y array should be 1-dimensional.'))
    for candidate, message in shape_checks:
        if not fuav.is_1d_like(candidate):
            raise IncorrectShapeError(message)

    # Structured rows are flattened into plain arrays before any comparison
    x_array = fuat.as_unstructured(x)
    y_array = fuat.as_unstructured(y)

    type_checks = ((x_array, 'The x array should be textual.'),
                   (y_array, 'The y array should be textual.'))
    for candidate, message in type_checks:
        if not fuav.is_textual_array(candidate):
            raise ValueError(message)

    if x_array.shape[0] != y_array.shape[0]:
        raise IncorrectShapeError('The x and y arrays should have the same '
                                  'length.')

    def pair_distance(pair):
        # ``pair`` is one column of the stacked array: (x element, y element)
        return hamming_distance_base(pair[0], pair[1], **kwargs)

    stacked = np.vstack((x_array, y_array))
    per_position = np.apply_along_axis(pair_distance, 0, stacked)
    return per_position.sum()
コード例 #2
0
def binary_distance(x: Union[np.ndarray, np.void],
                    y: Union[np.ndarray, np.void],
                    normalise: bool = False) -> Union[int, float]:
    """
    Counts the positions at which two 1-dimensional arrays differ.

    Every position where the two inputs hold different values adds one to the
    distance. Either input may be a 1D numpy array or a single row of a
    structured numpy array (numpy's void). This function only validates the
    shape of its inputs; no dtype check is performed here.

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first array (has to be 1-dimensional).
    y : Union[numpy.ndarray, numpy.void]
        The second array (has to be 1-dimensional).
    normalise : boolean, optional (default=False)
        If ``True``, the mismatch count is divided by the length of the input
        arrays.

    Raises
    ------
    IncorrectShapeError
        One of the inputs is not 1-dimensional or the two inputs differ in
        length.

    Returns
    -------
    distance : Union[integer, float]
        The (optionally normalised) number of mismatching positions.
    """
    # pylint: disable=invalid-name
    shape_checks = ((x, 'The x array should be 1-dimensional.'),
                    (y, 'The y array should be 1-dimensional.'))
    for candidate, message in shape_checks:
        if not fuav.is_1d_like(candidate):
            raise IncorrectShapeError(message)

    # Structured rows are flattened into plain arrays before the comparison
    x_array = fuat.as_unstructured(x)
    y_array = fuat.as_unstructured(y)

    elements_number = x_array.shape[0]
    if elements_number != y_array.shape[0]:
        raise IncorrectShapeError('The x and y arrays should have the same '
                                  'length.')

    mismatches = x_array != y_array
    distance = mismatches.sum()
    if not normalise:
        return distance

    logger.debug('Binary distance is being normalised.')
    return distance / elements_number
コード例 #3
0
def euclidean_distance(x: Union[np.ndarray, np.void],
                       y: Union[np.ndarray, np.void]) -> float:
    """
    Computes the Euclidean distance between two 1-dimensional numerical arrays.

    Either input may be a 1D numpy array or a single row of a structured
    numpy array (numpy's void). The distance is the norm, as computed by
    ``numpy.linalg.norm``, of the element-wise difference of the two arrays.

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first array; it has to be 1-dimensional and purely numerical.
    y : Union[numpy.ndarray, numpy.void]
        The second array; it has to be 1-dimensional and purely numerical.

    Raises
    ------
    IncorrectShapeError
        One of the inputs is not 1-dimensional or the two inputs differ in
        length.
    ValueError
        One of the inputs is not purely numerical.

    Returns
    -------
    distance : float
        The Euclidean distance between the two arrays.
    """
    # pylint: disable=invalid-name
    shape_checks = ((x, 'The x array should be 1-dimensional.'),
                    (y, 'The y array should be 1-dimensional.'))
    for candidate, message in shape_checks:
        if not fuav.is_1d_like(candidate):
            raise IncorrectShapeError(message)

    # Structured rows are flattened into plain arrays before the subtraction
    x_array = fuat.as_unstructured(x)
    y_array = fuat.as_unstructured(y)

    type_checks = ((x_array, 'The x array should be purely numerical.'),
                   (y_array, 'The y array should be purely numerical.'))
    for candidate, message in type_checks:
        if not fuav.is_numerical_array(candidate):
            raise ValueError(message)

    if x_array.shape[0] != y_array.shape[0]:
        raise IncorrectShapeError(('The x and y arrays should have the same '
                                   'length.'))

    difference = x_array - y_array
    return np.linalg.norm(difference)
コード例 #4
0
def hamming_point_distance(y: Union[np.ndarray, np.void], X: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Computes the Hamming distance from ``y`` to each row of ``X``.

    ``y`` is either a 1-dimensional textual numpy array or a row of a
    structured numpy array (numpy's void), and ``X`` is a 2-dimensional
    textual numpy array with as many columns as ``y`` has elements. Every row
    of ``X`` is compared against ``y`` with the ``hamming_distance`` function.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        The reference point (1-dimensional and textual) from which the
        distances are measured.
    X : numpy.ndarray
        The array (2-dimensional and textual) to whose rows the distances are
        measured.
    **kwargs : boolean
        Keyword arguments forwarded to ``hamming_distance`` and, through it,
        to the :func:`fatf.utils.distances.hamming_distance_base` function.

    Raises
    ------
    IncorrectShapeError
        ``y`` is not 1-dimensional, ``X`` is not 2-dimensional or the length
        of ``y`` differs from the number of columns in ``X``.
    ValueError
        One of the inputs is not a textual array.

    Returns
    -------
    distances : numpy.ndarray
        The Hamming distances between ``y`` and every row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # Structured inputs are converted to plain arrays first
    y_array = fuat.as_unstructured(y)
    X_array = fuat.as_unstructured(X)  # pylint: disable=invalid-name

    type_checks = ((y_array, 'The y array should be textual.'),
                   (X_array, 'The X array should be textual.'))
    for candidate, message in type_checks:
        if not fuav.is_textual_array(candidate):
            raise ValueError(message)

    # The point must have one element per column of the array
    point_length = y_array.shape[0]
    columns_number = X_array.shape[1]
    if point_length != columns_number:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    # Each row of X_array becomes the first argument of hamming_distance
    return np.apply_along_axis(hamming_distance, 1, X_array, y_array,
                               **kwargs)
コード例 #5
0
def _validate_input(data_row: Union[np.ndarray, np.void],
                    samples_number: int) -> bool:
    """
    Checks the input parameters of an instance sampler function.

    ``data_row`` has to be 1-dimensional-like and ``samples_number`` has to be
    an integer that is at least 1. The parameters and the errors are described
    in the documentation of the
    :func:`fatf.utils.data.instance_augmentation.binary_sampler` function.

    Returns
    -------
    is_valid : boolean
        ``True`` when every check passes; a failed check raises an exception
        instead of returning ``False``.
    """
    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data_row must either be a '
                                  '1-dimensional numpy array or a numpy '
                                  'void object for structured rows.')

    # The type is verified first so that the comparison below is safe
    if not isinstance(samples_number, int):
        raise TypeError('The samples_number parameter must be an integer.')
    if samples_number < 1:
        raise ValueError('The samples_number parameter must be a positive '
                         'integer.')

    return True
コード例 #6
0
    def _validate_data_point(self, data_point: DataRow, clip: bool) -> bool:
        """
        Checks the input parameters of the ``score_data_point`` method.

        Parameters
        ----------
        data_point : Union[numpy.array, numpy.void]
            A data row: a numpy ndarray for classic numpy arrays or a numpy
            void for structured numpy arrays.
        clip : boolean
            The ``clip`` parameter of the validated method; only its type is
            checked here.

        Raises
        ------
        IncorrectShapeError
            The data point is not a 1-dimensional numpy array (a numpy ndarray
            for classic arrays or a numpy void for structured arrays). The
            data point has a different number of columns (features) than the
            data set used to initialise this class.
        TypeError
            The data point is not of a base type (strings and/or numbers). The
            dtype of the data point is too different from the dtype of the
            data set used to initialise this class. The ``clip`` parameter is
            not a boolean.

        Returns
        -------
        is_valid : boolean
            ``True`` when every check passes; a failed check raises an
            exception instead of returning ``False``.
        """
        if not fuav.is_1d_like(data_point):
            raise IncorrectShapeError('The data point has to be 1-dimensional '
                                      'numpy array or numpy void (for '
                                      'structured arrays).')

        # Wrap the row in an array so it can be compared with the data set
        wrapped_point = np.asarray([data_point])
        if not fuav.is_base_array(wrapped_point):
            raise TypeError('The data point has to be of a base type (strings '
                            'and/or numbers).')

        dtypes_agree = fuav.are_similar_dtype_arrays(self.data_set,
                                                     wrapped_point)
        if not dtypes_agree:
            raise TypeError('The dtypes of the data set used to initialise '
                            'this class and the provided data point are too '
                            'different.')

        # For structured arrays a mismatch in the number of columns is already
        # detected by the dtype comparison, hence only classic arrays are
        # checked here
        columns_differ = (
            not self._is_structured
            and self.data_set.shape[1] != wrapped_point.shape[1])
        if columns_differ:
            raise IncorrectShapeError('The data point has different '
                                      'number of columns (features) than '
                                      'the data set used to initialise '
                                      'this class.')

        if not isinstance(clip, bool):
            raise TypeError('The clip parameter has to be a boolean.')

        return True
コード例 #7
0
def euclidean_point_distance(y: Union[np.ndarray, np.void],
                             X: np.ndarray) -> np.ndarray:
    """
    Computes the Euclidean distance from ``y`` to each row of ``X``.

    ``y`` is either a 1-dimensional numerical numpy array or a row of a
    structured numpy array (numpy's void), and ``X`` is a 2-dimensional
    numerical numpy array with as many columns as ``y`` has elements. Every
    row of ``X`` is compared against ``y`` with the ``euclidean_distance``
    function.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        The reference point (1-dimensional and purely numerical) from which
        the distances are measured.
    X : numpy.ndarray
        The array (2-dimensional and purely numerical) to whose rows the
        distances are measured.

    Raises
    ------
    IncorrectShapeError
        ``y`` is not 1-dimensional, ``X`` is not 2-dimensional or the length
        of ``y`` differs from the number of columns in ``X``.
    ValueError
        One of the inputs is not purely numerical.

    Returns
    -------
    distances : numpy.ndarray
        The Euclidean distances between ``y`` and every row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # Structured inputs are converted to plain arrays first
    y_array = fuat.as_unstructured(y)
    X_array = fuat.as_unstructured(X)  # pylint: disable=invalid-name

    type_checks = ((y_array, 'The y array should be purely numerical.'),
                   (X_array, 'The X array should be purely numerical.'))
    for candidate, message in type_checks:
        if not fuav.is_numerical_array(candidate):
            raise ValueError(message)

    # The point must have one element per column of the array
    point_length = y_array.shape[0]
    columns_number = X_array.shape[1]
    if point_length != columns_number:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    # Each row of X_array becomes the first argument of euclidean_distance
    return np.apply_along_axis(euclidean_distance, 1, X_array, y_array)
コード例 #8
0
def _validate_input_drm(dataset: np.ndarray, data_row: Union[np.ndarray,
                                                             np.void]) -> bool:
    """
    Checks :func:`fatf.utils.data.transformation.dataset_row_masking` input.

    The ``dataset`` has to be a 2-dimensional array of a base type and the
    ``data_row`` has to be 1-dimensional-like, with a dtype similar to the
    dataset's (non-strict comparison) and with as many elements as the
    ``dataset`` has columns.

    The input parameters, warnings and exceptions are described in the
    documentation of the
    :func:`fatf.utils.data.transformation.dataset_row_masking` function.

    Returns
    -------
    is_valid : boolean
        ``True`` when every check passes; a failed check raises an exception
        instead of returning ``False``.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- text, '
                        'numbers or mixture of the two.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data row must either be a '
                                  '1-dimensional numpy array or a numpy void '
                                  'object for structured rows.')

    # For structured arrays this dtype comparison also covers the number of
    # columns
    dtypes_agree = fuav.are_similar_dtype_arrays(
        dataset, np.array([data_row]), strict_comparison=False)
    if not dtypes_agree:
        raise TypeError('The dtype of the data row is too different from the '
                        'dtype of the dataset provided.')

    # Because the dtypes agree, the row and the data set are either both
    # structured or both plain; only the plain case needs a length check
    length_differs = (not fuav.is_structured_array(dataset)
                      and dataset.shape[1] != data_row.shape[0])
    if length_differs:
        raise IncorrectShapeError('The data row must contain the same '
                                  'number of elements as the number of '
                                  'columns in the provided dataset.')

    return True
コード例 #9
0
    def discretise(
            self, dataset: Union[np.ndarray,
                                 np.void]) -> Union[np.ndarray, np.void]:
        """
        Discretises numerical features of the ``dataset`` into bins.

        Features listed in ``self.categorical_indices`` are copied over
        unchanged. Every feature with an entry in
        ``self.feature_bin_boundaries`` is replaced with the bin index that
        ``numpy.searchsorted`` assigns to its values given the stored
        boundaries.

        Parameters
        ----------
        dataset : Union[numpy.ndarray, numpy.void]
            A data point (1-D) or an array (2-D) of data points to be
            discretised.

        Raises
        ------
        IncorrectShapeError
            The input ``dataset`` is neither 1- nor 2-dimensional numpy array.
            The number of features (columns) in the input ``dataset`` is
            different than the number of features in the dataset used to
            initialise this object.
        TypeError
            The dtype of the input ``dataset`` is too different from the dtype
            of the dataset used to initialise this object.

        Returns
        -------
        discretised_data : Union[numpy.ndarray, numpy.void]
            A discretised data array.
        """
        self._validate_input_discretise(dataset)

        def feature_key(feature):
            # Structured data and 1-D rows are indexed by the feature alone;
            # a classic 2-D array needs the whole column of that feature.
            if self.is_structured or fuav.is_1d_array(dataset):
                return feature
            return (slice(None), feature)

        is_structured_row = self.is_structured and fuav.is_1d_like(dataset)
        if is_structured_row:
            discretised_dataset = dataset.copy().astype(self.discretised_dtype)
        else:
            discretised_dataset = np.zeros_like(dataset,
                                                dtype=self.discretised_dtype)

        # Categorical features are transferred as they are
        for feature in self.categorical_indices:
            key = feature_key(feature)
            discretised_dataset[key] = dataset[key]

        # Numerical features are mapped to the index of their bin
        for feature, boundaries in self.feature_bin_boundaries.items():
            key = feature_key(feature)
            discretised_dataset[key] = np.searchsorted(boundaries,
                                                       dataset[key])

        return discretised_dataset
コード例 #10
0
def describe_array(
        array: np.ndarray,
        include: Optional[Union[str, int, List[Union[str, int]]]] = None,
        exclude: Optional[Union[str, int, List[Union[str, int]]]] = None,
        **kwargs: bool
) -> Dict[Union[str, int],
          Union[str, int, float, bool, np.ndarray,
                Dict[str, Union[str, int, float, bool, np.ndarray]]]
          ]:  # yapf: disable
    """
    Describes numerical and categorical (textual) columns of the input array.

    Numerical columns are described with the
    :func:`fatf.transparency.data.describe_functions.describe_numerical_array`
    function and categorical columns with the
    :func:`fatf.transparency.data.describe_functions.\
describe_categorical_array` function; see their documentation for the details
    of each description.

    The ``include`` and ``exclude`` parameters select which columns get
    described. Each of them can be a list of column indices, a single index
    (a string or an integer), or one of the keywords ``'numerical'`` or
    ``'categorical'`` that selects all columns of the given kind. By default
    every column is described.

    Parameters
    ----------
    array : numpy.ndarray
        The array to be described.
    include : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be included in the description. If
        ``None`` (the default value), all of the columns are included.
        Alternatively, a single index (either a string or an integer) to
        describe just that one column, or ``'numerical'`` or
        ``'categorical'`` to include only numerical or only categorical
        columns respectively.
    exclude : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be excluded from the description. If
        ``None`` (the default value), none of the columns are excluded.
        Alternatively, a single index (either a string or an integer) to
        exclude just that one column, or ``'numerical'`` or
        ``'categorical'`` to exclude either all numerical or all categorical
        columns respectively.
    **kwargs : bool
        Keyword arguments that are passed to the :func:`fatf.transparency.\
data.describe_functions.describe_numerical_array` function responsible for
        describing numerical arrays.

    Warns
    -----
    UserWarning
        The ``include`` or ``exclude`` parameter was given for a
        1-dimensional input array (in which case both are ignored).

    Raises
    ------
    IncorrectShapeError
        The input array is neither 1- nor 2-dimensional.
    RuntimeError
        None of the columns were selected to be described.
    ValueError
        The input array is not of a base type (textual and numerical elements).
        The input array has 0 columns.

    Returns
    -------
    description : Dict[Union[str, int], Dict[str, \
Union[str, int, float, bool, np.ndarray]]]
        For 2-dimensional arrays a dictionary describing every selected column
        under a key corresponding to its index in the input array. For a
        1-dimensional input array a dictionary describing that array.
    """
    # pylint: disable=too-many-locals,too-many-branches
    is_1d = fuav.is_1d_like(array)
    if is_1d:
        # A structured row is flattened so it can be described as one array
        array = fuat.as_unstructured(array)
    is_2d = False if is_1d else fuav.is_2d_array(array)

    if not (is_1d or is_2d):
        raise IncorrectShapeError('The input array should be 1- or '
                                  '2-dimensional.')

    if not fuav.is_base_array(array):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    if is_1d:
        if not (include is None and exclude is None):
            warnings.warn(
                'The input array is 1-dimensional. Ignoring include and '
                'exclude parameters.',
                category=UserWarning)

        if fuav.is_numerical_array(array):
            description = describe_numerical_array(array, **kwargs)
        elif fuav.is_textual_array(array):
            description = describe_categorical_array(array)
        else:  # pragma: no cover
            assert False, 'A base array should either be numerical or textual.'
    elif is_2d:
        numerical_indices, categorical_indices = fuat.indices_by_type(array)
        is_structured_array = fuav.is_structured_array(array)

        columns_number = (
            numerical_indices.shape[0] + categorical_indices.shape[0])
        if columns_number == 0:
            raise ValueError('The input array cannot have 0 columns.')

        numerical_indices_set = set(numerical_indices)
        categorical_indices_set = set(categorical_indices)
        all_indices = categorical_indices_set.union(numerical_indices_set)

        # Keep only the indices that are to be included
        categorical_indices_set, numerical_indices_set = (
            _filter_include_indices(categorical_indices_set,
                                    numerical_indices_set, include,
                                    all_indices))

        # Drop the indices that are to be excluded
        categorical_indices_set, numerical_indices_set = (
            _filter_exclude_indices(categorical_indices_set,
                                    numerical_indices_set, exclude,
                                    all_indices))

        all_indices = numerical_indices_set.union(categorical_indices_set)
        if not all_indices:
            raise RuntimeError('None of the columns were selected to be '
                               'described.')

        # Structured arrays are indexed by field name, classic arrays by
        # column position
        description = {}
        for idx in numerical_indices_set:
            column = array[idx] if is_structured_array else array[:, idx]
            description[idx] = describe_numerical_array(  # type: ignore
                column, **kwargs)
        for idx in categorical_indices_set:
            column = array[idx] if is_structured_array else array[:, idx]
            description[idx] = describe_categorical_array(  # type: ignore
                column)
    else:  # pragma: no cover
        assert False, 'The input array can only be 1- or 2-dimensional.'

    return description  # type: ignore
コード例 #11
0
def _validate_input_local_fidelity(
        dataset: np.ndarray, data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Union[int, None],
        explained_feature_indices: Union[List[IndexType], None],
        fidelity_radius_percentage: int, samples_number: int) -> bool:
    """
    Checks the input parameters of the ``local_fidelity_score`` function.

    The validated parameters belong to the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function; its documentation describes them together with the errors and
    exceptions raised here. Note that the global predictive function is
    called once, on the first row of the ``dataset``, to find out whether it
    outputs a 1- or 2-dimensional prediction.

    Returns
    -------
    is_input_ok : boolean
        ``True`` when every check passes; a failed check raises an exception
        instead of returning ``False``.
    """
    # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- numbers '
                        'and/or strings.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data_row must either be a '
                                  '1-dimensional numpy array or a numpy '
                                  'void object for structured data rows.')

    dtypes_agree = fuav.are_similar_dtype_arrays(dataset,
                                                 np.array([data_row]))
    if not dtypes_agree:
        raise TypeError('The dtype of the data_row is too different from '
                        'the dtype of the dataset array.')

    # A structured data_row with a different number of features is already
    # rejected by the dtype comparison above; classic numpy arrays need an
    # explicit length check.
    length_differs = (not fuav.is_structured_array(dataset)
                      and dataset.shape[1] != data_row.shape[0])
    if length_differs:
        raise IncorrectShapeError('The data_row must contain the same '
                                  'number of features as the dataset.')

    # Global predictive function
    if not callable(global_predictive_function):
        raise TypeError('The global_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    global_params_n = fuv.get_required_parameters_number(
        global_predictive_function)
    if global_params_n != 1:
        raise IncompatibleModelError(
            'The global predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # Local predictive function
    if not callable(local_predictive_function):
        raise TypeError('The local_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    local_params_n = fuv.get_required_parameters_number(
        local_predictive_function)
    if local_params_n != 1:
        raise IncompatibleModelError(
            'The local predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # Metric function
    if not callable(metric_function):
        raise TypeError('The metric_function should be a Python callable, '
                        'e.g., a Python function.')
    if fuv.get_required_parameters_number(metric_function) != 2:
        raise TypeError('The metric_function must take exactly *two* '
                        'required parameters.')

    # Explained class index -- the shape of a single prediction tells whether
    # the global model is probabilistic (2-D output) or not (1-D output)
    global_prediction = global_predictive_function(dataset[:1])
    assert not fuav.is_structured_array(global_prediction), 'Must be plain.'
    assert global_prediction.shape[0] == 1, 'Just 1 data point was predicted.'
    if fuav.is_2d_array(global_prediction):  # A probabilistic model.
        if explained_class_index is not None:
            if not isinstance(explained_class_index, int):
                raise TypeError('For probabilistic global models, i.e., '
                                'global predictive functions, the '
                                'explained_class_index parameter has to be an '
                                'integer or None.')
            classes_number = global_prediction.shape[1]
            if not 0 <= explained_class_index < classes_number:
                raise ValueError('The explained_class_index parameter is '
                                 'negative or larger than the number of '
                                 'classes output by the global '
                                 'probabilistic model.')
    elif fuav.is_1d_array(global_prediction):
        if explained_class_index is not None:
            warnings.warn(
                'The explained_class_index parameter is not None and will be '
                'ignored since the global model is not probabilistic.',
                UserWarning)
    else:
        assert False, ('Global predictor must output a 1- or 2-dimensional '
                       'numpy array.')  # pragma: nocover

    # Explained feature indices
    if explained_feature_indices is not None:
        if not isinstance(explained_feature_indices, list):
            raise TypeError('The explained_feature_indices parameter must be '
                            'a Python list or None.')
        invalid_indices = fuat.get_invalid_indices(
            dataset, np.asarray(explained_feature_indices))
        if invalid_indices.size:
            raise IndexError(
                'The following column indices are invalid for the input '
                'dataset: {}.'.format(invalid_indices))

    # Fidelity radius percentage
    if not isinstance(fidelity_radius_percentage, int):
        raise TypeError('The fidelity_radius_percentage must be an integer '
                        'between 1 and 100.')
    if fidelity_radius_percentage <= 0 or fidelity_radius_percentage > 100:
        raise ValueError('The fidelity_radius_percentage must be an '
                         'integer between 1 and 100.')

    # Samples number
    if not isinstance(samples_number, int):
        raise TypeError('The samples_number must be an integer.')
    if samples_number < 1:
        raise ValueError('The samples_number must be a positive integer.')

    return True
コード例 #12
0
ファイル: augmentation.py プロジェクト: enrsr/fat-forensics-1
    def _validate_sample_input(self, data_row: Union[None, np.ndarray,
                                                     np.void],
                               samples_number: int) -> bool:
        """
        Checks the input parameters of the ``sample`` method.

        The ``data_row`` is only inspected when it is not ``None``; the
        ``samples_number`` is always checked.

        Raises
        ------
        IncorrectShapeError
            The ``data_row`` is not a 1-dimensional numpy array-like object.
            The number of features (columns) in the ``data_row`` is different
            to the number of features in the data array used to initialise this
            object.
        TypeError
            The dtype of the ``data_row`` is different than the dtype of the
            data array used to initialise this object. The ``samples_number``
            parameter is not an integer.
        ValueError
            The ``samples_number`` parameter is not a positive integer.

        Returns
        -------
        is_valid : boolean
            ``True`` when every check passes; a failed check raises an
            exception instead of returning ``False``.
        """
        if data_row is not None:
            if not fuav.is_1d_like(data_row):
                raise IncorrectShapeError('The data_row must either be a '
                                          '1-dimensional numpy array or numpy '
                                          'void object for structured rows.')

            dtypes_agree = fuav.are_similar_dtype_arrays(
                self.dataset, np.array([data_row]), strict_comparison=True)
            if not dtypes_agree:
                raise TypeError('The dtype of the data_row is different to '
                                'the dtype of the data array used to '
                                'initialise this class.')

            # A structured data_row with a different number of features is
            # already rejected by the dtype comparison above; classic numpy
            # arrays need an explicit length check.
            length_differs = (not self.is_structured
                              and data_row.shape[0] != self.dataset.shape[1])
            if length_differs:
                raise IncorrectShapeError('The data_row must contain the '
                                          'same number of features as the '
                                          'dataset used to initialise '
                                          'this class.')

        # The type is verified first so that the comparison below is safe
        if not isinstance(samples_number, int):
            raise TypeError('The samples_number parameter must be an integer.')
        if samples_number < 1:
            raise ValueError('The samples_number parameter must be a '
                             'positive integer.')

        return True
コード例 #13
0
ファイル: distances.py プロジェクト: enrsr/fat-forensics-1
def get_point_distance(
        data_array: np.ndarray, data_point: Union[np.ndarray, np.void],
        distance_function: Callable[[np.ndarray, np.ndarray], float]
) -> np.ndarray:
    """
    Calculates how far a single data point lies from each row of a data set.

    The ``distance_function`` is evaluated once per row of the ``data_array``
    with that row and the ``data_point`` as its two arguments. Validation of
    the ``data_array`` and the ``distance_function`` is delegated to the
    ``_validate_get_distance`` helper.

    Parameters
    ----------
    data_array : numpy.ndarray
        A 2-dimensional numpy array holding the rows to measure against.
    data_point : Union[numpy.ndarray, numpy.void]
        A 1-dimensional numpy array, or a numpy void for structured data,
        whose distance to every row of the ``data_array`` is requested.
    distance_function : Callable[[numpy.ndarray, numpy.ndarray], number]
        A Python callable accepting two 1-dimensional numpy arrays of equal
        length and returning a number interpreted as their distance. **The
        callable is assumed to be symmetric, i.e. the order of its two
        arguments must not influence the result.**

    Raises
    ------
    AttributeError
        The distance function does not require exactly two parameters.
    IncorrectShapeError
        The data array is not a 2-dimensional numpy array. The data point is
        not 1-dimensional. The data point has a different number of elements
        than the data array has columns.
    TypeError
        The data array or the data point is not of a base type (numbers and/or
        strings). The dtypes of the data point and the data array cannot be
        compared. The distance function is not a Python callable (function).

    Returns
    -------
    distances : numpy.ndarray
        A 1-dimensional numpy array where the i-th element is the distance
        between the ``data_point`` and the i-th row of the ``data_array``.
    """
    assert _validate_get_distance(data_array,
                                  distance_function), 'Invalid input.'

    structured = fuav.is_structured_array(data_array)

    if not fuav.is_1d_like(data_point):
        raise IncorrectShapeError('The data point has to be 1-dimensional '
                                  'numpy array or numpy void (for structured '
                                  'arrays).')

    # A single-row array built from the point, used for the type and width
    # comparisons against the data set.
    point_as_rows = np.asarray([data_point])
    if not fuav.is_base_array(point_as_rows):
        raise TypeError('The data point has to be of a base type (strings '
                        'and/or numbers).')
    if not fuav.are_similar_dtype_arrays(data_array, point_as_rows):
        raise TypeError('The dtypes of the data set and the data point are '
                        'too different.')

    if structured:
        # A mismatch in the number of columns of a structured array is
        # already picked up by the dtype comparison above, so no explicit
        # width check is needed here. Rows are processed one by one.
        distances = np.zeros((data_array.shape[0], ), dtype=np.float64)
        for position, row in enumerate(data_array):
            distances[position] = distance_function(row, data_point)
        return distances

    # Classic arrays need an explicit check of the number of columns.
    if data_array.shape[1] != point_as_rows.shape[1]:
        raise IncorrectShapeError('The data point has different number of '
                                  'columns (features) than the data set.')

    return np.apply_along_axis(distance_function, 1, data_array, data_point)
コード例 #14
0
    def explain_instance(
            self, instance: np.ndarray, **kwargs: Any
    ) -> Union[Dict[str, Tuple[str, float]], List[Tuple[str, float]]]:
        """
        Produces a LIME tabular explanation of a single data point.

        This is a thin wrapper around the ``explain_instance`` method_ of the
        LIME tabular explainer object held by this class.

        .. warning::
            Unlike the original LIME tabular explainer, for classification
            tasks this wrapper explains **all** of the classes unless the
            ``labels`` parameter is given explicitly.

        Named parameters given when this object was initialised are used as
        defaults; the same parameter passed to this method overrides the
        stored value.

        The predictive function is chosen with the following priority:

        - the ``predict_fn`` named parameter (local or stored),

        - the model held by this object -- its ``predict_proba`` method for
          classification and its ``predict`` method otherwise.

        .. _method: https://lime-ml.readthedocs.io/en/latest/lime.html
           #lime.lime_tabular.LimeTabularExplainer.explain_instance

        Parameters
        ----------
        instance : numpy.ndarray
            A 1-dimensional data point (numpy array) to be explained.
        **kwargs : lime.lime_tabular.LimeTabularExplainer.explain_instance
            Optional parameters of the LIME tabular explainer's
            ``explain_instance`` method.

        Raises
        ------
        AttributeError
            At least one of the named parameters is not accepted by the
            ``explain_instance`` method of the LIME tabular explainer.
        IncorrectShapeError
            The ``instance`` is not 1-dimensional.
        RuntimeError
            No predictive function can be determined: neither a
            ``predict_fn`` parameter nor a model is available, or the task is
            classification and the model is not probabilistic.
        ValueError
            The ``instance`` is not purely numerical.

        Returns
        -------
        explanation : dictionary or list
            For classification, a dictionary keyed by class names whose
            values are the explanations of the respective class given as
            (string, float) tuples -- a feature description and its
            importance. For regression, a single list of such tuples.
        """
        # pylint: disable=too-many-locals,too-many-branches
        unknown_params = set(kwargs).difference(self._EXPLAIN_INSTANCE_PARAMS)
        if unknown_params:
            raise AttributeError('The following named parameters are not '
                                 'valid: {}.'.format(unknown_params))

        if not fuav.is_1d_like(instance):
            raise IncorrectShapeError('The instance to be explained should be '
                                      '1-dimensional.')
        instance = fuat.as_unstructured(instance)
        if not fuav.is_numerical_array(instance):
            raise ValueError('The instance to be explained should be purely '
                             'numerical -- LIME does not support categorical '
                             'features.')

        # Start from the parameters stored in the object and let the ones
        # given to this call take precedence.
        call_params = dict(self.explain_instance_params)
        call_params.update({
            name: kwargs[name]
            for name in self._EXPLAIN_INSTANCE_PARAMS if name in kwargs
        })

        # An explicit predictive function wins over the model; it is removed
        # from the named parameters as it is passed to LIME positionally.
        if 'predict_fn' in call_params:
            predictor = call_params.pop('predict_fn')
        elif self.model is None:
            raise RuntimeError('A predictive function is not available.')
        elif self.mode != 'classification':
            predictor = self.model.predict  # type: ignore
        elif self.model_is_probabilistic:
            predictor = self.model.predict_proba  # type: ignore
        else:
            raise RuntimeError('The predictive model is not '
                               'probabilistic. Please specify a '
                               'predictive function instead.')

        # Explain every class by default for classification tasks.
        if 'labels' not in call_params and self.mode == 'classification':
            # The number of classes is not stored anywhere, so it is read off
            # the width of a prediction made for the explained instance.
            prediction = predictor(np.array([instance]))
            call_params['labels'] = range(prediction.shape[1])

        lime_explanation = self.tabular_explainer.explain_instance(
            instance, predictor, **call_params)

        if self.mode != 'classification':
            return lime_explanation.as_list()

        per_class_explanation = {}
        for label in lime_explanation.available_labels():
            name_of_class = lime_explanation.class_names[label]
            features_importance = lime_explanation.as_list(label=label)
            per_class_explanation[name_of_class] = features_importance
        return per_class_explanation
コード例 #15
0
def describe_numerical_array(
        array: Union[np.ndarray, np.void],
        skip_nans: bool = True) -> Dict[str, Union[int, float, np.ndarray]]:
    """
    Describes a numerical numpy array with basic statistics.

    If the ``skip_nans`` parameter is set to ``True``, any ``numpy.nan``
    present in the input array is skipped for calculating the statistics.
    Otherwise, they are included, affecting most of the statistics and possibly
    equating them to ``numpy.nan``.

    The description output by this function is a dictionary with the
    following keys:

    ``count`` : integer
        The number of elements in the array. ``numpy.nan`` values are
        **always** included in this count, even if ``skip_nans`` is ``True``.

    ``mean`` : float
        The *mean* (average) value of the array.

    ``std`` : float
        The *standard deviation* of the array.

    ``min`` : float
        The *minimum value* in the array.

    ``25%`` : float
        The *25 percentile* of the array.

    ``50%`` : float
        The *50 percentile* of the array, which is equivalent to its
        **median**.

    ``75%`` : float
        The *75 percentile* of the array.

    ``max`` : float
        The *maximum value* in the array.

    ``nan_count`` : integer
        The count of ``numpy.nan`` (not-a-number) values in the array.

    Parameters
    ----------
    array : Union[numpy.ndarray, numpy.void]
        An array for which a description is desired.
    skip_nans : boolean, optional (default=True)
        If set to ``True``, ``numpy.nan``\\ s present in the input array will
        be excluded while computing the statistics.

    Raises
    ------
    IncorrectShapeError
        The input array is not 1-dimensional.
    ValueError
        The input array is not purely numerical or it is empty. The input
        array holds nothing but ``numpy.nan`` values and ``skip_nans`` is
        ``True``, which leaves no values to compute the statistics from.

    Returns
    -------
    numerical_description : Dict[string, Union[integer, float, numpy.ndarray]]
        A dictionary describing the numerical input array.
    """
    if not fuav.is_1d_like(array):
        raise IncorrectShapeError('The input array should be 1-dimensional.')

    classic_array = fuat.as_unstructured(array)
    assert len(classic_array.shape) == 1, '1D arrays only at this point.'

    if not classic_array.shape[0]:
        raise ValueError('The input array cannot be empty.')
    if not fuav.is_numerical_array(classic_array):
        raise ValueError('The input array should be purely numerical.')

    # Both counts are taken before any filtering so that they always refer
    # to the array as it was passed in.
    nan_indices = np.isnan(classic_array)
    n_elements = classic_array.shape[0]

    if skip_nans:
        classic_array = classic_array[~nan_indices]
        # Without this guard an all-NaN input reaches the reductions below as
        # an empty array: numpy warns for the mean and std and then fails
        # with an unrelated-looking zero-size reduction error for the min.
        if not classic_array.shape[0]:
            raise ValueError('The input array contains only numpy.nan '
                             'values, hence there is nothing to describe '
                             'when skip_nans is set to True.')

    # One call computes all three percentiles.
    percentile_25, percentile_50, percentile_75 = np.percentile(
        classic_array, [25, 50, 75])

    numerical_description = {
        'count': n_elements,
        'mean': np.mean(classic_array),
        'std': np.std(classic_array),
        'min': np.min(classic_array),
        '25%': percentile_25,
        '50%': percentile_50,
        '75%': percentile_75,
        'max': np.max(classic_array),
        'nan_count': nan_indices.sum()
    }

    return numerical_description
コード例 #16
0
def describe_categorical_array(
    array: Union[np.ndarray, np.void]
) -> Dict[str, Union[str, int, bool, np.ndarray]]:
    """
    Describes a categorical numpy array with basic statistics.

    The description output by this function is a dictionary with the
    following keys:

    ``count`` : integer
        The number of elements in the array.

    ``unique`` : numpy.ndarray
        The unique values in the array, ordered lexicographically.

    ``unique_counts`` : numpy.ndarray
        The counts of the unique values in the array, in the same order as
        the ``unique`` values.

    ``top`` : string
        The most frequent value in the array. If several values share the
        highest count, the one that comes first in ``unique`` is reported.

    ``freq`` : integer
        The count of the most frequent value in the array.

    ``is_top_unique`` : boolean
        Indicates whether the most frequent value (``freq``) in the array is
        the only one with that count.

    Parameters
    ----------
    array : Union[numpy.ndarray, numpy.void]
        An array for which a description is desired.

    Raises
    ------
    IncorrectShapeError
        The input array is not 1-dimensional.
    ValueError
        The input array is empty.

    Warns
    -----
    UserWarning
        When the input array is not purely textual it needs to be converted to
        a string type before it can be described.

    Returns
    -------
    categorical_description : Dict[string, Union[string, integer, \
boolean, numpy.ndarray]]
        A dictionary describing the categorical input array.
    """
    if not fuav.is_1d_like(array):
        raise IncorrectShapeError('The input array should be 1-dimensional.')

    classic_array = fuat.as_unstructured(array)
    assert len(classic_array.shape) == 1, '1D arrays only at this point.'

    if not classic_array.shape[0]:
        raise ValueError('The input array cannot be empty.')
    if not fuav.is_textual_array(classic_array):
        warnings.warn(
            'The input array is not purely categorical. Converting the input '
            'array into a textual type to facilitate a categorical '
            'description.',
            category=UserWarning)
        classic_array = classic_array.astype(str)

    # numpy.unique returns the unique values already sorted, with the counts
    # aligned to them, so no additional sorting is required.
    unique, unique_counts = np.unique(classic_array, return_counts=True)

    # numpy.argmax reports the first maximum, which resolves ties in favour
    # of the value that is first in the sorted order.
    top_index = np.argmax(unique_counts)

    top = unique[top_index]
    freq = unique_counts[top_index]

    # The top value is unique when no other value reaches the same count.
    is_top_unique = (unique_counts == freq).sum() < 2

    categorical_description = {
        'count': classic_array.shape[0],
        'unique': unique,
        'unique_counts': unique_counts,
        'top': top,
        'freq': freq,
        'is_top_unique': is_top_unique
    }

    return categorical_description
コード例 #17
0
    def _validate_input_discretise(
            self, dataset: Union[np.ndarray, np.void]) -> bool:
        """
        Checks whether ``dataset`` can be passed to the ``discretise`` method.

        The ``dataset`` has to be a 1-D data point or a 2-D data set whose
        dtype is *similar* (non-strict comparison) to the dtype of the data
        this object was initialised with. For classic (unstructured) data the
        number of features has to agree as well.

        Parameters
        ----------
        dataset : Union[numpy.ndarray, numpy.void]
            A data point (1-D array) or a data set (2-D array) to be
            discretised.

        Raises
        ------
        IncorrectShapeError
            The ``dataset`` is neither a 1- nor a 2-dimensional numpy array.
            The number of features (columns) in the ``dataset`` differs from
            the number of features in the data used to initialise this
            object.
        TypeError
            The dtype of the ``dataset`` is too different from the dtype of
            the data used to initialise this object.

        Returns
        -------
        is_valid : boolean
            ``True`` when all of the checks pass; every failed check raises
            an exception instead of returning ``False``.
        """
        is_data_point = fuav.is_1d_like(dataset)
        if not is_data_point and not fuav.is_2d_array(dataset):
            raise IncorrectShapeError('The dataset must be either a '
                                      '1-dimensional (a plane numpy array or '
                                      'numpy void for structured '
                                      '1-dimensional arrays) or a '
                                      '2-dimensional array.')

        # An empty array with the stored dtype serves as the reference for
        # the dtype comparison.
        reference = np.empty((0, ), dtype=self.dataset_dtype)
        candidate = np.array(dataset)
        if not fuav.are_similar_dtype_arrays(
                reference, candidate, strict_comparison=False):
            raise TypeError('The dtype of the input dataset is too different '
                            'from the dtype of the dataset used to initialise '
                            'this class.')

        # For structured arrays the number of features is part of the dtype,
        # hence it has already been compared above.
        if not self.is_structured:
            # Features lie along the only axis of a data point and along the
            # second axis of a data set.
            features_axis = 0 if is_data_point else 1
            if dataset.shape[features_axis] != self.features_number:
                raise IncorrectShapeError('The input dataset must contain the '
                                          'same number of features as the '
                                          'dataset used to initialise this '
                                          'class.')

        return True