Example #1
0
def _validate_input_dc(data_set: np.ndarray,
                       categorical_indices: Union[None, List[Index]],
                       neighbours: int,
                       distance_function: Union[None, DistanceFunction],
                       normalise_scores: bool) -> bool:
    """
    Checks the parameters given to the ``DensityCheck`` class initialiser.

    The checks are executed in the order in which the parameters are listed
    below and the first violated check raises its exception.

    Parameters
    ----------
    data_set : numpy.ndarray
        A 2-dimensional numpy array (either classic or structured) of a base
        type.
    categorical_indices : Union[None, List[column index]],
        Either ``None`` or a list of column indices to be treated as
        categorical.
    neighbours : integer
        The number of closest neighbours to be considered.
    distance_function : Union[None, Callable[[data row, data row], number]]
        Either ``None`` or a Python function that calculates a distance between
        two data points. This function takes as an input two 1-dimensional
        numpy arrays (for classic numpy arrays) or numpy voids (for structured
        numpy arrays) of equal length and outputs a number representing a
        distance between them. **The distance function is assumed to return the
        same distance regardless of the order in which the input parameters are
        given.**
    normalise_scores : boolean
        A boolean parameter indicating whether to normalise the scores
        (``True``) or not (``False``).

    Raises
    ------
    AttributeError
        The distance function does not require exactly 2 non-optional
        parameters.
    IncorrectShapeError
        The ``data_set`` array is not 2-dimensional.
    IndexError
        Some of the provided categorical column indices are invalid for the
        ``data_set`` array.
    TypeError
        The ``data_set`` array is not of a base type (strings and/or numbers).
        The ``neighbours`` parameter is not an integer. The
        ``distance_function`` is neither ``None`` nor Python callable (a
        function). The ``normalise_scores`` parameter is not a boolean. The
        ``categorical_indices`` parameter is not a Python list.
    ValueError
        The ``neighbours`` parameter is smaller than 1 or larger than the
        number of instances (rows) in the ``data_set`` array.

    Returns
    -------
    is_valid : boolean
        ``True`` when every check passes; a failed check raises one of the
        exceptions listed above instead of returning.
    """
    # pylint: disable=too-many-branches
    # --- data set ---
    if not fuav.is_2d_array(data_set):
        raise IncorrectShapeError('The data set should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(data_set):
        raise TypeError('The data set is not of a base type (numbers and/or '
                        'strings.')

    # --- categorical indices ---
    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            data_set, np.asarray(categorical_indices)).tolist()
        if bad_indices:
            raise IndexError('The following indices are invalid for the '
                             'input data set: {}.'.format(bad_indices))

    # --- neighbours ---
    if not isinstance(neighbours, int):
        raise TypeError('The neighbours number parameter has to be an '
                        'integer.')
    if not 1 <= neighbours <= data_set.shape[0]:
        raise ValueError('The neighbours number parameter has to be '
                         'between 1 and number of data points (rows) in '
                         'the data set array.')

    # --- distance function ---
    if distance_function is not None:
        if not callable(distance_function):
            raise TypeError('The distance function should be a Python '
                            '(function).')
        # A parameter counts as required when it declares no default value.
        signature = inspect.signature(distance_function)
        required_count = sum(
            1 for parameter in signature.parameters.values()
            if parameter.default is parameter.empty)
        if required_count != 2:
            raise AttributeError('The distance function must require '
                                 'exactly 2 parameters. Given function '
                                 'requires {} '
                                 'parameters.'.format(required_count))

    # --- normalise scores ---
    if not isinstance(normalise_scores, bool):
        raise TypeError('The normalise scores parameter should be a boolean.')

    return True
Example #2
0
def _validate_input(dataset: np.ndarray,
                    ground_truth: Optional[np.ndarray] = None,
                    categorical_indices: Optional[List[Index]] = None,
                    int_to_float: bool = True) -> bool:
    """
    Checks the input parameters of an arbitrary augmentation class.

    The parameters are inspected in the order ``dataset``, ``ground_truth``,
    ``categorical_indices``, ``int_to_float``; the first failed check raises.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be used for sampling.
    ground_truth : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with labels for the supplied dataset.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    int_to_float : boolean, optional (default=True)
        If ``True``, all of the integer dtype columns in the ``dataset`` will
        be generalised to ``numpy.float64`` type. Otherwise, integer type
        columns will remain integer and floating point type columns will remain
        floating point.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The
        ``ground_truth`` array is not a 1-dimensional numpy array. The number
        of ground truth annotations is different than the number of rows in
        the data array.
    IndexError
        Some of the column indices given in the ``categorical_indices``
        parameter are not valid for the input ``dataset``.
    TypeError
        The ``categorical_indices`` parameter is neither a list nor ``None``.
        The ``dataset`` or the ``ground_truth`` array (if not ``None``) are not
        of base (numerical and/or string) type. The ``int_to_float`` parameter
        is not a boolean.

    Returns
    -------
    is_valid : boolean
        ``True`` when every check passes; a failed check raises one of the
        exceptions listed above instead of returning.
    """
    # --- dataset ---
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    # --- ground truth (optional) ---
    if ground_truth is not None:
        if not fuav.is_1d_array(ground_truth):
            raise IncorrectShapeError('The ground_truth array must be '
                                      '1-dimensional. (Or None if it is not '
                                      'required.)')
        if not fuav.is_base_array(ground_truth):
            raise TypeError('The ground_truth array must be of a base type.')
        labels_number = ground_truth.shape[0]
        rows_number = dataset.shape[0]
        if labels_number != rows_number:
            raise IncorrectShapeError('The number of labels in the '
                                      'ground_truth array is not equal to the '
                                      'number of data points in the dataset '
                                      'array.')

    # --- categorical indices (optional) ---
    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(categorical_indices))
        if bad_indices.size:
            raise IndexError('The following indices are invalid for the '
                             'input dataset: {}.'.format(bad_indices))

    # --- int_to_float flag ---
    if not isinstance(int_to_float, bool):
        raise TypeError('The int_to_float parameter has to be a boolean.')

    return True
Example #3
0
def systemic_bias(dataset: np.ndarray, ground_truth: np.ndarray,
                  protected_features: List[Index]) -> np.ndarray:
    """
    Checks for systemic bias in a dataset.

    This function checks whether there exist data points that share the same
    unprotected features but differ in protected features. For all of these
    instances their label (ground truth) will be checked and if it is
    different, a particular data points pair will be indicated to be biased.
    This dependency is represented as a boolean, square numpy array that shows
    whether systemic bias exists (``True``) for any pair of data points.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be evaluated for systemic bias.
    ground_truth : numpy.ndarray
        The labels corresponding to the dataset.
    protected_features : List[column index]
        A list of column indices in the dataset that hold protected attributes.

    Raises
    ------
    IncorrectShapeError
        The dataset is not a 2-dimensional numpy array, the ground truth is not
        a 1-dimensional numpy array or the number of rows in the dataset is not
        equal to the number of elements in the ground truth array.
    IndexError
        Some of the column indices given in the ``protected_features`` list are
        not valid for the input dataset.
    TypeError
        The ``protected_features`` parameter is not a list.
    ValueError
        There are duplicate values in the protected feature indices list.

    Returns
    -------
    systemic_bias_matrix : numpy.ndarray
        A square, diagonally symmetrical and boolean numpy array that indicates
        which pair of data point share the same unprotected features but differ
        in protected features and the ground truth annotation.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The dataset should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth should be a 1-dimensional '
                                  'numpy array.')
    if ground_truth.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of rows in the dataset and the '
                                  'ground truth should be equal.')
    if isinstance(protected_features, list):
        pfa = np.asarray(protected_features)
        if not fuat.are_indices_valid(dataset, pfa):
            iid = np.sort(fuat.get_invalid_indices(dataset, pfa)).tolist()
            raise IndexError('The following protected feature indices are not '
                             'valid for the dataset array: {}.'.format(iid))
        if len(set(protected_features)) != len(protected_features):
            raise ValueError('Some of the protected indices are duplicated.')
    else:
        raise TypeError('The protected_features parameter should be a list.')

    is_structured = fuav.is_structured_array(dataset)

    # Strip the protected columns. If nothing is left, substitute a constant
    # column so that every pair of rows compares as "equal unprotected".
    if is_structured:
        unprotected_features_array = recfn.drop_fields(dataset,
                                                       protected_features)
        if unprotected_features_array is None:
            unprotected_features_array = np.ones((dataset.shape[0], ),
                                                 dtype=[('ones', int)])
    else:
        unprotected_features_array = np.delete(dataset,
                                               protected_features,
                                               axis=1)
        if not unprotected_features_array.size:
            unprotected_features_array = np.ones((dataset.shape[0], 1))

    assert unprotected_features_array.shape[0] == dataset.shape[0], \
        'Must share rows number.'

    systemic_bias_columns = []
    for i in range(unprotected_features_array.shape[0]):
        if is_structured:
            equal_unprotected = (
                unprotected_features_array == unprotected_features_array[i])
        else:
            # Broadcast row ``i`` against all rows and require every column to
            # match. This yields the same boolean vector as applying
            # ``np.array_equal`` row by row, without a Python-level call per
            # row.
            equal_unprotected = (
                unprotected_features_array
                == unprotected_features_array[i, :]).all(axis=1)

        equal_unprotected_indices = np.where(equal_unprotected)

        # Check whether the ground truth is different for these rows
        equal_unprotected[equal_unprotected_indices] = (
            ground_truth[i] != ground_truth[equal_unprotected_indices])
        systemic_bias_columns.append(equal_unprotected)

    systemic_bias_matrix = np.stack(systemic_bias_columns, axis=1)
    assert np.array_equal(systemic_bias_matrix, systemic_bias_matrix.T), \
        'The matrix has to be diagonally symmetric.'
    assert not np.diagonal(systemic_bias_matrix).any(), \
        'Same elements cannot be systemically biased.'
    return systemic_bias_matrix
Example #4
0
def _validate_input_local_fidelity(
        dataset: np.ndarray, data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Union[int, None],
        explained_feature_indices: Union[List[IndexType], None],
        fidelity_radius_percentage: int, samples_number: int) -> bool:
    """
    Checks the input parameters of the ``local_fidelity_score`` function.

    The parameters, errors and exceptions are described in the documentation
    of the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function. To inspect the ``explained_class_index`` parameter the global
    predictive function is called once on the first row of the ``dataset``.

    Returns
    -------
    is_input_ok : boolean
        ``True`` when every check passes; a failed check raises an exception
        instead of returning.
    """
    # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
    # --- dataset ---
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- numbers '
                        'and/or strings.')

    # --- data row ---
    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data_row must either be a '
                                  '1-dimensional numpy array or a numpy '
                                  'void object for structured data rows.')
    if not fuav.are_similar_dtype_arrays(dataset, np.array([data_row])):
        raise TypeError('The dtype of the data_row is too different from '
                        'the dtype of the dataset array.')
    # For structured arrays a mismatch in the number of features is already
    # caught by the dtype comparison above; classic arrays need an explicit
    # length comparison.
    is_classic = not fuav.is_structured_array(dataset)
    if is_classic and dataset.shape[1] != data_row.shape[0]:
        raise IncorrectShapeError('The data_row must contain the same '
                                  'number of features as the dataset.')

    # --- global predictive function ---
    if not callable(global_predictive_function):
        raise TypeError('The global_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    if fuv.get_required_parameters_number(global_predictive_function) != 1:
        raise IncompatibleModelError(
            'The global predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # --- local predictive function ---
    if not callable(local_predictive_function):
        raise TypeError('The local_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    if fuv.get_required_parameters_number(local_predictive_function) != 1:
        raise IncompatibleModelError(
            'The local predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # --- metric function ---
    if not callable(metric_function):
        raise TypeError('The metric_function should be a Python callable, '
                        'e.g., a Python function.')
    if fuv.get_required_parameters_number(metric_function) != 2:
        raise TypeError('The metric_function must take exactly *two* '
                        'required parameters.')

    # --- explained class index ---
    # Predict a single data point to learn the shape of the global output.
    sample_prediction = global_predictive_function(dataset[:1])
    assert not fuav.is_structured_array(sample_prediction), 'Must be plain.'
    assert sample_prediction.shape[0] == 1, 'Just 1 data point was predicted.'
    if fuav.is_2d_array(sample_prediction):  # A probabilistic model.
        if explained_class_index is not None:
            if not isinstance(explained_class_index, int):
                raise TypeError('For probabilistic global models, i.e., '
                                'global predictive functions, the '
                                'explained_class_index parameter has to be an '
                                'integer or None.')
            classes_number = sample_prediction.shape[1]
            if not 0 <= explained_class_index < classes_number:
                raise ValueError('The explained_class_index parameter is '
                                 'negative or larger than the number of '
                                 'classes output by the global '
                                 'probabilistic model.')
    elif fuav.is_1d_array(sample_prediction):
        if explained_class_index is not None:
            warnings.warn(
                'The explained_class_index parameter is not None and will be '
                'ignored since the global model is not probabilistic.',
                UserWarning)
    else:
        assert False, ('Global predictor must output a 1- or 2-dimensional '
                       'numpy array.')  # pragma: nocover

    # --- explained feature indices ---
    if explained_feature_indices is not None:
        if not isinstance(explained_feature_indices, list):
            raise TypeError('The explained_feature_indices parameter must be '
                            'a Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(explained_feature_indices))
        if bad_indices.size:
            raise IndexError(
                'The following column indices are invalid for the input '
                'dataset: {}.'.format(bad_indices))

    # --- fidelity radius percentage ---
    if not isinstance(fidelity_radius_percentage, int):
        raise TypeError('The fidelity_radius_percentage must be an integer '
                        'between 1 and 100.')
    if not 0 < fidelity_radius_percentage <= 100:
        raise ValueError('The fidelity_radius_percentage must be an '
                         'integer between 1 and 100.')

    # --- samples number ---
    if not isinstance(samples_number, int):
        raise TypeError('The samples_number must be an integer.')
    if samples_number < 1:
        raise ValueError('The samples_number must be a positive integer.')

    return True
def _validate_input_discretiser(
        dataset: np.ndarray,
        categorical_indices: Optional[List[Index]] = None,
        feature_names: Optional[List[str]] = None) -> bool:
    """
    Checks the input parameters of an arbitrary discretiser class.

    The parameters are inspected in the order ``dataset``,
    ``categorical_indices``, ``feature_names``; the first failed check raises.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be discretised.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    feature_names : List[strings], optional (default=None)
        A list of feature names in order they appear in the ``dataset`` array.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array.
    IndexError
        Some of the column indices given in the ``categorical_indices`` list
        are invalid for the input ``dataset``.
    TypeError
        The ``dataset`` is not of a base (numerical and/or string) type.
        The ``categorical_indices`` is neither a Python list nor ``None``.
        The ``feature_names`` is neither a Python list nor ``None`` or one of
        its elements (if it is a list) is not a string.
    ValueError
        The length of the ``feature_names`` list is different than the number
        of columns (features) in the input ``dataset``.

    Returns
    -------
    is_valid : boolean
        ``True`` when every check passes; a failed check raises one of the
        exceptions listed above instead of returning.
    """
    # pylint: disable=too-many-branches
    # --- dataset ---
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    # --- categorical indices (optional) ---
    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(categorical_indices))
        if bad_indices.size:
            raise IndexError('The following indices are invalid for the '
                             'input dataset: {}.'.format(
                                 bad_indices.tolist()))

    # --- feature names (optional) ---
    if feature_names is not None:
        if not isinstance(feature_names, list):
            raise TypeError('The feature_names parameter must be a Python '
                            'list or None.')

        # Structured arrays keep their columns in the dtype; classic arrays
        # in the second dimension.
        columns_number = (
            len(dataset.dtype.names)
            if fuav.is_structured_array(dataset) else dataset.shape[1])
        if len(feature_names) != columns_number:
            raise ValueError('The length of feature_names list must be '
                             'equal to the number of features (columns) '
                             'in the input dataset.')

        for candidate in feature_names:
            if isinstance(candidate, str):
                continue
            raise TypeError('All of the feature_names must be '
                            'strings. The *{}* feature name is not a '
                            'string.'.format(candidate))

    return True
def test_get_invalid_indices():
    """
    Tests :func:`fatf.utils.array.tools.get_invalid_indices` function.

    The test covers three groups of behaviour: inputs that are not numpy
    arrays (``TypeError``), inputs of a wrong dimensionality
    (``IncorrectShapeError``) and the invalid indices returned for classic and
    structured arrays.
    """
    type_error = 'Input arrays should be numpy array-like objects.'
    incorrect_shape_array = 'The input array should be 2-dimensional.'
    incorrect_shape_indices = 'The indices array should be 1-dimensional.'
    # Wrong type: ``None`` given as the array...
    with pytest.raises(TypeError) as exin:
        fuat.get_invalid_indices(None, np.ones((4, )))
    assert str(exin.value) == type_error
    with pytest.raises(TypeError) as exin:
        fuat.get_invalid_indices(None, np.ones((4, 4)))
    assert str(exin.value) == type_error
    # ...and ``None`` given as the indices (for a 1- and a 2-dimensional array)
    with pytest.raises(TypeError) as exin:
        fuat.get_invalid_indices(np.ones((4, )), None)
    assert str(exin.value) == type_error
    with pytest.raises(TypeError) as exin:
        fuat.get_invalid_indices(np.ones((4, 4)), None)
    assert str(exin.value) == type_error
    # Incorrect shape array
    with pytest.raises(IncorrectShapeError) as exin:
        fuat.get_invalid_indices(np.ones((5, )), np.ones((4, 4)))
    assert str(exin.value) == incorrect_shape_array
    with pytest.raises(IncorrectShapeError) as exin:
        fuat.get_invalid_indices(np.ones((5, )), np.ones((4, )))
    assert str(exin.value) == incorrect_shape_array
    # Incorrect shape indices
    with pytest.raises(IncorrectShapeError) as exin:
        fuat.get_invalid_indices(np.ones((5, 3)), np.ones((4, 4)))
    assert str(exin.value) == incorrect_shape_indices

    # Classic numerical array
    gind = fuat.get_invalid_indices(NUMERICAL_NP_ARRAY, np.array([0, 2]))
    assert np.array_equal(gind, np.array([2]))
    gind = fuat.get_invalid_indices(NUMERICAL_NP_ARRAY, np.array(['a', 1]))
    assert np.array_equal(gind, np.array(['1', 'a']))
    gind = fuat.get_invalid_indices(NUMERICAL_NP_ARRAY, np.array([1, 0]))
    assert np.array_equal(gind, np.array([]))
    assert np.array_equal(gind, np.empty((0, )))
    # Classic non-numerical array
    gind = fuat.get_invalid_indices(NOT_NUMERICAL_NP_ARRAY, np.array([0, 2]))
    assert np.array_equal(gind, np.array([2]))
    gind = fuat.get_invalid_indices(NOT_NUMERICAL_NP_ARRAY, np.array(['a', 1]))
    assert np.array_equal(gind, np.array(['1', 'a']))
    # Structured numerical array
    gind = fuat.get_invalid_indices(NUMERICAL_STRUCTURED_ARRAY,
                                    np.array([0, 'numbers']))
    assert np.array_equal(gind, np.array(['0']))
    gind = fuat.get_invalid_indices(NUMERICAL_STRUCTURED_ARRAY, np.array([0]))
    assert np.array_equal(gind, np.array([0]))
    gind = fuat.get_invalid_indices(NUMERICAL_STRUCTURED_ARRAY,
                                    np.array(['complex', 'numbers']))
    assert np.array_equal(gind, np.array([]))
    # Wide structured array
    gind = fuat.get_invalid_indices(WIDE_STRUCTURED_ARRAY,
                                    np.array(['complex', 'numbers']))
    assert np.array_equal(gind, np.array([]))