def _validate_input_dc(data_set: np.ndarray,
                       categorical_indices: Union[None, List[Index]],
                       neighbours: int,
                       distance_function: Union[None, DistanceFunction],
                       normalise_scores: bool) -> bool:
    """
    Checks the parameters passed to the ``DensityCheck`` initialiser.

    The checks are executed in the order of the parameters; the first
    violation found is reported by raising the corresponding exception.

    Parameters
    ----------
    data_set : numpy.ndarray
        A 2-dimensional numpy array (either classic or structured) of a base
        type.
    categorical_indices : Union[None, List[column index]]
        Either ``None`` or a list of column indices to be treated as
        categorical.
    neighbours : integer
        The number of closest neighbours to be considered.
    distance_function : Union[None, Callable[[data row, data row], number]]
        Either ``None`` or a Python callable that computes a distance between
        two data points. It takes two 1-dimensional numpy arrays (for classic
        numpy arrays) or numpy voids (for structured numpy arrays) of equal
        length and outputs a number representing the distance between them.
        **The distance function is assumed to return the same distance
        regardless of the order in which the input parameters are given.**
    normalise_scores : boolean
        Whether to normalise the scores (``True``) or not (``False``).

    Raises
    ------
    AttributeError
        The distance function does not require exactly 2 non-optional
        parameters.
    IncorrectShapeError
        The ``data_set`` array is not 2-dimensional.
    IndexError
        Some of the provided categorical column indices are invalid for the
        ``data_set`` array.
    TypeError
        The ``data_set`` array is not of a base type (strings and/or numbers).
        The ``neighbours`` parameter is not an integer. The
        ``distance_function`` is neither ``None`` nor a Python callable. The
        ``normalise_scores`` parameter is not a boolean. The
        ``categorical_indices`` parameter is not a Python list.
    ValueError
        The ``neighbours`` parameter is smaller than 1 or larger than the
        number of instances (rows) in the ``data_set`` array.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid; an exception is raised otherwise.
    """
    # pylint: disable=too-many-branches
    # -- data set ------------------------------------------------------------
    if not fuav.is_2d_array(data_set):
        raise IncorrectShapeError('The data set should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(data_set):
        raise TypeError('The data set is not of a base type (numbers and/or '
                        'strings.')

    # -- categorical indices -------------------------------------------------
    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            data_set, np.asarray(categorical_indices)).tolist()
        if bad_indices:
            raise IndexError('The following indices are invalid for the '
                             'input data set: {}.'.format(bad_indices))

    # -- neighbours ----------------------------------------------------------
    if not isinstance(neighbours, int):
        raise TypeError('The neighbours number parameter has to be an '
                        'integer.')
    if not 1 <= neighbours <= data_set.shape[0]:
        raise ValueError('The neighbours number parameter has to be '
                         'between 1 and number of data points (rows) in '
                         'the data set array.')

    # -- distance function ---------------------------------------------------
    if distance_function is not None:
        if not callable(distance_function):
            raise TypeError('The distance function should be a Python '
                            '(function).')
        # A parameter counts as required when it declares no default value.
        signature_params = inspect.signature(distance_function).parameters
        required_count = sum(
            1 for spec in signature_params.values()
            if spec.default is spec.empty)
        if required_count != 2:
            raise AttributeError('The distance function must require '
                                 'exactly 2 parameters. Given function '
                                 'requires {} '
                                 'parameters.'.format(required_count))

    # -- normalise scores ----------------------------------------------------
    if not isinstance(normalise_scores, bool):
        raise TypeError('The normalise scores parameter should be a boolean.')

    return True
def _validate_input(dataset: np.ndarray,
                    ground_truth: Optional[np.ndarray] = None,
                    categorical_indices: Optional[List[Index]] = None,
                    int_to_float: bool = True) -> bool:
    """
    Checks the input parameters shared by the augmentation classes.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be used for sampling.
    ground_truth : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with labels for the supplied dataset.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    int_to_float : boolean, optional (default=True)
        If ``True``, all of the integer dtype columns in the ``dataset`` will
        be generalised to ``numpy.float64`` type. Otherwise, integer type
        columns will remain integer and floating point type columns will
        remain floating point.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The
        ``ground_truth`` array is not a 1-dimensional numpy array. The number
        of ground truth annotations differs from the number of rows in the
        data array.
    IndexError
        Some of the column indices given in the ``categorical_indices``
        parameter are not valid for the input ``dataset``.
    TypeError
        The ``categorical_indices`` parameter is neither a list nor ``None``.
        The ``dataset`` or the ``ground_truth`` array (if not ``None``) are
        not of base (numerical and/or string) type. The ``int_to_float``
        parameter is not a boolean.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid; an exception is raised otherwise.
    """
    # The dataset itself.
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    # Optional labels: shape, type and then length agreement with the data.
    if ground_truth is not None:
        if not fuav.is_1d_array(ground_truth):
            raise IncorrectShapeError('The ground_truth array must be '
                                      '1-dimensional. (Or None if it is not '
                                      'required.)')
        if not fuav.is_base_array(ground_truth):
            raise TypeError('The ground_truth array must be of a base type.')
        if ground_truth.shape[0] != dataset.shape[0]:
            raise IncorrectShapeError('The number of labels in the '
                                      'ground_truth array is not equal to the '
                                      'number of data points in the dataset '
                                      'array.')

    # Optional categorical column indices.
    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(categorical_indices))
        if bad_indices.size:
            raise IndexError('The following indices are invalid for the '
                             'input dataset: {}.'.format(bad_indices))

    if not isinstance(int_to_float, bool):
        raise TypeError('The int_to_float parameter has to be a boolean.')

    return True
def systemic_bias(dataset: np.ndarray, ground_truth: np.ndarray,
                  protected_features: List[Index]) -> np.ndarray:
    """
    Checks for systemic bias in a dataset.

    This function checks whether there exist data points that share the same
    unprotected features but differ in protected features. For all of these
    instances their label (ground truth) will be checked and if it is
    different, a particular data points pair will be indicated to be biased.
    This dependency is represented as a boolean, square numpy array that shows
    whether systemic bias exists (``True``) for any pair of data points.

    When every feature is protected there are no unprotected features left to
    compare; in this case all of the rows are treated as sharing the same
    (placeholder) unprotected feature value, so only the ground truth decides
    whether a pair is flagged.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be evaluated for systemic bias.
    ground_truth : numpy.ndarray
        The labels corresponding to the dataset.
    protected_features : List[column index]
        A list of column indices in the dataset that hold protected
        attributes.

    Raises
    ------
    IncorrectShapeError
        The dataset is not a 2-dimensional numpy array, the ground truth is
        not a 1-dimensional numpy array or the number of rows in the dataset
        is not equal to the number of elements in the ground truth array.
    IndexError
        Some of the column indices given in the ``protected_features`` list
        are not valid for the input dataset.
    TypeError
        The ``protected_features`` parameter is not a list.
    ValueError
        There are duplicate values in the protected feature indices list.

    Returns
    -------
    systemic_bias_matrix : numpy.ndarray
        A square, diagonally symmetrical and boolean numpy array that
        indicates which pair of data point share the same unprotected features
        but differ in protected features and the ground truth annotation.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The dataset should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth should be a 1-dimensional '
                                  'numpy array.')
    if ground_truth.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of rows in the dataset and the '
                                  'ground truth should be equal.')

    if isinstance(protected_features, list):
        pfa = np.asarray(protected_features)
        if not fuat.are_indices_valid(dataset, pfa):
            iid = np.sort(fuat.get_invalid_indices(dataset, pfa)).tolist()
            raise IndexError('The following protected feature indices are not '
                             'valid for the dataset array: {}.'.format(iid))
        if len(set(protected_features)) != len(protected_features):
            raise ValueError('Some of the protected indices are duplicated.')
    else:
        raise TypeError('The protected_features parameter should be a list.')

    is_structured = fuav.is_structured_array(dataset)

    # Build an array holding only the unprotected features. If nothing is
    # left, substitute a constant placeholder column so that every row
    # compares equal to every other row.
    if is_structured:
        unprotected_features_array = recfn.drop_fields(dataset,
                                                       protected_features)
        # Dropping *all* of the fields yields ``None`` on numpy < 1.18 and an
        # array with zero fields on numpy >= 1.18 -- both mean "no
        # unprotected features" and need the placeholder.
        if (unprotected_features_array is None
                or not unprotected_features_array.dtype.names):
            unprotected_features_array = np.ones((dataset.shape[0], ),
                                                 dtype=[('ones', int)])
    else:
        unprotected_features_array = np.delete(
            dataset, protected_features, axis=1)
        if not unprotected_features_array.size:
            unprotected_features_array = np.ones((dataset.shape[0], 1))

    assert unprotected_features_array.shape[0] == dataset.shape[0], \
        'Must share rows number.'

    systemic_bias_columns = []
    for i in range(unprotected_features_array.shape[0]):
        # Boolean mask of the rows whose unprotected features match row i.
        if is_structured:
            equal_unprotected = (
                unprotected_features_array == unprotected_features_array[i])
        else:
            equal_unprotected = np.apply_along_axis(
                np.array_equal, 1, unprotected_features_array,
                unprotected_features_array[i, :])

        equal_unprotected_indices = np.where(equal_unprotected)

        # Check whether the ground truth is different for these rows; the
        # mask is reused to hold the result (rows that did not match stay
        # False).
        equal_unprotected[equal_unprotected_indices] = (
            ground_truth[i] != ground_truth[equal_unprotected_indices])

        systemic_bias_columns.append(equal_unprotected)

    systemic_bias_matrix = np.stack(systemic_bias_columns, axis=1)
    assert np.array_equal(systemic_bias_matrix, systemic_bias_matrix.T), \
        'The matrix has to be diagonally symmetric.'
    assert not np.diagonal(systemic_bias_matrix).any(), \
        'Same elements cannot be systemically biased.'

    return systemic_bias_matrix
def _validate_input_local_fidelity(
        dataset: np.ndarray, data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Union[int, None],
        explained_feature_indices: Union[List[IndexType], None],
        fidelity_radius_percentage: int, samples_number: int) -> bool:
    """
    Checks the input parameters of the ``local_fidelity_score`` function.

    The parameters validated here belong to the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function; the description of these parameters, errors and exceptions can
    be found therein. Note that, as part of the validation, the
    ``global_predictive_function`` is called once on the first row of the
    ``dataset`` to determine whether it is probabilistic.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid; an exception is raised otherwise.
    """
    # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
    # -- dataset -------------------------------------------------------------
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- numbers '
                        'and/or strings.')

    # -- data row ------------------------------------------------------------
    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data_row must either be a '
                                  '1-dimensional numpy array or a numpy '
                                  'void object for structured data rows.')
    if not fuav.are_similar_dtype_arrays(dataset, np.array([data_row])):
        raise TypeError('The dtype of the data_row is too different from '
                        'the dtype of the dataset array.')
    # A mismatch in the number of features of a structured data_row is
    # already caught by the dtype comparison above; classic numpy arrays
    # need an explicit length check.
    if (not fuav.is_structured_array(dataset)
            and dataset.shape[1] != data_row.shape[0]):
        raise IncorrectShapeError('The data_row must contain the same '
                                  'number of features as the dataset.')

    # -- global predictive function ------------------------------------------
    if not callable(global_predictive_function):
        raise TypeError('The global_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    if fuv.get_required_parameters_number(global_predictive_function) != 1:
        raise IncompatibleModelError(
            'The global predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # -- local predictive function -------------------------------------------
    if not callable(local_predictive_function):
        raise TypeError('The local_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    if fuv.get_required_parameters_number(local_predictive_function) != 1:
        raise IncompatibleModelError(
            'The local predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # -- metric function -----------------------------------------------------
    if not callable(metric_function):
        raise TypeError('The metric_function should be a Python callable, '
                        'e.g., a Python function.')
    if fuv.get_required_parameters_number(metric_function) != 2:
        raise TypeError('The metric_function must take exactly *two* '
                        'required parameters.')

    # -- explained class index -----------------------------------------------
    # Predict a single data point to learn the shape of the model's output.
    global_prediction = global_predictive_function(dataset[:1])
    assert not fuav.is_structured_array(global_prediction), 'Must be plain.'
    assert global_prediction.shape[0] == 1, 'Just 1 data point was predicted.'

    if fuav.is_2d_array(global_prediction):  # A probabilistic model.
        if explained_class_index is not None:
            if not isinstance(explained_class_index, int):
                raise TypeError('For probabilistic global models, i.e., '
                                'global predictive functions, the '
                                'explained_class_index parameter has to be an '
                                'integer or None.')
            classes_number = global_prediction.shape[1]
            if not 0 <= explained_class_index < classes_number:
                raise ValueError('The explained_class_index parameter is '
                                 'negative or larger than the number of '
                                 'classes output by the global '
                                 'probabilistic model.')
    elif fuav.is_1d_array(global_prediction):
        if explained_class_index is not None:
            warnings.warn(
                'The explained_class_index parameter is not None and will be '
                'ignored since the global model is not probabilistic.',
                UserWarning)
    else:
        assert False, ('Global predictor must output a 1- or 2-dimensional '
                       'numpy array.')  # pragma: nocover

    # -- explained feature indices -------------------------------------------
    if explained_feature_indices is not None:
        if not isinstance(explained_feature_indices, list):
            raise TypeError('The explained_feature_indices parameter must be '
                            'a Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(explained_feature_indices))
        if bad_indices.size:
            raise IndexError(
                'The following column indices are invalid for the input '
                'dataset: {}.'.format(bad_indices))

    # -- fidelity radius percentage ------------------------------------------
    if not isinstance(fidelity_radius_percentage, int):
        raise TypeError('The fidelity_radius_percentage must be an integer '
                        'between 1 and 100.')
    if not 0 < fidelity_radius_percentage <= 100:
        raise ValueError('The fidelity_radius_percentage must be an '
                         'integer between 1 and 100.')

    # -- samples number ------------------------------------------------------
    if not isinstance(samples_number, int):
        raise TypeError('The samples_number must be an integer.')
    if samples_number < 1:
        raise ValueError('The samples_number must be a positive integer.')

    return True
def _validate_input_discretiser(
        dataset: np.ndarray,
        categorical_indices: Optional[List[Index]] = None,
        feature_names: Optional[List[str]] = None) -> bool:
    """
    Checks the input parameters shared by the discretiser classes.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be discretised.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    feature_names : List[strings], optional (default=None)
        A list of feature names in order they appear in the ``dataset`` array.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array.
    IndexError
        Some of the column indices given in the ``categorical_indices`` list
        are invalid for the input ``dataset``.
    TypeError
        The ``dataset`` is not of a base (numerical and/or string) type. The
        ``categorical_indices`` is neither a Python list nor ``None``. The
        ``feature_names`` is neither a Python list nor ``None`` or one of its
        elements (if it is a list) is not a string.
    ValueError
        The length of the ``feature_names`` list is different than the number
        of columns (features) in the input ``dataset``.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid; an exception is raised otherwise.
    """
    # pylint: disable=too-many-branches
    # The dataset itself.
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    # Optional categorical column indices.
    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(categorical_indices))
        if bad_indices.size:
            raise IndexError('The following indices are invalid for the '
                             'input dataset: {}.'.format(
                                 bad_indices.tolist()))

    # Optional feature names: one string per dataset column.
    if feature_names is not None:
        if not isinstance(feature_names, list):
            raise TypeError('The feature_names parameter must be a Python '
                            'list or None.')
        # Structured arrays keep their columns in the dtype, classic arrays
        # in the second dimension.
        features_number = (
            len(dataset.dtype.names)
            if fuav.is_structured_array(dataset) else dataset.shape[1])
        if len(feature_names) != features_number:
            raise ValueError('The length of feature_names list must be '
                             'equal to the number of features (columns) '
                             'in the input dataset.')
        for candidate in feature_names:
            if isinstance(candidate, str):
                continue
            raise TypeError('All of the feature_names must be '
                            'strings. The *{}* feature name is not a '
                            'string.'.format(candidate))

    return True
def test_get_invalid_indices():
    """
    Tests :func:`fatf.utils.array.tools.get_invalid_indices` function.
    """
    type_error = 'Input arrays should be numpy array-like objects.'
    incorrect_shape_array = 'The input array should be 2-dimensional.'
    incorrect_shape_indices = 'The indices array should be 1-dimensional.'

    # Inputs that are not numpy arrays.
    # NOTE(review): the last case repeats the second one -- possibly a
    # different combination was intended; confirm with the author.
    type_error_cases = [
        (None, np.ones((4, ))),
        (None, np.ones((4, 4))),
        (np.ones((4, )), None),
        (None, np.ones((4, 4))),
    ]
    for array, indices in type_error_cases:
        with pytest.raises(TypeError) as exin:
            fuat.get_invalid_indices(array, indices)
        assert str(exin.value) == type_error

    # Incorrect shape of either the array or the indices.
    shape_error_cases = [
        (np.ones((5, )), np.ones((4, 4)), incorrect_shape_array),
        (np.ones((5, )), np.ones((4, )), incorrect_shape_array),
        (np.ones((5, 3)), np.ones((4, 4)), incorrect_shape_indices),
    ]
    for array, indices, message in shape_error_cases:
        with pytest.raises(IncorrectShapeError) as exin:
            fuat.get_invalid_indices(array, indices)
        assert str(exin.value) == message

    # Valid inputs: (array, indices, every value the result must equal).
    value_cases = [
        # Classic numerical array
        (NUMERICAL_NP_ARRAY, np.array([0, 2]), [np.array([2])]),
        (NUMERICAL_NP_ARRAY, np.array(['a', 1]), [np.array(['1', 'a'])]),
        (NUMERICAL_NP_ARRAY, np.array([1, 0]),
         [np.array([]), np.empty((0, ))]),
        # Classic non-numerical array
        (NOT_NUMERICAL_NP_ARRAY, np.array([0, 2]), [np.array([2])]),
        (NOT_NUMERICAL_NP_ARRAY, np.array(['a', 1]),
         [np.array(['1', 'a'])]),
        # Structured numerical array
        (NUMERICAL_STRUCTURED_ARRAY, np.array([0, 'numbers']),
         [np.array(['0'])]),
        (NUMERICAL_STRUCTURED_ARRAY, np.array([0]), [np.array([0])]),
        (NUMERICAL_STRUCTURED_ARRAY, np.array(['complex', 'numbers']),
         [np.array([])]),
        # Wide structured array
        (WIDE_STRUCTURED_ARRAY, np.array(['complex', 'numbers']),
         [np.array([])]),
    ]
    for array, indices, expected_values in value_cases:
        gind = fuat.get_invalid_indices(array, indices)
        for expected in expected_values:
            assert np.array_equal(gind, expected)