def hamming_array_distance(X: np.ndarray, Y: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Computes the Hamming distances between the rows of ``X`` and ``Y``.

    Both ``X`` and ``Y`` have to be 2-dimensional, purely textual numpy
    arrays with the same number of columns. Every row of ``X`` is passed,
    together with the whole ``Y`` array, to :func:`hamming_point_distance`.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array -- has to be 2-dimensional and textual.
    Y : numpy.ndarray
        A numpy array -- has to be 2-dimensional and textual.
    **kwargs : boolean
        Keyword arguments forwarded unchanged to
        :func:`hamming_point_distance` for every row of ``X``.

    Raises
    ------
    IncorrectShapeError
        Either ``X`` or ``Y`` is not 2-dimensional or ``X`` and ``Y`` do not
        have the same number of columns.
    ValueError
        Either of the input arrays is not purely textual.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Hamming distances; one row of results per row of ``X``,
        as produced by :func:`hamming_point_distance` against ``Y``.
    """
    # pylint: disable=invalid-name
    # Dimensionality is validated first, then the content type.
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')
    if not fuav.is_textual_array(X):
        raise ValueError('The X array should be textual.')
    if not fuav.is_textual_array(Y):
        raise ValueError('The Y array should be textual.')

    # Work on unstructured views so that the width can be read from shape.
    x_plain = fuat.as_unstructured(X)
    y_plain = fuat.as_unstructured(Y)

    widths_agree = x_plain.shape[1] == y_plain.shape[1]
    if not widths_agree:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    return np.apply_along_axis(hamming_point_distance, 1, x_plain, y_plain,
                               **kwargs)
def binary_array_distance(X: np.ndarray, Y: np.ndarray,
                          **kwargs: bool) -> np.ndarray:
    """
    Computes the binary distances between the rows of ``X`` and ``Y``.

    Both ``X`` and ``Y`` have to be 2-dimensional numpy arrays with the same
    number of columns. This function only validates the shapes; no dtype
    check is performed here. Every row of ``X`` is passed, together with the
    whole ``Y`` array, to ``binary_point_distance``.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array -- has to be 2-dimensional.
    Y : numpy.ndarray
        A numpy array -- has to be 2-dimensional.
    **kwargs : boolean
        Keyword arguments forwarded unchanged to ``binary_point_distance``
        for every row of ``X``.

    Raises
    ------
    IncorrectShapeError
        Either ``X`` or ``Y`` is not 2-dimensional or ``X`` and ``Y`` do not
        have the same number of columns.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of binary distances; one row of results per row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')

    # Work on unstructured views so that the width can be read from shape.
    x_plain = fuat.as_unstructured(X)
    y_plain = fuat.as_unstructured(Y)

    widths_agree = x_plain.shape[1] == y_plain.shape[1]
    if not widths_agree:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    return np.apply_along_axis(binary_point_distance, 1, x_plain, y_plain,
                               **kwargs)
def euclidean_array_distance(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    Computes the Euclidean distances between the rows of ``X`` and ``Y``.

    Both ``X`` and ``Y`` have to be 2-dimensional, purely numerical numpy
    arrays with the same number of columns. Every row of ``X`` is passed,
    together with the whole ``Y`` array, to
    :func:`euclidean_point_distance`.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array -- has to be 2-dimensional and purely numerical.
    Y : numpy.ndarray
        A numpy array -- has to be 2-dimensional and purely numerical.

    Raises
    ------
    IncorrectShapeError
        Either ``X`` or ``Y`` is not 2-dimensional or ``X`` and ``Y`` do not
        have the same number of columns.
    ValueError
        Either of the input arrays is not purely numerical.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Euclidean distances; one row of results per row of
        ``X``, as produced by :func:`euclidean_point_distance` against ``Y``.
    """
    # pylint: disable=invalid-name
    # Dimensionality is validated first, then the content type.
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')
    if not fuav.is_numerical_array(X):
        raise ValueError('The X array should be purely numerical.')
    if not fuav.is_numerical_array(Y):
        raise ValueError('The Y array should be purely numerical.')

    # Work on unstructured views so that the width can be read from shape.
    y_plain = fuat.as_unstructured(Y)
    x_plain = fuat.as_unstructured(X)

    widths_agree = y_plain.shape[1] == x_plain.shape[1]
    if not widths_agree:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    return np.apply_along_axis(euclidean_point_distance, 1, x_plain, y_plain)
def hamming_point_distance(y: Union[np.ndarray, np.void], X: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Computes the Hamming distance between ``y`` and every row of ``X``.

    ``y`` has to be a 1-dimensional textual numpy array or a row of a
    structured numpy array (i.e. numpy's void) and ``X`` has to be a
    2-dimensional textual numpy array. The length of ``y`` has to be the
    same as the width of ``X``.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        A numpy array (has to be 1-dimensional and textual) used to
        calculate the distances from.
    X : numpy.ndarray
        A numpy array (has to be 2-dimensional and textual) to which rows
        the distances are calculated.
    **kwargs : boolean
        Keyword arguments forwarded unchanged to ``hamming_distance`` for
        every row of ``X``.

    Raises
    ------
    IncorrectShapeError
        Either ``y`` is not 1-dimensional or ``X`` is not 2-dimensional or
        the length of ``y`` is not equal to the number of columns in ``X``.
    ValueError
        Either of the input arrays is not purely textual.

    Returns
    -------
    distances : numpy.ndarray
        An array of Hamming distances between ``y`` and every row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # The content type is checked on the unstructured representations.
    y_plain = fuat.as_unstructured(y)
    x_plain = fuat.as_unstructured(X)
    if not fuav.is_textual_array(y_plain):
        raise ValueError('The y array should be textual.')
    if not fuav.is_textual_array(x_plain):
        raise ValueError('The X array should be textual.')

    lengths_agree = y_plain.shape[0] == x_plain.shape[1]
    if not lengths_agree:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    return np.apply_along_axis(hamming_distance, 1, x_plain, y_plain,
                               **kwargs)
def _validate_data_header(X: np.ndarray, y: np.ndarray, n_samples: int,
                          n_features: int, y_names: np.ndarray) -> bool:
    """
    Checks that the data read in agree with the counts given in the header.

    For details on valid header formatting see the
    :func:`fatf.utils.datasets.load_data` documentation.

    Parameters
    ----------
    X : numpy.ndarray
        Array read in from ``numpy.genfromtxt``.
    y : numpy.ndarray
        Target variable indicating which class each sample in ``X`` belongs
        to.
    n_samples : integer
        Number of samples expected in ``X`` and ``y``.
    n_features : integer
        Number of features expected in ``X``.
    y_names : numpy.ndarray
        Unique class names of the target variable ``y``. The class count is
        only checked when this array is not empty.

    Raises
    ------
    ValueError
        The number of samples in ``X`` or ``y``, or the number of features
        in ``X``, is not consistent with the header. Also raised when
        ``y_names`` is not empty and its length differs from the number of
        unique values in ``y``.

    Returns
    -------
    is_consistent : boolean
        ``True`` if the header is consistent with the data; every
        inconsistency is reported by raising an exception.
    """
    # pylint: disable=invalid-name
    assert fuav.is_2d_array(X), 'X has to be a 2-dimensional array.'
    assert fuav.is_1d_array(y), 'y has to be a 1-dimensional array.'
    assert fuav.is_1d_array(y_names), 'y_names must be a 1-dimensional array.'

    samples_in_x = X.shape[0]
    if samples_in_x != n_samples:
        raise ValueError('The number of samples in the dataset is not '
                         'consistent with the header.')

    # The length of the first row works for structured arrays as well.
    features_in_x = len(X[0])
    if features_in_x != n_features:
        raise ValueError('The number of features in the dataset is not '
                         'consistent with the header.')

    if y.shape[0] != n_samples:
        raise ValueError('The number of labels (target variables) is not '
                         'consistent with the header.')

    classes_in_header = y_names.shape[0]
    if classes_in_header and classes_in_header != np.unique(y).shape[0]:
        raise ValueError('The number of classes is not consistent with '
                         'the header.')

    return True
def structured_to_unstructured(
        structured_array: np.ndarray,
        **kwargs: Optional[np.dtype]) -> np.ndarray:  # pragma: no cover
    """
    Calls either local or numpy's structured_to_unstructured function.

    numpy 1.16.0 has introduced
    :func:`numpy.lib.recfunctions.structured_to_unstructured` function. To
    ensure backwards compatibility up to numpy 1.9.0 this package implements
    its own version of this function
    (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`). This
    function calls the latter if numpy version below 1.16.0 is installed.
    However, if numpy 1.16.0 or above is detected, numpy's implementation is
    used instead. The choice is driven by the module-level
    ``_LOCAL_STRUCTURED_TO_UNSTRUCTURED`` flag.

    For the description of ``structured_to_unstructured`` functionality
    either refer to the corresponding numpy
    (:func:`numpy.lib.recfunctions.structured_to_unstructured`) or local
    (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`)
    documentation.

    .. warning::
       Since this function either calls a local implementation or a builtin
       numpy function there may be some inconsistencies in its behaviour. One
       that we are aware of is conversion of arrays that contain ``'V'`` --
       raw data (void), ``'O'`` -- (Python) objects, ``'M'`` -- datetime or
       ``'m'`` -- timedelta dtypes. These types are not supported by the
       local implementation, however some of them are supported by the numpy
       built-in, e.g. the ``'V'`` type.

    Parameters
    ----------
    structured_array : numpy.ndarray
        A structured numpy array to be converted into a plain numpy array.
    **kwargs : Optional[numpy.dtype]
        Named parameters that are passed to the appropriate structured to
        unstructured array converter. These parameters are ignored when
        calling the local implementation
        (:func:`fatf.utils.array.tools.fatf_structured_to_unstructured`).

    Returns
    -------
    classic_array : numpy.ndarray
        A classic numpy array representation of the ``structured_array`` with
        the most generic type out of the input array's dtypes.
    """
    # pylint: disable=no-member
    if _LOCAL_STRUCTURED_TO_UNSTRUCTURED:
        # The local implementation does not accept the keyword arguments.
        classic_array = fatf_structured_to_unstructured(structured_array)
    else:
        classic_array = recfn.structured_to_unstructured(
            structured_array, **kwargs)
        # When the input passes the 2-dimensional check but the converted
        # array comes back 1-dimensional, restore a single-column layout.
        # NOTE(review): this correction is assumed to belong to the numpy
        # branch only -- confirm against the intended nesting.
        if (fuav.is_2d_array(structured_array)
                and fuav.is_1d_array(classic_array)):
            classic_array = classic_array.reshape(
                (structured_array.shape[0], 1))
    return classic_array
def euclidean_point_distance(y: Union[np.ndarray, np.void],
                             X: np.ndarray) -> np.ndarray:
    """
    Computes the Euclidean distance between ``y`` and every row of ``X``.

    ``y`` has to be a 1-dimensional numerical numpy array or a row of a
    structured numpy array (i.e. numpy's void) and ``X`` has to be a
    2-dimensional numerical numpy array. The length of ``y`` has to be the
    same as the width of ``X``.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        A numpy array (has to be 1-dimensional and purely numerical) used to
        calculate distances from.
    X : numpy.ndarray
        A numpy array (has to be 2-dimensional and purely numerical) to
        which rows distances are calculated.

    Raises
    ------
    IncorrectShapeError
        Either ``y`` is not 1-dimensional or ``X`` is not 2-dimensional or
        the length of ``y`` is not equal to the number of columns in ``X``.
    ValueError
        Either of the input arrays is not purely numerical.

    Returns
    -------
    distances : numpy.ndarray
        An array of Euclidean distances between ``y`` and every row of
        ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # The content type is checked on the unstructured representations.
    y_plain = fuat.as_unstructured(y)
    x_plain = fuat.as_unstructured(X)
    if not fuav.is_numerical_array(y_plain):
        raise ValueError('The y array should be purely numerical.')
    if not fuav.is_numerical_array(x_plain):
        raise ValueError('The X array should be purely numerical.')

    lengths_agree = y_plain.shape[0] == x_plain.shape[1]
    if not lengths_agree:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    return np.apply_along_axis(euclidean_distance, 1, x_plain, y_plain)
def _validate_get_distance(
        data_array: np.ndarray,
        distance_function: Callable[[np.ndarray, np.ndarray], float]) -> bool:
    """
    Validates the ``data_array`` and ``distance_function`` parameters.

    Parameters
    ----------
    data_array : numpy.ndarray
        A 2-dimensional numpy array.
    distance_function : Callable[[numpy.ndarray, numpy.ndarray], number]
        A Python function that takes as an input two 1-dimensional numpy
        arrays of equal length and outputs a number representing a distance
        between them. Every parameter in its signature that has no default
        value is counted as required.

    Raises
    ------
    AttributeError
        The distance function does not require exactly two parameters.
    IncorrectShapeError
        The data array is not a 2-dimensional numpy array.
    TypeError
        The data array is not of a base type (numbers and/or strings). The
        distance function is not a Python callable (function).

    Returns
    -------
    is_valid : boolean
        ``True`` if the parameters are valid; every violation is reported by
        raising an exception.
    """
    if not fuav.is_2d_array(data_array):
        raise IncorrectShapeError('The data_array has to be a 2-dimensional '
                                  '(structured or unstructured) numpy array.')
    if not fuav.is_base_array(data_array):
        raise TypeError('The data_array has to be of a base type (strings '
                        'and/or numbers).')

    if not callable(distance_function):
        raise TypeError('The distance function should be a Python callable '
                        '(function).')

    # A parameter is "required" when its signature entry has no default.
    signature_params = inspect.signature(distance_function).parameters
    required_param_n = sum(
        1 for spec in signature_params.values() if spec.default is spec.empty)
    if required_param_n != 2:
        raise AttributeError('The distance function must require exactly '
                             '2 parameters. Given function requires {} '
                             'parameters.'.format(required_param_n))

    return True
def validate_binary_matrix(binary_array: np.ndarray,
                           name: Optional[str] = None) -> bool:
    """
    Validates a boolean, square and symmetric numpy array.

    In addition to symmetry, the main diagonal must not contain any ``True``
    value.

    Parameters
    ----------
    binary_array : numpy.ndarray
        A square (equal number of rows and columns), boolean symmetric numpy
        array.
    name : string, optional (default=None)
        A label for the matrix that is inserted into the error messages
        (surrounding whitespace is stripped). With ``None`` or a blank
        string the messages refer to just "the matrix".

    Raises
    ------
    IncorrectShapeError
        The matrix is not 2-dimensional or square.
    TypeError
        The matrix is not of boolean type.
    ValueError
        The matrix is a structured numpy array, is not diagonally symmetric
        or has a ``True`` value on its diagonal.

    Returns
    -------
    is_valid : boolean
        ``True`` if the matrix is valid; every violation is reported by
        raising an exception.
    """
    # Build the (possibly empty) prefix used in all of the error messages.
    if name is None:
        label = ''
    else:
        assert isinstance(name, str), 'The name parameter has to be string.'
        stripped = name.strip()
        label = '{} '.format(stripped) if stripped else stripped

    if not fuav.is_2d_array(binary_array):
        raise IncorrectShapeError('The {}matrix has to be '
                                  '2-dimensional.'.format(label))
    if fuav.is_structured_array(binary_array):
        raise ValueError('The {}matrix cannot be a structured numpy '
                         'array.'.format(label))
    if binary_array.dtype != bool:
        raise TypeError('The {}matrix has to be of boolean '
                        'type.'.format(label))

    rows_number, columns_number = binary_array.shape[0], binary_array.shape[1]
    if rows_number != columns_number:
        raise IncorrectShapeError('The {}matrix has to be '
                                  'square.'.format(label))

    # Valid means: equal to its transpose and nothing set on the diagonal.
    if not (np.array_equal(binary_array, binary_array.T)
            and not np.diagonal(binary_array).any()):
        raise ValueError('The {}matrix has to be diagonally '
                         'symmetric.'.format(label))

    return True
def _input_is_valid(dataset: np.ndarray,
                    model: object,
                    feature_index: Union[int, str],
                    treat_as_categorical: Optional[bool],
                    steps_number: Optional[int]) -> bool:  # yapf: disable
    """
    Validates input parameters of Individual Conditional Expectation function.

    For the input parameter description, warnings and exceptions please see
    the documentation of the
    :func:`fatf.transparency.model.feature_influence.
    individual_conditional_expectation` function.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid; every violation is reported by
        raising an exception.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a 2-dimensional '
                                  'array.')
    if not fuav.is_base_array(dataset):
        raise ValueError('The input dataset must only contain base types '
                         '(textual and numerical).')

    can_predict_proba = fumv.check_model_functionality(
        model, require_probabilities=True)
    if not can_predict_proba:
        raise IncompatibleModelError('This functionality requires the model '
                                     'to be capable of outputting '
                                     'probabilities via predict_proba method.')

    index_as_array = np.array([feature_index])
    if not fuat.are_indices_valid(dataset, index_as_array):
        raise IndexError('Provided feature index is not valid for the input '
                         'dataset.')

    # None is accepted as-is; anything else has to be an integer >= 2.
    if steps_number is not None:
        if not isinstance(steps_number, int):
            raise TypeError('steps_number parameter has to either be None or '
                            'an integer.')
        if steps_number < 2:
            raise ValueError('steps_number has to be at least 2.')

    if not (treat_as_categorical is None
            or isinstance(treat_as_categorical, bool)):
        raise TypeError('treat_as_categorical has to either be None or a '
                        'boolean.')

    return True
def _validate_input_drm(dataset: np.ndarray,
                        data_row: Union[np.ndarray, np.void]) -> bool:
    """
    Validates :func:`fatf.utils.data.transformation.dataset_row_masking` input.

    This function checks if ``dataset`` is a 2-dimensional array of a base
    type and if ``data_row`` is 1-dimensional-like with a dtype similar to
    the one of the ``dataset``. For unstructured data it additionally checks
    that ``data_row`` has as many elements as ``dataset`` has columns.

    For the description of input parameters, and warnings and exceptions
    raised by this function please see the documentation of the
    :func:`fatf.utils.data.transformation.dataset_row_masking` function.

    Returns
    -------
    is_valid : boolean
        ``True`` if input is valid; every violation is reported by raising
        an exception.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- text, '
                        'numbers or mixture of the two.')
    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data row must either be a '
                                  '1-dimensional numpy array or a numpy void '
                                  'object for structured rows.')

    # The row is wrapped in an array for the dtype comparison. For
    # structured arrays this comparison also covers the number of columns.
    wrapped_row = np.array([data_row])
    if not fuav.are_similar_dtype_arrays(
            dataset, wrapped_row, strict_comparison=False):
        raise TypeError('The dtype of the data row is too different from the '
                        'dtype of the dataset provided.')

    # With agreeing dtypes only the unstructured case needs a length check.
    if (not fuav.is_structured_array(dataset)
            and dataset.shape[1] != data_row.shape[0]):
        raise IncorrectShapeError('The data row must contain the same '
                                  'number of elements as the number of '
                                  'columns in the provided dataset.')

    return True
def _validate_input(dataset: np.ndarray, explain_instance: Callable,
                    sample_size: int, explanations_number: int) -> bool:
    """
    Validates input for submodular pick.

    For the input parameters description, warnings and exceptions please see
    the documentation of the
    :func:`fatf.transparency.models.submodular_pick` function.

    Both ``sample_size`` and ``explanations_number`` have to be non-negative
    integers; ``None`` is rejected with a ``TypeError`` by the integer check.
    The "explanations cannot outnumber samples" rule is only enforced when
    both values are non-zero.

    Raises
    ------
    IncorrectShapeError
        The ``dataset`` is not a 2-dimensional array.
    RuntimeError
        The ``explain_instance`` callable does not have exactly one required
        parameter.
    TypeError
        ``sample_size`` or ``explanations_number`` is not an integer, or
        ``explain_instance`` is not callable.
    ValueError
        The ``dataset`` is not of a base type, ``sample_size`` or
        ``explanations_number`` is negative, or ``explanations_number`` is
        larger than a non-zero ``sample_size``.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid; every violation is reported by
        raising an exception.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError(
            'The input data set must be a 2-dimensional array.')
    if not fuav.is_base_array(dataset):
        raise ValueError('The input data set must only contain base types '
                         '(strings and numbers).')

    if not isinstance(sample_size, int):
        raise TypeError('sample_size must be an integer.')
    if sample_size < 0:
        raise ValueError('sample_size must be a non-negative integer.')

    if not isinstance(explanations_number, int):
        raise TypeError('explanations_number must be an integer.')
    # No None guard is needed here: the isinstance check above has already
    # rejected everything that is not an integer.
    if explanations_number < 0:
        raise ValueError('explanations_number must be a non-negative integer.')

    # Zero for either value switches this comparison off.
    if (sample_size and explanations_number
            and sample_size < explanations_number):
        raise ValueError('The number of explanations cannot be larger than '
                         'the number of samples.')

    if not callable(explain_instance):
        raise TypeError('The explain_instance should be a Python callable '
                        '(function or method).')
    if fuv.get_required_parameters_number(explain_instance) != 1:
        raise RuntimeError('The explain_instance callable must accept '
                           'exactly one required parameter.')

    return True
def get_invalid_indices(array: np.ndarray,
                        indices: np.ndarray) -> np.ndarray:
    """
    Returns a numpy array with column indices that the input array is missing.

    For a structured ``array`` the valid indices are its field names; for an
    unstructured one they are the integers from 0 up to (excluding) the
    number of columns.

    Parameters
    ----------
    array : numpy.ndarray
        A 2-dimensional array to be checked.
    indices : numpy.ndarray
        A 1-dimensional array of indices corresponding to columns in the
        input array.

    Raises
    ------
    TypeError
        Either of the input arrays is not a numpy array-like object.
    IncorrectShapeError
        The input array is not 2-dimensional or the indices array is not
        1-dimensional.

    Returns
    -------
    invalid_indices : numpy.ndarray
        A **sorted** array of the unique indices that were not found in the
        input array.
    """
    if (not isinstance(array, np.ndarray)
            or not isinstance(indices, np.ndarray)):
        raise TypeError('Input arrays should be numpy array-like objects.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_1d_array(indices):
        raise IncorrectShapeError('The indices array should be 1-dimensional.')

    if fuav.is_structured_array(array):
        known_indices = set(array.dtype.names)
    else:
        known_indices = set(range(array.shape[1]))

    # A set difference on plain Python values removes duplicates as well.
    missing = set(indices.tolist()).difference(known_indices)
    return np.sort(list(missing))
def _randomise_patch(self, mask: np.ndarray) -> np.ndarray:
    """
    Generates a random colour for each segment selected to be occluded by
    the ``mask``.

    A copy of ``self.image`` is painted one segment at a time -- every
    segment id found under the ``mask`` gets a single random colour applied
    to all of its pixels -- and only the pixels selected by the ``mask`` are
    returned. The colour is a triple of values from 0--255 if
    ``self.is_rgb`` is set, either 0 or 255 if ``self.is_bnw`` is set, and
    a single value from 0--255 otherwise.

    Parameters
    ----------
    mask : numpy.ndarray
        A boolean numpy array of the same shape as ``segments``, indicating
        the pixels (``True``) for which a random colour patch should be
        generated.

    Returns
    -------
    randomise_patch : numpy.ndarray
        A numpy array of (number of pixels to be occluded X number of colour
        channels) dimensions holding random colour patches for the segments
        selected to be occluded.
    """
    assert fuav.is_2d_array(mask), 'Mask must 2-D numpy array.'
    assert mask.shape == self.segments.shape, 'Mask must be segments-like.'
    assert mask.dtype.kind == 'b', 'Mask must be binary.'

    painted = self.image.copy()
    for segment_id in np.unique(self.segments[mask]):
        if self.is_rgb:
            colour = (
                random.randint(0, 255),
                random.randint(0, 255),
                random.randint(0, 255)
            )  # type: SegmentColour # yapf: disable
        elif self.is_bnw:
            colour = random.choice([0, 255])
        else:
            colour = random.randint(0, 255)
        # The whole segment is coloured, not only its masked pixels.
        painted[self.segments == segment_id] = colour

    return painted[mask]
def are_indices_valid(array: np.ndarray, indices: np.ndarray) -> bool:
    """
    Checks whether all the input ``indices`` are valid for the input ``array``.

    The indices are valid when :func:`get_invalid_indices` reports none of
    them as missing from the ``array``.

    Parameters
    ----------
    array : numpy.ndarray
        The 2-dimensional array to be checked.
    indices : numpy.ndarray
        1-dimensional array of column indices.

    Raises
    ------
    TypeError
        Either of the input arrays is not a numpy array-like object.
    IncorrectShapeError
        The input array is not 2-dimensional or the indices array is not
        1-dimensional.

    Returns
    -------
    is_valid : boolean
        A Boolean variable that indicates whether the input column indices
        are valid indices for the input array.
    """
    if not (isinstance(array, np.ndarray) and isinstance(indices, np.ndarray)):
        raise TypeError('Input arrays should be numpy array-like objects.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_1d_array(indices):
        raise IncorrectShapeError('The indices array should be 1-dimensional.')

    invalid_indices = get_invalid_indices(array, indices)
    assert fuav.is_1d_array(invalid_indices), 'This should be a 1-d array.'

    # Valid exactly when nothing was reported as missing.
    is_valid = not bool(invalid_indices.shape[0])
    return is_valid
def _validate_input(dataset: np.ndarray,
                    ground_truth: Optional[np.ndarray] = None,
                    categorical_indices: Optional[List[Index]] = None,
                    int_to_float: bool = True) -> bool:
    """
    Validates the input parameters of an arbitrary augmentation class.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be used for sampling.
    ground_truth : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with labels for the supplied dataset.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    int_to_float : boolean, optional (default=True)
        If ``True``, all of the integer dtype columns in the ``dataset`` will
        be generalised to ``numpy.float64`` type. Otherwise, integer type
        columns will remain integer and floating point type columns will
        remain floating point. Only its type is checked here.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The
        ``ground_truth`` array is not a 1-dimensional numpy array. The number
        of ground truth annotations is different than the number of rows in
        the data array.
    IndexError
        Some of the column indices given in the ``categorical_indices``
        parameter are not valid for the input ``dataset``.
    TypeError
        The ``categorical_indices`` parameter is neither a list nor ``None``.
        The ``dataset`` or the ``ground_truth`` array (if not ``None``) are
        not of base (numerical and/or string) type. The ``int_to_float``
        parameter is not a boolean.

    Returns
    -------
    is_valid : boolean
        ``True`` if input is valid; every violation is reported by raising
        an exception.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    # The labels are optional; they are only inspected when supplied.
    if ground_truth is not None:
        if not fuav.is_1d_array(ground_truth):
            raise IncorrectShapeError('The ground_truth array must be '
                                      '1-dimensional. (Or None if it is not '
                                      'required.)')
        if not fuav.is_base_array(ground_truth):
            raise TypeError('The ground_truth array must be of a base type.')
        labels_number = ground_truth.shape[0]
        if labels_number != dataset.shape[0]:
            raise IncorrectShapeError('The number of labels in the '
                                      'ground_truth array is not equal to the '
                                      'number of data points in the dataset '
                                      'array.')

    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        invalid_indices = fuat.get_invalid_indices(
            dataset, np.asarray(categorical_indices))
        if invalid_indices.size:
            raise IndexError('The following indices are invalid for the '
                             'input dataset: {}.'.format(invalid_indices))

    if not isinstance(int_to_float, bool):
        raise TypeError('The int_to_float parameter has to be a boolean.')

    return True
def systemic_bias(dataset: np.ndarray, ground_truth: np.ndarray,
                  protected_features: List[Index]) -> np.ndarray:
    """
    Checks for systemic bias in a dataset.

    Two data points are flagged as a biased pair when they agree on all of
    the unprotected features (every column not listed in
    ``protected_features``) but have a different label (ground truth). The
    result is a boolean, square numpy array with ``True`` marking such
    pairs. If every column is protected, all of the rows are treated as
    sharing the same unprotected features.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be evaluated for systemic bias.
    ground_truth : numpy.ndarray
        The labels corresponding to the dataset.
    protected_features : List[column index]
        A list of column indices in the dataset that hold protected
        attributes.

    Raises
    ------
    IncorrectShapeError
        The dataset is not a 2-dimensional numpy array, the ground truth is
        not a 1-dimensional numpy array or the number of rows in the dataset
        is not equal to the number of elements in the ground truth array.
    IndexError
        Some of the column indices given in the ``protected_features`` list
        are not valid for the input dataset.
    TypeError
        The ``protected_features`` parameter is not a list.
    ValueError
        There are duplicate values in the protected feature indices list.

    Returns
    -------
    systemic_bias_matrix : numpy.ndarray
        A square, diagonally symmetrical and boolean numpy array that
        indicates which pairs of data points share the same unprotected
        features but differ in the ground truth annotation.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The dataset should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth should be a 1-dimensional '
                                  'numpy array.')
    rows_number = dataset.shape[0]
    if ground_truth.shape[0] != rows_number:
        raise IncorrectShapeError('The number of rows in the dataset and the '
                                  'ground truth should be equal.')

    if not isinstance(protected_features, list):
        raise TypeError('The protected_features parameter should be a list.')
    protected_array = np.asarray(protected_features)
    if not fuat.are_indices_valid(dataset, protected_array):
        invalid = np.sort(
            fuat.get_invalid_indices(dataset, protected_array)).tolist()
        raise IndexError('The following protected feature indices are not '
                         'valid for the dataset array: {}.'.format(invalid))
    if len(set(protected_features)) != len(protected_features):
        raise ValueError('Some of the protected indices are duplicated.')

    # Strip the protected columns; when nothing is left substitute a
    # constant column so that all of the rows compare as equal.
    is_structured = fuav.is_structured_array(dataset)
    if is_structured:
        unprotected = recfn.drop_fields(dataset, protected_features)
        if unprotected is None:
            unprotected = np.ones((rows_number, ), dtype=[('ones', int)])
    else:
        unprotected = np.delete(dataset, protected_features, axis=1)
        if not unprotected.size:
            unprotected = np.ones((rows_number, 1))
    assert unprotected.shape[0] == rows_number, 'Must share rows number.'

    bias_columns = []
    for row_id in range(unprotected.shape[0]):
        # Mark the rows whose unprotected features match the current row.
        if is_structured:
            same_rows = (unprotected == unprotected[row_id])
        else:
            same_rows = np.apply_along_axis(np.array_equal, 1, unprotected,
                                            unprotected[row_id, :])
        matching = np.where(same_rows)
        # Among the matching rows keep only those with a different label.
        same_rows[matching] = (ground_truth[row_id] != ground_truth[matching])
        bias_columns.append(same_rows)

    systemic_bias_matrix = np.stack(bias_columns, axis=1)
    assert np.array_equal(systemic_bias_matrix, systemic_bias_matrix.T), \
        'The matrix has to be diagonally symmetric.'
    assert not np.diagonal(systemic_bias_matrix).any(), \
        'Same elements cannot be systemically biased.'

    return systemic_bias_matrix
def batch_data(data: np.ndarray,
               batch_size: int = 50,
               transformation_fn: Callable = None) -> np.ndarray:
    """
    Cuts ``data`` into consecutive row batches and yields them in order.

    .. versionadded:: 0.1.1

    Data that are too large to be processed in one go can be consumed piece
    by piece with this function. All of the input validation is done
    eagerly, i.e., when this function is called; the batches themselves are
    produced lazily by the returned generator. Every batch can optionally be
    passed through ``transformation_fn`` before it is yielded.

    Parameters
    ----------
    data : numpy.ndarray
        A 2-dimensional numpy array (either classic or structured) to be
        sliced into batches.
    batch_size : integer, optional (default=50)
        The number of rows in each batch. The last batch is shorter if the
        number of rows is not a multiple of ``batch_size``.
    transformation_fn : callable, optional (default=None)
        A callable applied to each batch before it is yielded. It must have
        exactly one required parameter. If ``None``, batches are yielded
        unchanged.

    Raises
    ------
    IncorrectShapeError
        The ``data`` array is not 2-dimensional.
    RuntimeError
        The transformation function does not have exactly one required
        parameter.
    TypeError
        The ``batch_size`` is not an integer or the ``transformation_fn`` is
        not a callable object.
    ValueError
        The ``batch_size`` is smaller than 1.

    Yields
    ------
    slice : numpy.ndarray
        A (possibly transformed) slice of data.
    """
    if not fuav.is_2d_array(data):
        raise IncorrectShapeError('The data array must be 2-dimensional.')

    # Structured arrays are indexed by rows only, classic ones by rows and
    # columns.
    is_structured = fuav.is_structured_array(data)

    if not isinstance(batch_size, int):
        raise TypeError('The batch size must be an integer.')
    if batch_size < 1:
        raise ValueError('The batch size must be larger than 0.')

    if transformation_fn is not None:
        if not callable(transformation_fn):
            raise TypeError(
                'The transformation function must be a callable object.')
        required_params = fuv.get_required_parameters_number(transformation_fn)
        if required_params != 1:
            raise RuntimeError(
                'The transformation function must have only one required '
                'parameter; now it has {}.'.format(required_params))

    n_rows = data.shape[0]

    def _take_rows(first, last):
        """Returns rows ``first`` (inclusive) to ``last`` (exclusive)."""
        if is_structured:
            return data[first:last]
        return data[first:last, :]

    def _batch_generator():
        """Lazily yields the consecutive (transformed) batches."""
        for first in range(0, n_rows, batch_size):
            last = min(first + batch_size, n_rows)
            batch = _take_rows(first, last)
            if transformation_fn is not None:
                batch = transformation_fn(batch)
            yield batch

    return _batch_generator()
def local_fidelity_score(
        dataset: np.ndarray,
        data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Optional[int] = None,
        explained_feature_indices: Optional[List[IndexType]] = None,
        fidelity_radius_percentage: int = 5,
        samples_number: int = 50) -> float:
    """
    Computes local fidelity between a global and a local (surrogate) model.

    .. versionadded:: 0.0.2

    For a selected data point (``data_row``), it samples uniformly around it
    within a hypersphere, which radius corresponds to a percentage -- defined
    with ``fidelity_radius_percentage`` parameter -- of the maximum l-2
    distance between the specified data point and all the instances in the
    ``dataset``. (This sampling is based on
    :class:`fatf.utils.data.augmentation.LocalSphere` data augmenter.)

    .. warning:: A ``dataset`` with categorical features.

       This surrogate evaluation metric should **not** be used when the
       ``dataset`` contains *categorical features* (even when they are
       encoded, e.g., one-hot encoding) since the l-2 distance computed on
       mixed true numerical and (numerically-encoded) categorical features
       causes the local sample (computed with the
       :class:`fatf.utils.data.augmentation.LocalSphere` data augmenter) to
       be ill-defined. Feature scaling could possibly be used to overcome
       this issue, however we leave such consideration up to the user.

    The global and local predictive functions can be either: a probabilistic
    predictor, a (multi-class) classifier or a regressor.

    +-------+---------------------------------------------------------------+
    |       | Global Model                                                  |
    +-------+--------+-------------------------+--------------------+-------+
    | Local |        | |prob|                  | |clf|              | |reg| |
    | Model +--------+-------------------------+--------------------+-------+
    |       | |prob| | OK, e.g., KL-divergence | OK, e.g., log-loss | |imp| |
    |       +--------+-------------------------+--------------------+-------+
    |       | |clf|  | OK (via thresholding)   | OK                 | |imp| |
    |       +--------+-------------------------+--------------------+-------+
    |       | |reg|  | OK for a single class   | |imp|              | OK    |
    +-------+--------+-------------------------+--------------------+-------+

    .. |prob| replace:: **probabilistic**
    .. |clf| replace:: **classifier**
    .. |reg| replace:: **regressor**
    .. |imp| replace:: Not possible

    If the ``global_predictive_function`` outputs **probabilities**, the
    following should be considered for different types of a local model:

    * The local model is **probabilistic** as well:

      + a native probabilistic evaluation metric, such as the
        `Kullback–Leibler divergence`_, can be used; or
      + a thresholding can be applied or a top prediction can be chosen for
        both the local and the global probabilistic prediction and a classic
        classification performance metric can be used.

    * The local model is a **classifier** -- the probabilistic output of the
      global model has to be thresholded or the top prediction needs to be
      selected and a classic classification performance metric can be used.

    * The local model is a **regressor** -- this is only possible if the
      regressor is fitted for the probabilistic output of one of the classes.
      In this case any of the standard regression evaluation measures can be
      used.

    If the ``global_predictive_function`` is a **classifier**, the following
    should be considered for different types of a local model:

    * The local model is **probabilistic**:

      + a native performance metric, like log-loss_, can be used; or
      + the probabilistic output of the local predictor can be thresholded or
        the top label selected and compare using standard classification
        performance metrics.

    * The local model is a **classifier** as well -- any standard
      (multi-class) classification performance metric can be used.

    * Having a local **regressor** is not possible in this case.

    Finally, if the ``global_predictive_function`` is a **regressor**, the
    local model can **only** be a regressor as well, in which case any
    standard regression evaluation metric can be used.

    If the problem being modelled is multi-class (for probabilistic models
    and classifiers), the local model can either be fitted to the original
    multi-class problem or as one-vs-the-rest for a selected class. In the
    latter case, when the global model is probabilistic, the
    ``explained_class_index`` parameter may be used to specify the class
    (column index) that the ``data_row`` belongs to (according to the global
    model) -- in this case only the selected column with probabilities will
    be passed to the local fidelity score (``metric_function``) function.

    .. note:: Why to train the local model as one-vs-the-rest?

       When the local model is trained in the same output domain as the
       global model, the explanations extracted from this local model apply
       to all of the possible classes, what for some types of local models
       renders them uninformative. For example, consider training a decision
       tree locally and using the feature importance it provides. In this
       case we know which features are important in this local space but we
       cannot attribute these importances to any of the possible classes.
       However, a different type of explanation extracted from the same tree,
       for example, the logical conditions extracted from a root-to-leaf path
       that the selected ``data_row`` falls into, can be perfectly
       reasonable.

       If, on the other hand, the local model is trained as one-vs-the rest,
       where the "one" class is often set to be the class of the selected
       ``data_row``, any type of the explanation can be attributed to the
       selected class. In this case feature importances extracted from the
       local model can be attributed to the selected class in the specified
       neighbourhood. This mode of training the local model is required when
       the global model is probabilistic and the local one is a regressor,
       and optional for all the other combinations of the two. The
       consequence of training the local model as one-vs-the-rest is the need
       to train a separate local model for every class desired to be
       explained.

       For some local models and explanation types this is a requirement. For
       example, when the local model is a linear regression (trained on
       probabilities of a selected class) the only possible explanation is
       feature importance, which is meaningless in other cases.

       In general, when evaluating the quality of a local surrogate, the most
       truthful measure would be the one achieved when the local model is
       trained on the same set of target classes. A good quality of a local
       one-vs-the-rest model with respect to the global model should be
       treated with caution as it only indicates that the local model excels
       at this task and may not be a good approximation of the global
       decisive process at all. Comparing quality of two local models where
       one is multi-class and the other one-vs-the-rest is relatively complex
       and should be done with caution (the former local model has a more
       difficult task to solve).

    Examples of how to define the ``metric_function`` can be found in the
    *Examples* section down below.

    This local fidelity evaluation is inspired by the local fidelity method
    introduced in [LAUGEL2018SPHERES]_.

    .. _`Kullback–Leibler divergence`: https://en.wikipedia.org/wiki/
       Kullback–Leibler_divergence
    .. _log-loss: https://scikit-learn.org/stable/modules/
       model_evaluation.html#log-loss

    .. [LAUGEL2018SPHERES] Laugel, T., Renard, X., Lesot, M. J., Marsala,
       C., & Detyniecki, M. (2018). Defining locality for surrogates in
       post-hoc interpretablity. Workshop on Human Interpretability for
       Machine Learning (WHI) -- International Conference on Machine
       Learning, 2018.

    Examples
    --------
    The metric function should be adjusted to the type of the global and
    local predictors (and the use of the ``explained_class_index``
    parameter).

    >>> import numpy as np
    >>> data = np.array([[0, 1], [1, 1], [1, 0]])
    >>> targets = np.array(['a', 'b', 'c'])

    Let us assume that the global model is probabilistic, the local model is
    a regressor and we are explaining class ``'b'`` with index ``1``. (The
    index of the class is based on the lexicographical ordering of all the
    unique target values.)

    >>> explained_class_index = 1

    >>> import fatf.utils.models.models as fatf_models
    >>> global_model = fatf_models.KNN(k=1)
    >>> global_model.fit(data, targets)

    >>> probabilities = global_model.predict_proba(data)
    >>> selected_class_probabilities = probabilities[:, explained_class_index]
    >>> local_model = fatf_models.KNN(k=1, mode='regressor')
    >>> local_model.fit(data, selected_class_probabilities)

    One way to evaluate the performance of our local (surrogate) model in
    this scenario is the *Mean Squared Error*:

    >>> def mse(global_predictions, local_predictions):
    ...     mse = np.square(global_predictions - local_predictions)
    ...     mse = mse.mean()
    ...     return mse

    >>> import fatf.utils.transparency.surrogate_evaluation as surrogate_eval
    >>> mse_fidelity_score = surrogate_eval.local_fidelity_score(
    ...     data, data[0], global_model.predict_proba, local_model.predict,
    ...     mse, explained_class_index=explained_class_index)
    >>> mse_fidelity_score
    0.0

    Alternatively, if ``scikit-learn`` is available, an ROC can be computed,
    in which case the probabilities of the selected class need to be
    thresholded:

    >>> import sklearn.metrics
    >>> def roc(global_predictions, local_predictions):
    ...     global_predictions[global_predictions >= .5] = 1
    ...     global_predictions[global_predictions < .5] = 0
    ...     global_predictions = global_predictions.astype(int)
    ...
    ...     roc = sklearn.metrics.roc_auc_score(global_predictions,
    ...                                         local_predictions)
    ...     return roc

    >>> roc_fidelity_score = surrogate_eval.local_fidelity_score(
    ...     data, data[1], global_model.predict_proba, local_model.predict,
    ...     roc, explained_class_index=explained_class_index)
    >>> roc_fidelity_score
    1.0

    If both models are classifiers trained with the same set of target
    classes,

    >>> local_classifier = fatf_models.KNN(k=1)
    >>> local_classifier.fit(data, targets)

    a simple *accuracy* (implemented in FAT Forensics) can be used:

    >>> import fatf.utils.metrics.metrics as fatf_metrics
    >>> import fatf.utils.metrics.tools as fatf_metrics_tools
    >>> def accuracy(global_predictions, local_predictions):
    ...     confusion_matrix = fatf_metrics_tools.get_confusion_matrix(
    ...         local_predictions, global_predictions, labels=['a', 'b', 'c'])
    ...     accuracy = fatf_metrics.accuracy(confusion_matrix)
    ...     return accuracy

    >>> accuracy_fidelity_score = surrogate_eval.local_fidelity_score(
    ...     data, data[2], global_model.predict, local_classifier.predict,
    ...     accuracy)
    >>> accuracy_fidelity_score
    1.0

    (Note ``global_model.predict`` instead of
    ``global_model.predict_proba``.)

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset used to initialise the
        data sampler.
    data_row : Union[numpy.ndarray, numpy.void]
        A data point around which local fidelity is evaluated.
    global_predictive_function : Callable[[np.ndarray], np.ndarray]
        A Python callable (e.g., a function) that is responsible for
        predicting data points in the global model. This function can either
        be *probabilistic*, i.e., return a 2-dimensional numpy array with
        probabilities for every possible target class; a *regressor*
        (returning a 1-dimensional regression values array) or a
        *classifier* (returning a 1-dimensional class prediction array).
        Regardless of the type it **must** allow only **one required
        parameter** -- a 2-dimensional data array to be predicted.
    local_predictive_function : Callable[[np.ndarray], np.ndarray]
        A Python callable (e.g., a function) that is responsible for
        predicting data points in the local (surrogate) model. For more
        details about the allowed function types please see the description
        of the ``global_predictive_function`` parameter.
    metric_function : Callable[[numpy.ndarray, numpy.ndarray], float]
        A Python callable (e.g., a function) that computes a (performance)
        metric between the predictions of the global model
        (``global_predictive_function``) and the predictions of the local
        (surrogate) model (``local_predictive_function``). The passed
        callable object **must** take exactly **two required parameters**:
        the first one being predictions of the global model and the latter
        predictions of the local model, and return a number (float)
        representing performance comparison of the two. This callable object
        has to be adjusted to the types of global and local predictive
        functions.
    explained_class_index : integer, optional (default=None)
        If the global model (``global_predictive_function``) is
        probabilistic, this parameter allows to select a single column of
        probabilities for a selected class to be passed to the
        ``metric_function``. This parameter is useful when the local
        (surrogate) model is a regressor predicting probabilities of this
        chosen class (the class being explained). It is ignored (with a
        warning) when the global model is not probabilistic.
    explained_feature_indices : List[IndexType], optional (default=None)
        If the local (surrogate) model was trained on a subset of the
        features, this parameter allows to indicate which features should be
        used when predicting the generated data with the local model. If
        ``None``, all of the features will be used.
    fidelity_radius_percentage : integer, optional (default=5)
        The locality of the fidelity measure is enforced by limiting the
        distance from the selected ``data_row`` to generated data, which is
        used for fidelity metric evaluation. This radius (of a hyper-sphere
        around the selected ``data_row``) is defined as a percentage of the
        largest l-2 distance between any two data points in the input
        ``dataset`` within which the evaluation data will be sampled.
    samples_number : integer, optional (default=50)
        The number of samples to be generated when computing the local
        fidelity score.

    Warns
    -----
    UserWarning
        If the user specifies the ``explained_class_index`` parameter for a
        global model that is not probabilistic, this parameter is ignored,
        about which the user is warned.

    Raises
    ------
    IncompatibleModelError
        The ``global_predictive_function`` or the
        ``local_predictive_function`` does not require **exactly one**
        parameter.
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The input
        ``data_row`` is not 1-dimensional: either a 1-dimensional numpy array
        or a numpy void object for structured rows. The number of columns
        (features) in the ``data_row`` is different to the number of columns
        in the input ``dataset``.
    IndexError
        Some of the ``explained_feature_indices`` are invalid column indices
        for the input ``dataset``.
    TypeError
        The input ``dataset`` is not of a base type. The dtype of the
        ``data_row`` is too different from the dtype of the ``dataset``. The
        ``global_predictive_function`` or the ``local_predictive_function``
        is not a Python callable. The ``metric_function`` is not a Python
        callable or it does not require **exactly** two parameters. The
        ``explained_class_index`` is neither ``None`` nor an integer. The
        ``explained_feature_indices`` is neither ``None`` nor a Python list.
        The ``fidelity_radius_percentage`` is not an integer. The
        ``samples_number`` is not an integer.
    ValueError
        The ``explained_class_index`` is a negative integer or out of bounds
        for the number of classes output by the global probabilistic model
        (``global_predictive_function``). The ``fidelity_radius_percentage``
        is smaller than 1 or larger than 100. The ``samples_number`` is
        smaller than 1.

    Returns
    -------
    fidelity_score : float
        A metric of "closeness" between the global and the local predictive
        function predictions calculated using the ``metric_function`` on the
        sampled data.
    """
    # pylint: disable=too-many-arguments
    # The validator raises all of the documented exceptions, therefore it has
    # to be called outside of the ``assert`` statement -- otherwise it would
    # be skipped altogether when Python runs with the ``-O`` flag.
    is_input_ok = _validate_input_local_fidelity(
        dataset, data_row, global_predictive_function,
        local_predictive_function, metric_function, explained_class_index,
        explained_feature_indices, fidelity_radius_percentage, samples_number)
    assert is_input_ok, 'Input is invalid.'

    augmentor = fuda.LocalSphere(dataset, int_to_float=False)
    sampled_data = augmentor.sample(data_row, fidelity_radius_percentage,
                                    samples_number)

    global_predictions = global_predictive_function(sampled_data)
    assert not fuav.is_structured_array(global_predictions), 'Is structured.'

    # A single column can only be selected from a probabilistic (2-D)
    # output. For classifiers and regressors (1-D output) the class index is
    # ignored, which is what the validation step warns the user about.
    if (explained_class_index is not None
            and fuav.is_2d_array(global_predictions)):
        global_predictions = global_predictions[:, explained_class_index]

    # Restrict the sampled data to the features used by the local model.
    if explained_feature_indices is None:
        local_data = sampled_data
    elif fuav.is_structured_array(sampled_data):
        local_data = sampled_data[explained_feature_indices]
    else:
        local_data = sampled_data[:, explained_feature_indices]
    local_predictions = local_predictive_function(local_data)

    fidelity_score = metric_function(global_predictions, local_predictions)

    return fidelity_score
def counterfactual_fairness_check(
        unfair_counterfactuals: Optional[np.ndarray] = None,
        distances: Optional[np.ndarray] = None,
        threshold: Optional[float] = None) -> bool:
    """
    Checks for counterfactual fairness using counterfactual fairness arrays.

    There are two different approaches to evaluate counterfactual fairness.
    The first one is to take the ``distances`` to the counterfactual examples
    and see whether there are any that exceed a certain ``threshold`` in
    which case a given instance is considered to be treated unfairly.
    Alternatively by using the ``unfair_counterfactuals`` array this function
    checks whether there are any unfair counterfactual instances. In case all
    the input parameters are given **the distance-based approach takes the
    precedence**.

    Parameters
    ----------
    unfair_counterfactuals : numpy.ndarray, optional (default=None)
        A 2-dimensional numpy array with counterfactual examples that expose
        unfairness of a prediction.
    distances : numpy.ndarray, optional (default=None)
        A 1-dimensional numerical numpy array with distances to the
        counterfactual examples. It is only used when ``threshold`` is given
        as well.
    threshold : number, optional (default=None)
        A numerical threshold above which a counterfactual instance is too
        far, therefore it is considered to be an exemplar of individual
        unfairness. It is only used when ``distances`` is given as well.

    Raises
    ------
    IncorrectShapeError
        The ``unfair_counterfactuals`` parameter is not a 2-dimensional
        array. The ``distances`` parameter is not a 1-dimensional array.
    RuntimeError
        Either of the required input parameters were not given:
        ``unfair_counterfactuals`` or ``distances`` and ``threshold``.
    TypeError
        The ``threshold`` parameter is not a number.
    ValueError
        The ``distances`` array is not purely numerical.

    Returns
    -------
    counterfactually_unfair : boolean
        ``True`` if there are any counterfactually unfair instances,
        ``False`` otherwise.
    """
    if distances is not None and threshold is not None:
        if not fuav.is_1d_array(distances):
            raise IncorrectShapeError('The distances parameter has to be a '
                                      '1-dimensional array.')
        if not fuav.is_numerical_array(distances):
            raise ValueError('The distances array has to be purely numerical.')
        if not isinstance(threshold, Number):
            raise TypeError('The threshold parameter has to be a number.')

        # ``ndarray.any`` returns a ``numpy.bool_``; convert it so that both
        # branches return a built-in ``bool`` as the signature promises.
        counterfactually_unfair = bool((distances > threshold).any())
    elif unfair_counterfactuals is not None:
        if not fuav.is_2d_array(unfair_counterfactuals):
            raise IncorrectShapeError('The unfair counterfactuals parameter '
                                      'has to be a 2-dimensional numpy array.')

        # Any element in the array means at least one unfair counterfactual.
        counterfactually_unfair = bool(unfair_counterfactuals.size)
    else:
        raise RuntimeError('Either of the two is required to run this '
                           'function: unfair_counterfactuals parameter or '
                           'both distances and threshold parameters.')

    return counterfactually_unfair
def fit(self, X: np.ndarray, y: np.ndarray) -> None:
    """
    Fits the KNN model by memorising the training data and their targets.

    For a classifier the unique labels (ordered lexicographically), their
    counts, their relative frequencies and the majority label are stored
    as well; for a regressor the "majority label" is the mean of ``y``.

    Parameters
    ----------
    X : numpy.ndarray
        The KNN training data.
    y : numpy.ndarray
        The KNN training labels.

    Raises
    ------
    IncorrectShapeError
        Either the ``X`` array is not 2-dimensional, the ``y`` array is not
        1-dimensional, the number of rows in ``X`` is not the same as the
        number of elements in ``y`` or the ``X`` array has 0 rows or 0
        columns.
    PrefittedModelError
        Trying to fit the model when it has already been fitted. Usually
        raised when calling the ``fit`` method for the second time without
        clearing the model first.
    TypeError
        Trying to fit a KNN predictor in a regressor mode with
        non-numerical target variable.
    """
    # pylint: disable=invalid-name
    if self._is_fitted:
        raise PrefittedModelError('This model has already been fitted.')

    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The training data must be a 2-'
                                  'dimensional array.')
    if not fuav.is_1d_array(y):
        raise IncorrectShapeError('The training data labels must be a 1-'
                                  'dimensional array.')

    n_rows = X.shape[0]
    if n_rows == 0:
        raise IncorrectShapeError('The data array has to have at least '
                                  'one data point.')
    # Only classic arrays need the explicit column count check; for
    # structured arrays fuav.is_2d_array already guarantees at least one
    # column.
    if not (fuav.is_structured_array(X) or X.shape[1] != 0):
        raise IncorrectShapeError('The data array has to have at least '
                                  'one feature.')
    if n_rows != y.shape[0]:
        raise IncorrectShapeError('The number of samples in X must be the '
                                  'same as the number of labels in y.')

    if not (self._is_classifier or fuav.is_numerical_array(y)):
        raise TypeError('Regressor can only be fitted for a numerical '
                        'target vector.')

    (self._numerical_indices,
     self._categorical_indices) = fuat.indices_by_type(X)
    self._is_structured = fuav.is_structured_array(X)
    self._X = X
    self._y = y

    if self._is_classifier:
        labels, label_counts = np.unique(y, return_counts=True)
        # Keep the labels (and their counts) in lexicographical order.
        order = np.argsort(labels)
        self._unique_y = labels[order]
        self._unique_y_counts = label_counts[order]

        # Ties for the most frequent label are resolved by taking the
        # lexicographically first one.
        highest_count = np.max(self._unique_y_counts)
        tied_labels = self._unique_y[self._unique_y_counts == highest_count]
        self._majority_label = np.sort(tied_labels)[0]

        self._unique_y_probabilities = self._unique_y_counts / y.shape[0]
    else:
        self._majority_label = y.mean()
        # Label statistics are meaningless for a regressor -- keep them
        # empty.
        self._unique_y = np.empty((0, ))
        self._unique_y_counts = np.empty((0, ))
        self._unique_y_probabilities = np.empty((0, ))

    self._X_n = X.shape[0]
    self._is_fitted = True
def indices_by_type(array: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Identifies indices of columns with numerical and non-numerical values.

    Checks whether a numpy array is purely numerical or a structured array
    and returns two numpy arrays: the first-one with indices of numerical
    columns and the second-one with indices of non-numerical columns. For
    structured arrays the indices are the field names; for classic arrays
    they are integer column positions (a classic array is either entirely
    numerical or entirely non-numerical).

    Parameters
    ----------
    array : numpy.ndarray
        A numpy array to be checked (it has to be a 2-dimensional array).

    Raises
    ------
    TypeError
        The input array is not a numpy array-like object.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.
    IncorrectShapeError
        The input array is not 2-dimensional.

    Returns
    -------
    numerical_indices : numpy.ndarray
        A numpy array containing indices of the numerical columns of the
        input array.
    non_numerical_indices : numpy.ndarray
        A numpy array containing indices of the non-numerical columns of the
        input array.
    """
    if not isinstance(array, np.ndarray):
        raise TypeError('The input should be a numpy array-like.')

    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')

    if not fuav.is_base_array(array):
        raise ValueError('indices_by_type only supports input arrays that '
                         'hold base numpy types, i.e. numerical and '
                         'string-like -- numpy void and object-like types are '
                         'not allowed.')

    if fuav.is_structured_array(array):
        # A structured array with a single field is a valid (one column)
        # 2-D array, so only a field-less dtype is rejected here.
        assert len(array.dtype) > 0, 'This should be a 2D array.'
        numerical_indices_list = []
        non_numerical_indices_list = []

        # Split the field names according to the dtype of each field.
        for column_name in array.dtype.names:
            column_dtype = array.dtype[column_name]
            if fuav.is_numerical_dtype(column_dtype):
                numerical_indices_list.append(column_name)
            else:
                non_numerical_indices_list.append(column_name)

        numerical_indices = np.array(numerical_indices_list)
        non_numerical_indices = np.array(non_numerical_indices_list)
    else:
        # A classic array has a single dtype, hence all of its columns fall
        # into the same group.
        all_columns = np.array(range(array.shape[1]))
        no_columns = np.empty((0, ), dtype='i8')
        if fuav.is_numerical_array(array):
            numerical_indices = all_columns
            non_numerical_indices = no_columns
        else:
            numerical_indices = no_columns
            non_numerical_indices = all_columns

    return numerical_indices, non_numerical_indices
def _validate_input_local_fidelity(
        dataset: np.ndarray,
        data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Union[int, None],
        explained_feature_indices: Union[List[IndexType], None],
        fidelity_radius_percentage: int,
        samples_number: int) -> bool:
    """
    Validates the input parameters for the ``local_fidelity_score`` function.

    The parameters are checked one after another, in the order in which they
    appear in the signature, and the first violation raises an exception.
    The description of the input parameters, errors and exceptions can be
    found in the documentation of the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function. Note that the global predictive function is called once, on
    the first row of the ``dataset``, to establish the type of its output.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid; invalid input results in an
        exception.
    """
    # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
    # Dataset and data row
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type -- numbers '
                        'and/or strings.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError('The data_row must either be a '
                                  '1-dimensional numpy array or a numpy '
                                  'void object for structured data rows.')

    if not fuav.are_similar_dtype_arrays(dataset, np.array([data_row])):
        raise TypeError('The dtype of the data_row is too different from '
                        'the dtype of the dataset array.')

    # A mismatch in the number of features of a structured data row is
    # already detected by the dtype comparison above; classic arrays need an
    # explicit check.
    if (not fuav.is_structured_array(dataset)
            and dataset.shape[1] != data_row.shape[0]):
        raise IncorrectShapeError('The data_row must contain the same '
                                  'number of features as the dataset.')

    # Predictive functions
    if not callable(global_predictive_function):
        raise TypeError('The global_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    if fuv.get_required_parameters_number(global_predictive_function) != 1:
        raise IncompatibleModelError(
            'The global predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    if not callable(local_predictive_function):
        raise TypeError('The local_predictive_function should be a Python '
                        'callable, e.g., a Python function.')
    if fuv.get_required_parameters_number(local_predictive_function) != 1:
        raise IncompatibleModelError(
            'The local predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # Metric function
    if not callable(metric_function):
        raise TypeError('The metric_function should be a Python callable, '
                        'e.g., a Python function.')
    if fuv.get_required_parameters_number(metric_function) != 2:
        raise TypeError('The metric_function must take exactly *two* '
                        'required parameters.')

    # Explained class index -- predict a single data point to learn whether
    # the global model is probabilistic (2-D output) or not (1-D output).
    probe_prediction = global_predictive_function(dataset[:1])
    assert not fuav.is_structured_array(probe_prediction), 'Must be plain.'
    assert probe_prediction.shape[0] == 1, 'Just 1 data point was predicted.'
    if fuav.is_2d_array(probe_prediction):
        # A probabilistic model.
        if explained_class_index is not None:
            if not isinstance(explained_class_index, int):
                raise TypeError('For probabilistic global models, i.e., '
                                'global predictive functions, the '
                                'explained_class_index parameter has to be an '
                                'integer or None.')
            classes_number = probe_prediction.shape[1]
            if not 0 <= explained_class_index < classes_number:
                raise ValueError('The explained_class_index parameter is '
                                 'negative or larger than the number of '
                                 'classes output by the global '
                                 'probabilistic model.')
    elif fuav.is_1d_array(probe_prediction):
        # A classifier or a regressor.
        if explained_class_index is not None:
            warnings.warn(
                'The explained_class_index parameter is not None and will be '
                'ignored since the global model is not probabilistic.',
                UserWarning)
    else:
        assert False, ('Global predictor must output a 1- or 2-dimensional '
                       'numpy array.')  # pragma: nocover

    # Explained feature indices
    if explained_feature_indices is not None:
        if not isinstance(explained_feature_indices, list):
            raise TypeError('The explained_feature_indices parameter must be '
                            'a Python list or None.')
        invalid_indices = fuat.get_invalid_indices(
            dataset, np.asarray(explained_feature_indices))
        if invalid_indices.size:
            raise IndexError(
                'The following column indices are invalid for the input '
                'dataset: {}.'.format(invalid_indices))

    # Fidelity radius
    if not isinstance(fidelity_radius_percentage, int):
        raise TypeError('The fidelity_radius_percentage must be an integer '
                        'between 1 and 100.')
    if not 0 < fidelity_radius_percentage <= 100:
        raise ValueError('The fidelity_radius_percentage must be an '
                         'integer between 1 and 100.')

    # Number of samples
    if not isinstance(samples_number, int):
        raise TypeError('The samples_number must be an integer.')
    if samples_number < 1:
        raise ValueError('The samples_number must be a positive integer.')

    return True
def _get_distances(self, X: np.ndarray) -> np.ndarray: """ Gets distances for a mixture of numerical and categorical features. For numerical columns the distance is calculated as the Euclidean distance. For categorical columns (i.e. non-numerical, e.g. strings) the distance is 0 when the value matches and 1 otherwise. Parameters ---------- X : numpy.ndarray A data array for which distances to the training data will be calculated. Raises ------ AssertionError Raised when the model is not fitted, X is not a 2-dimensional array or X's dtype is different than training data's dtype. It is also raised when the distances matrix is not 2-dimensional. Returns ------- distances : numpy.ndarray An array of distances between X and the training data. """ # pylint: disable=invalid-name assert self._is_fitted, 'Cannot calculate distances on unfitted model.' assert fuav.is_2d_array(X), 'X must be a 2-dimensional array.' assert fuav.are_similar_dtype_arrays(X, self._X), \ 'X must have the same dtype as the training data.' distances_shape = (self._X.shape[0], X.shape[0]) categorical_distances = np.zeros(distances_shape) numerical_distances = np.zeros(distances_shape) if self._is_structured: if self._categorical_indices.size: categorical_distances = fud.binary_array_distance( self._X[self._categorical_indices], X[self._categorical_indices]) if self._numerical_indices.size: numerical_distances = fud.euclidean_array_distance( self._X[self._numerical_indices], X[self._numerical_indices]) else: if self._categorical_indices.size: categorical_distances = fud.binary_array_distance( self._X[:, self._categorical_indices], X[:, self._categorical_indices]) if self._numerical_indices.size: numerical_distances = fud.euclidean_array_distance( self._X[:, self._numerical_indices], X[:, self._numerical_indices]) assert categorical_distances.shape == numerical_distances.shape, \ 'Different number of point-wise distances for these feature types.' 
distances = categorical_distances + numerical_distances assert fuav.is_2d_array(distances), 'Distances matrix must be 2D.' return distances
def _validate_input_dc(data_set: np.ndarray,
                       categorical_indices: Union[None, List[Index]],
                       neighbours: int,
                       distance_function: Union[None, DistanceFunction],
                       normalise_scores: bool) -> bool:
    """
    Checks the input parameters of the ``DensityCheck`` class initialiser.

    Parameters
    ----------
    data_set : numpy.ndarray
        A 2-dimensional numpy array (either classic or structured) of a
        base type.
    categorical_indices : Union[None, List[column index]]
        Either ``None`` or a list of column indices to be treated as
        categorical.
    neighbours : integer
        The number of closest neighbours to be considered.
    distance_function : Union[None, Callable[[data row, data row], number]]
        Either ``None`` or a Python callable that calculates a distance
        between two data points. It takes two 1-dimensional numpy arrays
        (for classic numpy arrays) or numpy voids (for structured numpy
        arrays) of equal length and outputs a number representing the
        distance between them. **The distance function is assumed to
        return the same distance regardless of the order in which the
        input parameters are given.**
    normalise_scores : boolean
        Whether to normalise the scores (``True``) or not (``False``).

    Raises
    ------
    AttributeError
        The distance function does not require exactly 2 non-optional
        parameters.
    IncorrectShapeError
        The ``data_set`` array is not 2-dimensional.
    IndexError
        Some of the provided categorical column indices are invalid for
        the ``data_set`` array.
    TypeError
        The ``data_set`` array is not of a base type (strings and/or
        numbers). The ``categorical_indices`` parameter is neither a
        Python list nor ``None``. The ``neighbours`` parameter is not an
        integer. The ``distance_function`` is neither ``None`` nor a
        Python callable. The ``normalise_scores`` parameter is not a
        boolean.
    ValueError
        The ``neighbours`` parameter is smaller than 1 or larger than the
        number of instances (rows) in the ``data_set`` array.

    Returns
    -------
    is_valid : boolean
        ``True`` when every check passes; an exception is raised
        otherwise.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(data_set):
        raise IncorrectShapeError('The data set should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(data_set):
        raise TypeError('The data set is not of a base type (numbers and/or '
                        'strings.')

    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        bad_indices = fuat.get_invalid_indices(
            data_set, np.asarray(categorical_indices)).tolist()
        if bad_indices:
            raise IndexError('The following indices are invalid for the '
                             'input data set: {}.'.format(bad_indices))

    if not isinstance(neighbours, int):
        raise TypeError('The neighbours number parameter has to be an '
                        'integer.')
    if not 1 <= neighbours <= data_set.shape[0]:
        raise ValueError('The neighbours number parameter has to be '
                         'between 1 and number of data points (rows) in '
                         'the data set array.')

    if distance_function is not None:
        if not callable(distance_function):
            raise TypeError('The distance function should be a Python '
                            '(function).')
        # A parameter is required when it has no default value.
        signature = inspect.signature(distance_function)
        required_count = sum(
            1 for parameter in signature.parameters.values()
            if parameter.default is parameter.empty)
        if required_count != 2:
            raise AttributeError('The distance function must require '
                                 'exactly 2 parameters. Given function '
                                 'requires {} '
                                 'parameters.'.format(required_count))

    if not isinstance(normalise_scores, bool):
        raise TypeError('The normalise scores parameter should be a boolean.')

    return True
def predict(self, X: np.ndarray) -> np.ndarray:
    """
    Predicts the labels (or values) of the data points held in ``X``.

    When the number of neighbours is not smaller than the number of
    training points, the stored majority label is returned for every row.
    Otherwise the neighbours of each row are selected from the distance
    matrix; a classifier returns the most frequent neighbour label and a
    regressor returns the mean of the neighbour labels.

    Parameters
    ----------
    X : numpy.ndarray
        The data for which labels will be predicted.

    Raises
    ------
    IncorrectShapeError
        X is not a 2-dimensional array, it has 0 rows or it has a
        different number of columns than the training data.
    UnfittedModelError
        Raised when trying to predict data when the model has not been
        fitted yet. Try using the ``fit`` method to fit the model first.
    ValueError
        X has a different dtype than the data used to fit the model.

    Returns
    -------
    predictions : numpy.ndarray
        One prediction per row of ``X``.
    """
    # pylint: disable=too-many-locals,too-many-branches
    if not self._is_fitted:
        raise UnfittedModelError('This model has not been fitted yet.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('X must be a 2-dimensional array. If '
                                  'you want to predict a single data '
                                  'point please format it as a single row '
                                  'in a 2-dimensional array.')
    if not fuav.are_similar_dtype_arrays(X, self._X):
        raise ValueError('X must have the same dtype as the training '
                         'data.')
    if not X.shape[0]:
        raise IncorrectShapeError('X must have at least one row.')
    # The column count of a structured array is already covered by the
    # dtype comparison above, so only classic arrays are checked here.
    if not fuav.is_structured_array(X) and X.shape[1] != self._X.shape[1]:
        raise IncorrectShapeError(('X must have the same number of '
                                   'columns as the training data '
                                   '({}).').format(self._X.shape[1]))

    # Too few training points to pick k neighbours from.
    if self._k >= self._X_n:
        return np.array(X.shape[0] * [self._majority_label])

    # If there are 3 nearest neighbours within distances 1, 2 and 2 and
    # k is set to 2, then argpartition will always take the first
    # within distance 2.
    neighbour_order = np.argpartition(
        self._get_distances(X), self._k, axis=0)

    outcome = []
    for order in neighbour_order.T:
        neighbour_labels = self._y[order[:self._k]]

        if not self._is_classifier:
            outcome.append(neighbour_labels.mean())
            continue

        labels, votes = np.unique(neighbour_labels, return_counts=True)
        winners = np.sort(labels[votes == votes.max()])
        assert winners.ndim == 1, 'This should be a flat array.'

        if winners.shape[0] > 1:
            # A tie among the neighbours is resolved with the label
            # counts of the whole training set.
            in_tie = np.logical_or.reduce(
                [self._unique_y == label for label in winners])
            tied_labels = self._unique_y[in_tie]
            tied_counts = self._unique_y_counts[in_tie]
            # Labels that are still tied globally are sorted and the
            # first one wins.
            most_common = tied_labels[tied_counts == np.max(tied_counts)]
            outcome.append(np.sort(most_common)[0])
        else:
            outcome.append(winners[0])

    return np.array(outcome)
def validate_confusion_matrix(confusion_matrix: np.ndarray,
                              label_index: Optional[int] = None) -> bool:
    """
    Checks that a confusion matrix (and optionally a label index) is valid.

    The ``confusion_matrix`` has to be a 2-dimensional, unstructured,
    square numpy array of at least 2x2 size whose dtype kind is ``'i'``.
    When ``label_index`` is given, it has to be an integer that indexes a
    row/column of the matrix.

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        A confusion matrix to be validated.
    label_index : integer, optional (default=None)
        An index whose validity will be checked for the confusion matrix
        (if not ``None``).

    Raises
    ------
    IncorrectShapeError
        The confusion matrix is not a 2-dimensional numpy array, it is not
        square (equal width and height) or it is smaller than 2x2.
    IndexError
        The ``label_index`` (if given) is negative or not smaller than the
        size of the confusion matrix.
    TypeError
        The confusion matrix is not of an integer kind. The
        ``label_index`` is not an integer.
    ValueError
        The confusion matrix is a structured numpy array.

    Returns
    -------
    is_valid : boolean
        ``True`` when every check passes; an exception is raised
        otherwise.
    """
    if not fuav.is_2d_array(confusion_matrix):
        raise IncorrectShapeError('The confusion matrix has to be a '
                                  '2-dimensional numpy array.')
    if fuav.is_structured_array(confusion_matrix):
        raise ValueError('The confusion matrix cannot be a structured numpy '
                         'array.')

    height, width = confusion_matrix.shape
    if height != width:
        raise IncorrectShapeError('The confusion matrix has to be a square '
                                  '(equal width and height) numpy array.')
    if width < 2:
        raise IncorrectShapeError('The confusion matrix needs to be at least '
                                  '2x2.')
    if confusion_matrix.dtype.kind != 'i':
        raise TypeError('The confusion matrix has to be of integer kind.')

    if label_index is not None:
        if not isinstance(label_index, int):
            raise TypeError('The label index has to be an integer.')
        if not 0 <= label_index < height:
            template = ('The label index {} is not a valid index for the '
                        'confusion matrix of shape {}x{}.')
            raise IndexError(template.format(label_index, height, width))

    return True
def predict_proba(self, X: np.ndarray) -> np.ndarray:
    """
    Computes label probabilities for the data points held in ``X``.

    When the number of neighbours is not smaller than the number of
    training points, the stored label probabilities are repeated for every
    row. Otherwise the probability of a label is the fraction of the
    selected neighbours that carry this label.

    Parameters
    ----------
    X : numpy.ndarray
        The data for which labels probabilities will be predicted.

    Raises
    ------
    IncorrectShapeError
        X is not a 2-dimensional array, it has 0 rows or it has a
        different number of columns than the training data.
    UnfittedModelError
        Raised when trying to predict data when the model has not been
        fitted yet. Try using the ``fit`` method to fit the model first.
    RuntimeError
        Raised when trying to use this method when the predictor is
        initialised as a regressor.
    ValueError
        X has a different dtype than the data used to fit the model.

    Returns
    -------
    probabilities : numpy.ndarray
        One row per data point and one column per unique training label,
        in the order in which the labels are stored by the model.
    """
    if not self._is_classifier:
        raise RuntimeError('This functionality is not available for a '
                           'regressor.')
    if not self._is_fitted:
        raise UnfittedModelError('This model has not been fitted yet.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('X must be a 2-dimensional array. If '
                                  'you want to predict a single data '
                                  'point please format it as a single row '
                                  'in a 2-dimensional array.')
    if not fuav.are_similar_dtype_arrays(X, self._X):
        raise ValueError('X must have the same dtype as the training '
                         'data.')
    if not X.shape[0]:
        raise IncorrectShapeError('X must have at least one row.')
    # The column count of a structured array is already covered by the
    # dtype comparison above, so only classic arrays are checked here.
    if not fuav.is_structured_array(X) and X.shape[1] != self._X.shape[1]:
        raise IncorrectShapeError(('X must have the same number of '
                                   'columns as the training data '
                                   '({}).').format(self._X.shape[1]))

    # Too few training points to pick k neighbours from.
    if self._k >= self._X_n:
        return np.tile(self._unique_y_probabilities, (X.shape[0], 1))

    neighbour_order = np.argpartition(
        self._get_distances(X), self._k, axis=0)

    rows = []
    for order in neighbour_order.T:
        neighbour_labels = self._y[order[:self._k]]
        labels, votes = np.unique(neighbour_labels, return_counts=True)
        votes_total = votes.sum()

        row = np.zeros((self._unique_y.shape[0], ))
        for label, vote in zip(labels, votes):
            # Place the vote share in the column of the matching label.
            row[self._unique_y == label] = vote / votes_total
        rows.append(row)

    return np.array(rows)
def _validate_input_lasso_path(dataset: np.ndarray, target: np.ndarray,
                               weights: Union[np.ndarray, None],
                               features_number: Union[int, None],
                               features_percentage: int) -> bool:
    """
    Checks the input parameters of the ``lasso_path`` function.

    The data set has to be a 2-dimensional numerical array and the target
    a 1-dimensional numerical array with one entry per data set row. The
    weights (if given) follow the same rules as the target. The
    ``features_number`` (if given) has to be a positive integer and the
    ``features_percentage`` an integer between 0 and 100 (inclusive).

    For the input parameter description, warnings and exceptions please
    see the documentation of the
    :func:`fatf.utils.data.feature_selection.sklearn.lasso_path` function.

    Returns
    -------
    input_is_valid : boolean
        ``True`` when every check passes; an exception is raised
        otherwise.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input data set must be a 2-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(dataset):
        raise TypeError('The input data set must be purely numerical. (The '
                        'lasso path feature selection is based on '
                        'sklearn.linear_model.lars_path function.)')

    if not fuav.is_1d_array(target):
        raise IncorrectShapeError('The target array must be a 1-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(target):
        raise TypeError('The target array must be numerical since this '
                        'feature selection method is based on Lasso '
                        'regression.')
    samples_count = dataset.shape[0]
    if target.shape[0] != samples_count:
        raise IncorrectShapeError('The number of labels in the target array '
                                  'must agree with the number of samples in '
                                  'the data set.')

    if weights is not None:
        if not fuav.is_1d_array(weights):
            raise IncorrectShapeError('The weights array must 1-dimensional.')
        if not fuav.is_numerical_array(weights):
            raise TypeError('The weights array must be purely numerical.')
        if weights.shape[0] != samples_count:
            raise IncorrectShapeError('The number of weights in the weights '
                                      'array must be the same as the number '
                                      'of samples in the input data set.')

    if features_number is not None:
        if not isinstance(features_number, int):
            raise TypeError('The features_number parameter must be an '
                            'integer.')
        if features_number < 1:
            raise ValueError('The features_number parameter must be a '
                             'positive integer.')

    if not isinstance(features_percentage, int):
        raise TypeError('The feature_percentage parameter must be an integer.')
    if not 0 <= features_percentage <= 100:
        raise ValueError('The feature_percentage parameter must be between 0 '
                         'and 100 (inclusive).')

    return True
def occlude_segments_vectorised(
        self,
        vectorised_segments_subset: np.ndarray,
        image: Optional[np.ndarray] = None,
        colour: Optional[Union[str, int, RGBcolour]] = None
) -> np.ndarray:
    """
    Generates image(s) with the selected subset(s) of segments occluded.

    The segments to be occluded are provided as binary vectors; either a
    1-D numpy array of length equal to the number of segments to produce a
    single occluded image, or a 2-D array where each row represents a
    separate occlusion pattern. In this format the n-th element or column
    corresponds to the n+1 segment id; 1 indicates that the segment should
    be preserved and 0 that it should be occluded.

    The occlusion is applied on top of the image used to initialise this
    class; alternatively, an external ``image`` of the same shape can be
    supplied. If a colouring strategy different to the one of the class is
    desired, it can be specified via the ``colour`` parameter.

    Parameters
    ----------
    vectorised_segments_subset : numpy.ndarray
        A 1-D binary occlusion vector of the length equal to the number of
        segments or a 2-D binary matrix of the (number of occlusion images
        to generate X number of segments) shape. The array has to be an
        unstructured numerical array holding only 0s and 1s.
    image : numpy.ndarray, optional (default=None)
        If provided, this ``image`` will be occluded instead of the one
        used to initialise this class.
    colour : string, integer, tuple(integer, integer, integer), optional
        A colour specifier (default=None). By default the colouring
        strategy of the class is used. See the documentation of the
        :func:`fatf.utils.data.occlusion.Occlusion.set_colouring_strategy`
        method for more details.

    Raises
    ------
    IncorrectShapeError
        The ``vectorised_segments_subset`` numpy array is neither 1- nor
        2-dimensional. The number of elements in
        ``vectorised_segments_subset`` (when it is 1-D) does not
        correspond to the number of segments. The number of columns in
        ``vectorised_segments_subset`` (when it is 2-D) does not
        correspond to the number of segments. The shape of the input
        ``image`` does not agree with the shape of the class image.
    TypeError
        The ``vectorised_segments_subset`` numpy array is structured, is
        not numerical or holds values other than 0 and 1.

    Returns
    -------
    image_occluded : numpy.ndarray
        A numpy array holding the image(s) with the selected subset(s) of
        segments occluded. A single image is returned for a 1-D input (or
        a 2-D input with one row); otherwise the images are stacked along
        the first axis.
    """
    # pylint: disable=too-many-branches
    if image is None:
        canvas = self.image
    else:
        assert (  # yapf: disable
            fuds._validate_image_array(  # pylint: disable=protected-access
                image, 'image')), 'Invalid image.'
        if image.shape != self.image.shape:
            raise IncorrectShapeError(
                'The width, height or number of channels of the input '
                'image does not agree with the same parameters of the '
                'original image.')
        canvas = image

    if colour is None:
        colouring_strategy = self._colouring_strategy
    else:
        colouring_strategy = self._generate_colouring_strategy(colour)

    if fuav.is_structured_array(vectorised_segments_subset):
        raise TypeError('The vector representation of segments cannot be '
                        'a structured numpy array.')
    if not fuav.is_numerical_array(vectorised_segments_subset):
        raise TypeError('The vector representation of segments should be '
                        'a numerical numpy array.')

    if fuav.is_1d_array(vectorised_segments_subset):
        if vectorised_segments_subset.shape[0] != self.segments_number:
            raise IncorrectShapeError(
                ('The number of elements ({}) in the vector '
                 'representation of segments should correspond to the '
                 'unique number of segments ({}).').format(
                     vectorised_segments_subset.shape[0],
                     self.segments_number))
        samples = 1
        # Promote the single vector to a one-row matrix so that the loop
        # below handles both input forms.
        vectorised_segments_subset = np.asarray(
            [vectorised_segments_subset])
    elif fuav.is_2d_array(vectorised_segments_subset):
        if vectorised_segments_subset.shape[1] != self.segments_number:
            raise IncorrectShapeError(
                ('The number of columns ({}) in the vector representation '
                 'of segments should correspond to the unique number of '
                 'segments ({}).').format(
                     vectorised_segments_subset.shape[1],
                     self.segments_number))
        samples = vectorised_segments_subset.shape[0]
    else:
        raise IncorrectShapeError(
            'The vector representation of segments should be a 1- or '
            '2-dimensional numpy array.')

    # The unique values are compared as they are, without an integer
    # cast: a cast would truncate fractional entries (e.g., 0.5 -> 0) and
    # let non-binary vectors through the check.
    non_binary_entries = set(
        np.unique(vectorised_segments_subset).tolist()).difference((0, 1))
    if non_binary_entries:
        raise TypeError('The vector representation of segments should be '
                        'binary numpy array.')

    # One copy of the canvas per occlusion pattern.
    image_occluded = np.repeat(canvas[np.newaxis, :], samples, axis=0)
    for i, vec in enumerate(vectorised_segments_subset):
        # Get ids of segments to be occluded (0s) from a vector form
        # 1 is added as segments are numbered from 1, not 0
        segments_subset = np.where(vec == 0)[0] + 1
        occlusion_mask = fuds.get_segment_mask(segments_subset.tolist(),
                                               self.segments)
        image_occluded[i, occlusion_mask] = colouring_strategy(
            occlusion_mask)

    if samples == 1:
        image_occluded = image_occluded[0]

    return image_occluded