Esempio n. 1
0
    def _validate_data_point(self, data_point: DataRow, clip: bool) -> bool:
        """
        Checks the arguments passed to the ``score_data_point`` method.

        Parameters
        ----------
        data_point : Union[numpy.array, numpy.void]
            A single data row: a numpy ndarray for classic numpy arrays or a
            numpy void for structured numpy arrays.
        clip : boolean
            The ``clip`` argument of ``score_data_point``; only its type is
            checked here.

        Raises
        ------
        IncorrectShapeError
            The data point is not 1-dimensional (a numpy ndarray for classic
            arrays or a numpy void for structured arrays), or -- for classic
            arrays -- it has a different number of columns (features) than
            the data set used to initialise this class.
        TypeError
            The data point is not of a base type (strings and/or numbers);
            its dtype is too different from the dtype of the data set used to
            initialise this class; or ``clip`` is not a boolean.

        Returns
        -------
        is_valid : boolean
            Always ``True`` when the function returns; any failed check
            raises one of the exceptions listed above instead.
        """
        if not fuav.is_1d_like(data_point):
            raise IncorrectShapeError(
                'The data point has to be 1-dimensional numpy array or numpy '
                'void (for structured arrays).')

        # The row is wrapped in a list so that it can be compared against the
        # data set as a one-row array.
        row_as_array = np.asarray([data_point])

        if not fuav.is_base_array(row_as_array):
            raise TypeError(
                'The data point has to be of a base type (strings and/or '
                'numbers).')

        dtypes_agree = fuav.are_similar_dtype_arrays(self.data_set,
                                                     row_as_array)
        if not dtypes_agree:
            raise TypeError(
                'The dtypes of the data set used to initialise this class and '
                'the provided data point are too different.')

        # The column count is compared only for unstructured data; for
        # structured arrays a differing number of columns is already picked
        # up by the dtype comparison above.
        column_mismatch = (
            not self._is_structured
            and self.data_set.shape[1] != row_as_array.shape[1])
        if column_mismatch:
            raise IncorrectShapeError(
                'The data point has different number of columns (features) '
                'than the data set used to initialise this class.')

        if not isinstance(clip, bool):
            raise TypeError('The clip parameter has to be a boolean.')

        return True
Esempio n. 2
0
def _validate_get_distance(
        data_array: np.ndarray,
        distance_function: Callable[[np.ndarray, np.ndarray], float]) -> bool:
    """
    Checks the ``data_array`` and ``distance_function`` arguments.

    Parameters
    ----------
    data_array : numpy.ndarray
        A 2-dimensional numpy array.
    distance_function : Callable[[numpy.ndarray, numpy.ndarray], number]
        A Python function that takes two 1-dimensional numpy arrays of equal
        length and returns a number representing the distance between them.

    Raises
    ------
    AttributeError
        The distance function does not require exactly two parameters. Every
        parameter of the signature that has no default value is counted.
    IncorrectShapeError
        The data array is not a 2-dimensional numpy array.
    TypeError
        The data array is not of a base type (numbers and/or strings). The
        distance function is not a Python callable (function).

    Returns
    -------
    is_valid : boolean
        Always ``True`` when the function returns; invalid input raises one of
        the exceptions listed above instead.
    """
    if not fuav.is_2d_array(data_array):
        raise IncorrectShapeError(
            'The data_array has to be a 2-dimensional (structured or '
            'unstructured) numpy array.')
    if not fuav.is_base_array(data_array):
        raise TypeError(
            'The data_array has to be of a base type (strings and/or '
            'numbers).')

    if not callable(distance_function):
        raise TypeError(
            'The distance function should be a Python callable (function).')

    # A parameter counts as required when its signature entry has no default.
    signature_params = inspect.signature(distance_function).parameters
    required_param_n = sum(
        1 for parameter in signature_params.values()
        if parameter.default is parameter.empty)
    if required_param_n != 2:
        raise AttributeError(
            'The distance function must require exactly 2 parameters. Given '
            'function requires {} parameters.'.format(required_param_n))

    return True
Esempio n. 3
0
def _input_is_valid(dataset: np.ndarray,
                    model: object,
                    feature_index: Union[int, str],
                    treat_as_categorical: Optional[bool],
                    steps_number: Optional[int]) -> bool:  # yapf: disable
    """
    Checks the input of the Individual Conditional Expectation function.

    The parameters, warnings and exceptions are described in the documentation
    of the :func:`fatf.transparency.model.feature_influence.
    individual_conditional_expectation` function.

    Returns
    -------
    is_input_ok : boolean
        Always ``True`` when the function returns; invalid input raises an
        exception instead.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError(
            'The input dataset must be a 2-dimensional array.')

    if not fuav.is_base_array(dataset):
        raise ValueError(
            'The input dataset must only contain base types (textual and '
            'numerical).')

    model_is_probabilistic = fumv.check_model_functionality(
        model, require_probabilities=True)
    if not model_is_probabilistic:
        raise IncompatibleModelError(
            'This functionality requires the model to be capable of '
            'outputting probabilities via predict_proba method.')

    index_is_valid = fuat.are_indices_valid(dataset,
                                            np.array([feature_index]))
    if not index_is_valid:
        raise IndexError(
            'Provided feature index is not valid for the input dataset.')

    # ``None`` is accepted as is; anything else must be an integer >= 2.
    if steps_number is not None:
        if not isinstance(steps_number, int):
            raise TypeError(
                'steps_number parameter has to either be None or an integer.')
        if steps_number < 2:
            raise ValueError('steps_number has to be at least 2.')

    if (treat_as_categorical is not None
            and not isinstance(treat_as_categorical, bool)):
        raise TypeError(
            'treat_as_categorical has to either be None or a boolean.')

    return True
def _validate_input_drm(dataset: np.ndarray, data_row: Union[np.ndarray,
                                                             np.void]) -> bool:
    """
    Checks :func:`fatf.utils.data.transformation.dataset_row_masking` input.

    The ``dataset`` has to be a 2-dimensional array of a base type and the
    ``data_row`` a 1-dimensional row whose dtype is similar to the dataset's
    dtype and whose length equals the number of columns in the ``dataset``.

    The input parameters, warnings and exceptions are described in the
    documentation of the
    :func:`fatf.utils.data.transformation.dataset_row_masking` function.

    Returns
    -------
    is_valid : boolean
        Always ``True`` when the function returns; invalid input raises an
        exception instead.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError(
            'The input dataset must be a 2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError(
            'The input dataset must be of a base type -- text, numbers or '
            'mixture of the two.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError(
            'The data row must either be a 1-dimensional numpy array or a '
            'numpy void object for structured rows.')

    # For structured arrays this dtype comparison also covers the number of
    # columns.
    row_as_array = np.array([data_row])
    if not fuav.are_similar_dtype_arrays(
            dataset, row_as_array, strict_comparison=False):
        raise TypeError(
            'The dtype of the data row is too different from the dtype of '
            'the dataset provided.')

    # The dtypes agree, so the row and the data set are either both
    # structured or both plain; only the plain case needs a length check.
    dataset_is_plain = not fuav.is_structured_array(dataset)
    if dataset_is_plain and dataset.shape[1] != data_row.shape[0]:
        raise IncorrectShapeError(
            'The data row must contain the same number of elements as the '
            'number of columns in the provided dataset.')

    return True
def _validate_input(dataset: np.ndarray, explain_instance: Callable,
                    sample_size: int, explanations_number: int) -> bool:
    """
    Checks the input of submodular pick.

    The input parameters, warnings and exceptions are described in the
    documentation of the :func:`fatf.transparency.models.submodular_pick`
    function.

    Returns
    -------
    is_valid : boolean
        Always ``True`` when the function returns; invalid input raises an
        exception instead.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError(
            'The input data set must be a 2-dimensional array.')
    if not fuav.is_base_array(dataset):
        raise ValueError(
            'The input data set must only contain base types (strings and '
            'numbers).')

    if not isinstance(sample_size, int):
        raise TypeError('sample_size must be an integer.')
    if sample_size < 0:
        raise ValueError('sample_size must be a non-negative integer.')

    if not isinstance(explanations_number, int):
        raise TypeError('explanations_number must be an integer.')
    if explanations_number < 0:
        raise ValueError('explanations_number must be a non-negative integer.')

    # The two counts are compared only when both are non-zero.
    both_given = bool(sample_size) and bool(explanations_number)
    if both_given and sample_size < explanations_number:
        raise ValueError(
            'The number of explanations cannot be larger than the number of '
            'samples.')

    if not callable(explain_instance):
        raise TypeError(
            'The explain_instance should be a Python callable (function or '
            'method).')
    if fuv.get_required_parameters_number(explain_instance) != 1:
        raise RuntimeError(
            'The explain_instance callable must accept exactly one required '
            'parameter.')

    return True
Esempio n. 6
0
def fatf_structured_to_unstructured(
        structured_array: np.ndarray) -> np.ndarray:
    """
    Converts a structured array into a plain array of the most generic type.

    If the input array is purely numerical, the output array is of the most
    generic numerical type. Otherwise, the output array is converted to a
    string type.

    Parameters
    ----------
    structured_array : numpy.ndarray
        A structured numpy array to be converted into a plain numpy array.

    Raises
    ------
    TypeError
        The input array is not a structured numpy array.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.

    Returns
    -------
    classic_array : numpy.ndarray
        A classic numpy array representation of the ``structured_array`` with
        the most generic type out of the input array's dtypes.
    """
    if not fuav.is_structured_array(structured_array):
        raise TypeError('structured_array should be a structured numpy array.')
    if not fuav.is_base_array(structured_array):
        raise ValueError('fatf_structured_to_unstructured only supports '
                         'conversion of arrays that hold base numpy types, '
                         'i.e. numerical and string-like -- numpy void and '
                         'object-like types are not allowed.')

    column_names = structured_array.dtype.names
    # pylint: disable=len-as-condition
    assert len(column_names) != 0, 'This should be structured.'

    if fuav.is_numerical_array(structured_array):
        if structured_array.size:
            # Let numpy pick the common type of the values in the first row.
            dtype = np.array(list(structured_array[0])).dtype
        else:
            # An empty array has no first row to inspect (indexing it would
            # raise an IndexError), so the field dtypes are promoted instead.
            dtype = np.result_type(
                *[structured_array.dtype[name] for name in column_names])
    else:
        dtype = str

    # Cast every column to the common type and glue them side by side.
    dtyped_columns = [
        structured_array[name].astype(dtype) for name in column_names
    ]
    classic_array = np.column_stack(dtyped_columns)
    return classic_array
Esempio n. 7
0
def _validate_input_dc(data_set: np.ndarray,
                       categorical_indices: Union[None, List[Index]],
                       neighbours: int,
                       distance_function: Union[None, DistanceFunction],
                       normalise_scores: bool) -> bool:
    """
    Validates ``DensityCheck`` class initialiser's input parameters.

    Parameters
    ----------
    data_set : numpy.ndarray
        A 2-dimensional numpy array (either classic or structured) of a base
        type.
    categorical_indices : Union[None, List[column index]],
        Either ``None`` or a list of column indices to be treated as
        categorical.
    neighbours : integer
        The number of closest neighbours to be considered.
    distance_function : Union[None, Callable[[data row, data row], number]]
        Either ``None`` or a Python function that calculates a distance between
        two data points. This function takes as an input two 1-dimensional
        numpy arrays (for classic numpy arrays) or numpy voids (for structured
        numpy arrays) of equal length and outputs a number representing a
        distance between them. **The distance function is assumed to return the
        same distance regardless of the order in which the input parameters are
        given.**
    normalise_scores : boolean
        A boolean parameter indicating whether to normalise the scores
        (``True``) or not (``False``).

    Raises
    ------
    AttributeError
        The distance function does not require exactly 2 non-optional
        parameters. Every parameter of the signature that has no default
        value is counted.
    IncorrectShapeError
        The ``data_set`` array is not 2-dimensional.
    IndexError
        Some of the provided categorical column indices are invalid for the
        ``data_set`` array.
    TypeError
        The ``data_set`` array is not of a base type (strings and/or numbers).
        The ``neighbours`` parameter is not an integer. The
        ``distance_function`` is neither ``None`` nor Python callable (a
        function). The ``normalise_scores`` parameter is not a boolean. The
        ``categorical_indices`` parameter is not a Python list.
    ValueError
        The ``neighbours`` parameter is smaller than 1 or larger than the
        number of instances (rows) in the ``data_set`` array.

    Returns
    -------
    is_valid : boolean
        Always ``True`` when the function returns; invalid input raises one of
        the exceptions listed above instead.
    """
    # pylint: disable=too-many-branches
    is_valid = False

    if not fuav.is_2d_array(data_set):
        raise IncorrectShapeError('The data set should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_base_array(data_set):
        raise TypeError('The data set is not of a base type (numbers and/or '
                        'strings).')

    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')
        invalid_indices = fuat.get_invalid_indices(
            data_set, np.asarray(categorical_indices)).tolist()
        if invalid_indices:
            raise IndexError('The following indices are invalid for the '
                             'input data set: {}.'.format(invalid_indices))

    if not isinstance(neighbours, int):
        raise TypeError('The neighbours number parameter has to be an '
                        'integer.')
    if neighbours < 1 or neighbours > data_set.shape[0]:
        raise ValueError('The neighbours number parameter has to be '
                         'between 1 and number of data points (rows) in '
                         'the data set array.')

    if distance_function is not None:
        if not callable(distance_function):
            raise TypeError('The distance function should be a Python '
                            'callable (function).')
        # A parameter counts as required when its signature entry has no
        # default value.
        params = inspect.signature(distance_function).parameters
        required_param_n = sum(
            1 for param in params.values() if param.default is param.empty)
        if required_param_n != 2:
            raise AttributeError('The distance function must require '
                                 'exactly 2 parameters. Given function '
                                 'requires {} '
                                 'parameters.'.format(required_param_n))

    if not isinstance(normalise_scores, bool):
        raise TypeError('The normalise scores parameter should be a boolean.')

    is_valid = True
    return is_valid
Esempio n. 8
0
def _validate_input(dataset: np.ndarray,
                    ground_truth: Optional[np.ndarray] = None,
                    categorical_indices: Optional[List[Index]] = None,
                    int_to_float: bool = True) -> bool:
    """
    Checks the input parameters of an arbitrary augmentation class.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be used for sampling.
    ground_truth : numpy.ndarray, optional (default=None)
        A 1-dimensional numpy array with labels for the supplied dataset.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    int_to_float : boolean, optional (default=True)
        If ``True``, all of the integer dtype columns in the ``dataset`` will
        be generalised to ``numpy.float64`` type. Otherwise, integer type
        columns will remain integer and floating point type columns will remain
        floating point. Only the type of this flag is checked here.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array. The
        ``ground_truth`` array is not a 1-dimensional numpy array. The number
        of ground truth annotations differs from the number of rows in the
        data array.
    IndexError
        Some of the column indices given in the ``categorical_indices``
        parameter are not valid for the input ``dataset``.
    TypeError
        The ``categorical_indices`` parameter is neither a list nor ``None``.
        The ``dataset`` or the ``ground_truth`` array (if not ``None``) are not
        of base (numerical and/or string) type. The ``int_to_float`` parameter
        is not a boolean.

    Returns
    -------
    is_valid : boolean
        Always ``True`` when the function returns; invalid input raises one of
        the exceptions listed above instead.
    """
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError(
            'The input dataset must be a 2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    labels_given = ground_truth is not None
    if labels_given:
        if not fuav.is_1d_array(ground_truth):
            raise IncorrectShapeError(
                'The ground_truth array must be 1-dimensional. (Or None if '
                'it is not required.)')
        if not fuav.is_base_array(ground_truth):
            raise TypeError('The ground_truth array must be of a base type.')
        # One label is expected per data point.
        if ground_truth.shape[0] != dataset.shape[0]:
            raise IncorrectShapeError(
                'The number of labels in the ground_truth array is not equal '
                'to the number of data points in the dataset array.')

    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError(
                'The categorical_indices parameter must be a Python list or '
                'None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(categorical_indices))
        if bad_indices.size:
            raise IndexError(
                'The following indices are invalid for the input dataset: '
                '{}.'.format(bad_indices))

    if not isinstance(int_to_float, bool):
        raise TypeError('The int_to_float parameter has to be a boolean.')

    return True
Esempio n. 9
0
def as_unstructured(
        array_like: Union[np.ndarray, np.void],
        **kwargs: Optional[np.dtype]) -> Union[np.dtype, np.ndarray]:
    """
    Returns an unstructured representation of an array-like object.

    An unstructured base-type array is returned untouched. A structured array
    is handed over to ``structured_to_unstructured`` and a row of a structured
    array (numpy void) to ``structured_to_unstructured_row``.

    .. warning:: The converters used here may be either local implementations
       or numpy built-ins, which can lead to inconsistent behaviour. A known
       example is the conversion of arrays that contain ``'V'`` -- raw data
       (void), ``'O'`` -- (Python) objects, ``'M'`` -- datetime or ``'m'`` --
       timedelta dtypes: these are not supported by the local implementation,
       whereas some of them, e.g. ``'V'``, are supported by the numpy
       built-in.

    Parameters
    ----------
    array_like : Union[numpy.ndarray, numpy.void]
        An array, a structured array or a row of a structured numpy array to be
        converted into a plain numpy array representation.
    **kwargs : Optional[numpy.dtype]
        Named parameters forwarded to the structured to unstructured
        converter that is called -- see either
        :func:`fatf.utils.array.tools.structured_to_unstructured_row` or
        :func:`fatf.utils.array.tools.structured_to_unstructured` documentation
        for details. They are not used for unstructured input.

    Raises
    ------
    TypeError
        The input array is not a numpy array, a structured numpy array or a row
        of a structured numpy array.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.

    Returns
    -------
    classic_array : Union[numpy.dtype, numpy.ndarray]
        A classic numpy array or numpy dtype (in case the structured row has
        just one element) representation of the input with the most generic
        type out of the input's dtypes.
    """
    # A row of a structured array.
    if isinstance(array_like, np.void):
        assert fuav.is_structured_row(array_like), \
            'numpy.void has to be a row of a structured numpy array.'
        return structured_to_unstructured_row(array_like, **kwargs)

    if not isinstance(array_like, np.ndarray):
        raise TypeError(
            'The input should either be a numpy (structured or unstructured) '
            'array-like object (numpy.ndarray) or a row of a structured numpy '
            'array (numpy.void).')

    # A structured array.
    if fuav.is_structured_array(array_like):
        return structured_to_unstructured(array_like, **kwargs)

    # A classic array: nothing to convert as long as it holds base types.
    if not fuav.is_base_array(array_like):
        raise ValueError(
            'as_unstructured only supports conversion of arrays that hold '
            'base numpy types, i.e. numerical and string-like -- numpy void '
            'and object-like types are not allowed.')
    return array_like
Esempio n. 10
0
def indices_by_type(array: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Splits column indices into numerical and non-numerical ones.

    For a structured array every column is classified by its own dtype and the
    indices are the column names. For a classic array all of the columns fall
    into one group: numerical if the array is numerical, non-numerical
    otherwise; the indices are consecutive integers starting at 0.

    Parameters
    ----------
    array : numpy.ndarray
        A numpy array to be checked (it has to be a 2-dimensional array).

    Raises
    ------
    TypeError
        The input array is not a numpy array-like object.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.
    IncorrectShapeError
        The input array is not 2-dimensional.

    Returns
    -------
    numerical_indices : numpy.ndarray
        A numpy array containing indices of the numerical columns of the input
        array.
    non_numerical_indices : numpy.ndarray
        A numpy array containing indices of the non-numerical columns of the
        input array.
    """
    if not isinstance(array, np.ndarray):
        raise TypeError('The input should be a numpy array-like.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_base_array(array):
        raise ValueError(
            'indices_by_type only supports input arrays that hold base numpy '
            'types, i.e. numerical and string-like -- numpy void and '
            'object-like types are not allowed.')

    if fuav.is_structured_array(array):
        assert len(array.dtype) > 1, 'This should be a 2D array.'

        # Classify every column once, then split the names by the outcome.
        names = array.dtype.names
        is_numerical = [
            fuav.is_numerical_dtype(array.dtype[name]) for name in names
        ]
        numerical_names = [
            name for name, flag in zip(names, is_numerical) if flag
        ]
        other_names = [
            name for name, flag in zip(names, is_numerical) if not flag
        ]
        return np.array(numerical_names), np.array(other_names)

    # A classic array has a single dtype, so all columns share one group and
    # the other group is an empty integer array.
    array_is_numerical = fuav.is_numerical_array(array)
    if array_is_numerical:
        numerical_indices = np.array(range(array.shape[1]))
        non_numerical_indices = np.empty((0, ), dtype='i8')
    else:
        numerical_indices = np.empty((0, ), dtype='i8')
        non_numerical_indices = np.array(range(array.shape[1]))
    return numerical_indices, non_numerical_indices
Esempio n. 11
0
def _validate_input_local_fidelity(
        dataset: np.ndarray, data_row: Union[np.ndarray, np.void],
        global_predictive_function: PredictiveFunctionType,
        local_predictive_function: PredictiveFunctionType,
        metric_function: Callable[[np.ndarray, np.ndarray], float],
        explained_class_index: Union[int, None],
        explained_feature_indices: Union[List[IndexType], None],
        fidelity_radius_percentage: int, samples_number: int) -> bool:
    """
    Checks the input parameters of the ``local_fidelity_score`` function.

    The parameters, errors and exceptions are described in the documentation
    of the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function.

    To decide how ``explained_class_index`` should be treated, the global
    predictive function is called once on the first row of the ``dataset``:
    a 2-dimensional output is treated as probabilistic and a 1-dimensional
    one as non-probabilistic.

    Returns
    -------
    is_input_ok : boolean
        Always ``True`` when the function returns; invalid input raises an
        exception instead.
    """
    # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError(
            'The input dataset must be a 2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError(
            'The input dataset must be of a base type -- numbers and/or '
            'strings.')

    if not fuav.is_1d_like(data_row):
        raise IncorrectShapeError(
            'The data_row must either be a 1-dimensional numpy array or a '
            'numpy void object for structured data rows.')

    if not fuav.are_similar_dtype_arrays(dataset, np.array([data_row])):
        raise TypeError(
            'The dtype of the data_row is too different from the dtype of '
            'the dataset array.')

    # For a structured dataset a different number of features in the
    # data_row is caught by the dtype check above; classic numpy arrays need
    # a separate length comparison.
    dataset_is_plain = not fuav.is_structured_array(dataset)
    if dataset_is_plain and dataset.shape[1] != data_row.shape[0]:
        raise IncorrectShapeError(
            'The data_row must contain the same number of features as the '
            'dataset.')

    # Global predictor
    if not callable(global_predictive_function):
        raise TypeError(
            'The global_predictive_function should be a Python callable, '
            'e.g., a Python function.')
    if fuv.get_required_parameters_number(global_predictive_function) != 1:
        raise IncompatibleModelError(
            'The global predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # Local predictor
    if not callable(local_predictive_function):
        raise TypeError(
            'The local_predictive_function should be a Python callable, '
            'e.g., a Python function.')
    if fuv.get_required_parameters_number(local_predictive_function) != 1:
        raise IncompatibleModelError(
            'The local predictive function must have exactly *one* '
            'required parameter to work with this metric.')

    # Metric
    if not callable(metric_function):
        raise TypeError(
            'The metric_function should be a Python callable, e.g., a Python '
            'function.')
    if fuv.get_required_parameters_number(metric_function) != 2:
        raise TypeError(
            'The metric_function must take exactly *two* required '
            'parameters.')

    # Explained class index -- probe the global predictor with a single row.
    probe = global_predictive_function(dataset[:1])
    assert not fuav.is_structured_array(probe), 'Must be plain.'
    assert probe.shape[0] == 1, 'Just 1 data point was predicted.'
    class_index_given = explained_class_index is not None
    if fuav.is_2d_array(probe):  # A probabilistic model.
        if class_index_given:
            if not isinstance(explained_class_index, int):
                raise TypeError(
                    'For probabilistic global models, i.e., global '
                    'predictive functions, the explained_class_index '
                    'parameter has to be an integer or None.')
            if not 0 <= explained_class_index < probe.shape[1]:
                raise ValueError(
                    'The explained_class_index parameter is negative or '
                    'larger than the number of classes output by the global '
                    'probabilistic model.')
    elif fuav.is_1d_array(probe):
        if class_index_given:
            warnings.warn(
                'The explained_class_index parameter is not None and will be '
                'ignored since the global model is not probabilistic.',
                UserWarning)
    else:
        assert False, ('Global predictor must output a 1- or 2-dimensional '
                       'numpy array.')  # pragma: nocover

    # Explained feature indices
    if explained_feature_indices is not None:
        if not isinstance(explained_feature_indices, list):
            raise TypeError(
                'The explained_feature_indices parameter must be a Python '
                'list or None.')
        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(explained_feature_indices))
        if bad_indices.size:
            raise IndexError(
                'The following column indices are invalid for the input '
                'dataset: {}.'.format(bad_indices))

    # Fidelity radius
    if not isinstance(fidelity_radius_percentage, int):
        raise TypeError(
            'The fidelity_radius_percentage must be an integer between 1 and '
            '100.')
    if not 0 < fidelity_radius_percentage <= 100:
        raise ValueError(
            'The fidelity_radius_percentage must be an integer between 1 and '
            '100.')

    # Number of samples
    if not isinstance(samples_number, int):
        raise TypeError('The samples_number must be an integer.')
    if samples_number < 1:
        raise ValueError('The samples_number must be a positive integer.')

    return True
Esempio n. 12
0
def describe_array(
        array: np.ndarray,
        include: Optional[Union[str, int, List[Union[str, int]]]] = None,
        exclude: Optional[Union[str, int, List[Union[str, int]]]] = None,
        **kwargs: bool
) -> Dict[Union[str, int],
          Union[str, int, float, bool, np.ndarray,
                Dict[str, Union[str, int, float, bool, np.ndarray]]]
          ]:  # yapf: disable
    """
    Builds a description of the columns held in the input array.

    Numerical columns are summarised with
    :func:`fatf.transparency.data.describe_functions.describe_numerical_array`
    and categorical (textual) columns with :func:`fatf.transparency.data.\
describe_functions.describe_categorical_array`; consult the documentation of
    these two functions for the content of each description.

    Every column is described by default. The selection can be narrowed down
    with the ``include`` and ``exclude`` parameters, each of which accepts: a
    list of column indices; a single column index (a string or an integer);
    or one of the ``'numerical'`` and ``'categorical'`` keywords, which stand
    for all of the numerical or all of the categorical columns respectively.
    The ``include`` filter is applied first and the ``exclude`` filter second.

    Parameters
    ----------
    array : numpy.ndarray
        The array to be described.
    include : Union[str, int, List[Union[str, int]]], optional (default=None)
        The columns to be described: a list of column indices, a single
        column index (a string or an integer) or one of the ``'numerical'``
        and ``'categorical'`` keywords. If ``None`` (the default value), all
        of the columns are included.
    exclude : Union[str, int, List[Union[str, int]]], optional (default=None)
        The columns to be left out of the description: a list of column
        indices, a single column index (a string or an integer) or one of the
        ``'numerical'`` and ``'categorical'`` keywords. If ``None`` (the
        default value), none of the columns are excluded.
    **kwargs : bool
        Keyword arguments forwarded to the :func:`fatf.transparency.\
data.describe_functions.describe_numerical_array` function whenever a
        numerical array or column is described.

    Warns
    -----
    UserWarning
        The input array is 1-dimensional and either ``include`` or ``exclude``
        is not ``None``; both parameters are ignored in this case.

    Raises
    ------
    IncorrectShapeError
        The input array is neither 1- nor 2-dimensional.
    RuntimeError
        None of the columns were selected to be described.
    ValueError
        The input array is not of a base type (textual and numerical elements).
        The input array has 0 columns.

    Returns
    -------
    description : Dict[Union[str, int], Dict[str, \
Union[str, int, float, bool, np.ndarray]]]
        For a 2-dimensional array, a dictionary that maps the index of every
        selected column to the description of that column. For a 1-dimensional
        array, the description of the array itself.
    """
    # pylint: disable=too-many-locals,too-many-branches
    one_dimensional = fuav.is_1d_like(array)
    # The 2-dimensional test is only needed when the array is not 1D-like.
    two_dimensional = False if one_dimensional else fuav.is_2d_array(array)

    if not (one_dimensional or two_dimensional):
        raise IncorrectShapeError('The input array should be 1- or '
                                  '2-dimensional.')

    if one_dimensional:
        array = fuat.as_unstructured(array)

    if not fuav.is_base_array(array):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    if one_dimensional:
        # There are no columns to choose from in a 1-dimensional array.
        if not (include is None and exclude is None):
            warnings.warn(
                'The input array is 1-dimensional. Ignoring include and '
                'exclude parameters.',
                category=UserWarning)

        if fuav.is_numerical_array(array):
            description = describe_numerical_array(array, **kwargs)
        elif fuav.is_textual_array(array):
            description = describe_categorical_array(array)
        else:  # pragma: no cover
            assert False, 'A base array should either be numerical or textual.'

        return description  # type: ignore

    # Only 2-dimensional arrays can reach this point.
    assert two_dimensional, 'The input array can only be 1- or 2-dimensional.'

    numerical_columns, categorical_columns = fuat.indices_by_type(array)
    structured = fuav.is_structured_array(array)

    if (numerical_columns.shape[0] + categorical_columns.shape[0]) == 0:
        raise ValueError('The input array cannot have 0 columns.')

    numerical_selected = set(numerical_columns)
    categorical_selected = set(categorical_columns)
    # Both filters are given the complete, unfiltered set of column indices.
    every_column = categorical_selected.union(numerical_selected)

    # Keep only the columns requested with the include parameter...
    categorical_selected, numerical_selected = _filter_include_indices(
        categorical_selected, numerical_selected, include, every_column)
    # ...and then drop the columns listed with the exclude parameter.
    categorical_selected, numerical_selected = _filter_exclude_indices(
        categorical_selected, numerical_selected, exclude, every_column)

    if not numerical_selected.union(categorical_selected):
        raise RuntimeError('None of the columns were selected to be '
                           'described.')

    def _get_column(column_index):
        """Selects a column of a structured or an unstructured array."""
        if structured:
            return array[column_index]
        return array[:, column_index]

    # Numerical columns are described first, categorical ones second.
    description = {
        column_index: describe_numerical_array(  # type: ignore
            _get_column(column_index), **kwargs)
        for column_index in numerical_selected
    }
    for column_index in categorical_selected:
        description[column_index] = describe_categorical_array(  # type: ignore
            _get_column(column_index))

    return description  # type: ignore
Esempio n. 13
0
def get_point_distance(
        data_array: np.ndarray, data_point: Union[np.ndarray, np.void],
        distance_function: Callable[[np.ndarray, np.ndarray], float]
) -> np.ndarray:
    """
    Measures how far a single data point is from every row of a data array.

    The ``distance_function`` is called once per row of the ``data_array``
    with that row as the first argument and the ``data_point`` as the second
    one.

    Parameters
    ----------
    data_array : numpy.ndarray
        A 2-dimensional numpy array; the distance to each of its rows is
        computed.
    data_point : Union[numpy.ndarray, numpy.void]
        A 1-dimensional numpy array or numpy void (for structured data points)
        whose distance to every row of the ``data_array`` is computed.
    distance_function : Callable[[numpy.ndarray, numpy.ndarray], number]
        A Python function that takes two 1-dimensional numpy arrays of equal
        length and returns a number representing the distance between them.
        **The distance function is assumed to return the same distance
        regardless of the order in which parameters are given.**

    Raises
    ------
    AttributeError
        The distance function does not require exactly two parameters.
    IncorrectShapeError
        The data array is not a 2-dimensional numpy array. The data point is
        not 1-dimensional. The number of columns in the data array is different
        to the number of elements in the data point.
    TypeError
        The data array or the data point is not of a base type (numbers and/or
        strings). The data point and the data array have incomparable dtypes.
        The distance function is not a Python callable (function).

    Returns
    -------
    distances : numpy.ndarray
        A 1-dimensional numerical numpy array with distances between
        ``data_point`` and every row of the ``data_array``.
    """
    # The data array and the distance function are checked by a dedicated
    # validator; note that this call is skipped when asserts are disabled.
    assert _validate_get_distance(data_array,
                                  distance_function), 'Invalid input.'

    structured = fuav.is_structured_array(data_array)

    if not fuav.is_1d_like(data_point):
        raise IncorrectShapeError('The data point has to be 1-dimensional '
                                  'numpy array or numpy void (for structured '
                                  'arrays).')

    # Wrap the point in a list so that it becomes a single-row data array.
    point_as_array = np.asarray([data_point])
    if not fuav.is_base_array(point_as_array):
        raise TypeError('The data point has to be of a base type (strings '
                        'and/or numbers).')
    if not fuav.are_similar_dtype_arrays(data_array, point_as_array):
        raise TypeError('The dtypes of the data set and the data point are '
                        'too different.')

    if structured:
        # No column count test here: for structured arrays the dtype
        # comparison above already picks up on a different number of columns.
        distances = np.zeros((data_array.shape[0], ), dtype=np.float64)
        for position, row in enumerate(data_array):
            distances[position] = distance_function(row, data_point)
        return distances

    if data_array.shape[1] != point_as_array.shape[1]:
        raise IncorrectShapeError('The data point has different number of '
                                  'columns (features) than the data set.')

    return np.apply_along_axis(distance_function, 1, data_array, data_point)
Esempio n. 14
0
def group_by_column(
    dataset: np.ndarray,
    column_index: Index,
    groupings: Optional[List[Union[float, Tuple[str]]]] = None,
    numerical_bins_number: int = 5,
    treat_as_categorical: Optional[bool] = None
) -> Tuple[List[List[int]], List[str]]:
    """
    Groups row indices of an array based on value grouping of a chosen column.

    If the selected column is numerical, by default the values are grouped
    into 5 bins equally distributed between the minimum and the maximum value
    of the column. The number of bins can be changed with the
    ``numerical_bins_number`` parameter if desired. Alternatively, the exact
    bin boundaries can be given via the ``groupings`` parameter.

    For categorical columns, the default binning is one bin for every unique
    value in the selected column. This behaviour can be changed by providing
    the ``groupings`` parameter, where multiple values can be selected to
    create one bin.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be used for grouping the row indices.
    column_index : Union[string, integer]
        A column index (a string for structured numpy arrays or an integer for
        unstructured arrays) of the column based on which the row indices will
        be partitioned.
    groupings : List[Union[number, Tuple[string]]], optional (default=None)
        A list of user-specified groupings for the selected column. The default
        grouping for categorical (textual) columns is splitting them by all the
        unique values therein. The numerical columns are, by default, binned
        into 5 bins (see the ``numerical_bins_number`` parameter) uniformly
        distributed between the minimum and the maximum value of the column.
        To introduce custom binning for a categorical column the ``groupings``
        parameter should be a list of tuples, where every tuple represents a
        single group. For example, a column with the following unique values
        ``['a', 'b', 'c', 'd']`` can be split into two groups: ``['a', 'd']``
        and ``['b', 'c']`` by providing ``[('a', 'd'), ('b', 'c')]`` grouping.
        For numerical columns custom grouping should be introduced as a list of
        bucket boundaries. Every bucket includes all the values that are
        **less or equal** to the specified bucket boundary and greater than the
        previous boundary if one is given. One additional bucket is always
        created for the values greater than the last boundary.
    numerical_bins_number : integer, optional (default=5)
        The number of bins used for default binning of numerical columns.
    treat_as_categorical : boolean, optional (default=None)
        Whether the selected column should be treated as a categorical or
        numerical feature. If set to ``None``, the type of the column will be
        inferred from the data therein. If set to ``False``, the column will be
        treated as numerical unless it is string-based, in which case a warning
        will be emitted and the column will be treated as categorical despite
        this setting. Finally, if set to ``True``, the column will be treated
        as categorical.

    Warns
    -----
    UserWarning
        When grouping is done on a categorical column a warning is emitted when
        some of the values in that column are not accounted for, i.e. they are
        not included in the ``groupings`` parameter. Also, if some of the rows
        are not included in any of the groupings, a warning is shown. Missing
        row indices may be a result of some of the values being not-a-number
        for a numerical column and missing some of the unique values for a
        categorical column. Finally, a warning is emitted when the
        ``treat_as_categorical`` parameter is set to ``False`` but the selected
        feature is string-based (i.e. categorical) and therefore cannot be
        treated as a numerical one.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not 2-dimensional.
    IndexError
        The supplied ``column_index`` is not valid for the input ``dataset``.
    TypeError
        The column index is neither a string nor an integer. The numerical bins
        number is not an integer. The ``groupings`` parameter is neither a list
        nor ``None``. One of the grouping bin boundaries (for a numerical
        feature column) is not a number. One of the groupings (for a
        categorical feature column) is not a tuple. The
        ``treat_as_categorical`` parameter is neither a boolean nor ``None``.
    ValueError
        The input ``dataset`` is not of a base type. The numerical bins number
        is less than 2. The ``groupings`` list is empty. The numbers in the
        ``groupings`` parameter are not monotonically increasing (for a
        numerical column). There are duplicate values shared among tuples in
        the ``groupings`` parameter or one of the values does not appear in the
        selected column (for a categorical column).

    Returns
    -------
    indices_per_bin : List[List[integer]]
        A list of lists with the latter one holding row indices of a particular
        group.
    bin_names : List[string]
        A list holding a description of each group, in the same order as
        ``indices_per_bin``.
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input array should be 2-dimensional.')

    if not fuav.is_base_array(dataset):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    # Check index validity
    if isinstance(column_index, (str, int)):
        if not fuat.are_indices_valid(dataset, np.array([column_index])):
            raise IndexError('*{}* is not a valid column index for the input '
                             'dataset.'.format(column_index))
    else:
        raise TypeError('The column index can either be a string or an '
                        'integer.')

    # Check the number of numerical bins
    if isinstance(numerical_bins_number, int):
        if numerical_bins_number < 2:
            raise ValueError('The numerical_bins_number needs to be at least '
                             '2.')
    else:
        raise TypeError('The numerical_bins_number parameter has to be an '
                        'integer.')

    # Check treat_as_categorical
    if treat_as_categorical is not None:
        if not isinstance(treat_as_categorical, bool):
            raise TypeError('The treat_as_categorical parameter has to be a '
                            'boolean.')

    # Structured arrays are indexed by field name, unstructured ones by the
    # column position
    if fuav.is_structured_array(dataset):
        column = dataset[column_index]
    else:
        column = dataset[:, column_index]
    assert fuav.is_1d_array(column), 'This must be a 1D numpy array.'

    # Get a list of all the row indices
    all_row_indices = set(range(column.shape[0]))

    # These two lists are built in parallel: one entry per bin
    indices_per_bin = []
    bin_names = []

    is_numerical_column = fuav.is_numerical_array(column)
    is_categorical_column = fuav.is_textual_array(column)
    # Exactly one of the two flags is expected to be set
    assert is_numerical_column is not is_categorical_column, \
        'The column must be a base array.'

    # Sort out numerical/categorical column treatment
    if treat_as_categorical is None:
        # Infer the treatment from the type of the column
        go_numerical = is_numerical_column
    else:
        if treat_as_categorical:
            go_numerical = False
        else:  # Treat as numerical
            if is_numerical_column:
                go_numerical = True
            else:  # Is not numerical
                # A textual column cannot be binned numerically, so the user
                # request is overridden (with a warning)
                warnings.warn(
                    'Selected feature is categorical, therefore cannot be '
                    'treated as numerical. The feature will be treated as '
                    'categorical despite the treat_as_categorical parameter '
                    'set to False.', UserWarning)
                go_numerical = False

    if go_numerical:
        if groupings is None:
            # Get default bins: linspace without the end point yields
            # numerical_bins_number points starting at the column minimum;
            # dropping the first one leaves numerical_bins_number - 1 interior
            # boundaries, i.e. numerical_bins_number bins
            bins = np.linspace(column.min(),
                               column.max(),
                               num=numerical_bins_number,
                               endpoint=False)[1:].tolist()
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A numerical grouping list has to contain at '
                                 'least one element.')

            # Every element in the groupings list must be a number
            for i, number in enumerate(groupings):
                if not isinstance(number, Number):
                    raise TypeError('For a numerical column all of the '
                                    'grouping items must be numbers. *{}* '
                                    'is not a number.'.format(number))
                # Every boundary must be strictly greater than the previous one
                if i != 0:
                    if number <= groupings[i - 1]:
                        raise ValueError('The numbers in the groupings list '
                                         'have to be monotonically '
                                         'increasing.')
            bins = groupings
        else:
            raise TypeError('Since a numerical column was chosen the grouping '
                            'must be a list of bin boundaries or None.')

        # Templates of the bin names: first bin, in-between bins and last bin
        lower_edge = 'x <= {}'
        middle = '{} < x <= {}'
        upper_edge = '{} < x'

        indices_seen_so_far = set()  # type: Set[int]

        for i, edge in enumerate(bins):
            if i == 0:
                # The first bin has no lower boundary
                indices = np.where(column <= edge)[0].tolist()

                indices_per_bin.append(indices)
                bin_names.append(lower_edge.format(edge))
            else:
                edge_lower = bins[i - 1]

                # Rows with values in the (edge_lower, edge] interval
                indices_l = set(np.where(column <= edge)[0].tolist())
                indices_u = set(np.where(column > edge_lower)[0].tolist())
                indices = list(indices_l.intersection(indices_u))

                indices_per_bin.append(indices)
                bin_names.append(middle.format(edge_lower, edge))

            # A row index must never end up in more than one bin
            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

        assert bins, 'If bins is empty, i and edge will not be defined.'
        # The loop variable edge still holds the last boundary here; all the
        # rows with values above it form the final, open-ended bin
        # pylint: disable=undefined-loop-variable
        indices = np.where(column > edge)[0].tolist()

        indices_per_bin.append(indices)
        bin_names.append(upper_edge.format(edge))

        assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
        indices_seen_so_far = indices_seen_so_far.union(indices)
    else:
        unique_elements = np.sort(np.unique(column)).tolist()

        if groupings is None:
            # Default binning: one single-value bin per unique column value
            bins = [(i, ) for i in unique_elements]
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A categorical grouping list has to contain '
                                 'at least one element.')

            values_seen_so_far = set()  # type: Set[str]

            # Every element in the groupings list must be a valid tuple
            for value_tuple in groupings:
                if not isinstance(value_tuple, tuple):
                    raise TypeError('For a categorical column all of the '
                                    'grouping items must be tuples. *{}* '
                                    'is not a tuple.'.format(value_tuple))
                # Every grouped value has to appear in the selected column
                for value in value_tuple:
                    if value not in unique_elements:
                        raise ValueError('*{}* value is not present in the '
                                         'selected column.'.format(value))

                # A value may only belong to one of the tuples
                if values_seen_so_far.intersection(value_tuple):
                    raise ValueError('Some values are duplicated across '
                                     'tuples.')
                values_seen_so_far = values_seen_so_far.union(value_tuple)

            # Column values missing from the groupings are allowed, but the
            # user is warned about them
            unaccounted_values = set(unique_elements).difference(
                values_seen_so_far)
            if unaccounted_values:
                warnings.warn(
                    'The following values in the selected column were not '
                    'accounted for in the grouping '
                    'tuples:\n{}.'.format(unaccounted_values), UserWarning)

            # Sort the values within every tuple and then the tuples themselves
            # so that the order of the bins does not depend on the order in
            # which the groupings were given
            bins = [tuple(sorted(i)) for i in groupings]  # type: ignore
            bins = sorted(bins)
        else:
            raise TypeError('Since a categorical column was chosen the '
                            'grouping must be a list of tuples representing '
                            'categorical values grouping or None for the '
                            'default grouping.')

        indices_seen_so_far = set()

        for bin_values in bins:
            # Collect the rows that hold any of the values assigned to this bin
            indices = set()
            for value in bin_values:
                vid = np.where(column == value)[0].tolist()
                indices = indices.union(vid)

            indices_per_bin.append(list(indices))
            bin_names.append('{}'.format(bin_values))

            # A row index must never end up in more than one bin
            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

    # Validate that all of the row indices were accounted for
    missed_indices = all_row_indices.difference(indices_seen_so_far)
    if missed_indices:
        warnings.warn(
            'The following row indices could not be accounted for:\n{}.\n For '
            'a numerical column there may have been some numpy.nan therein. '
            'For a categorical column some of the column values were probably '
            'not specified in the grouping, in which case there should be a '
            'separate user warning.'.format(missed_indices), UserWarning)

    return indices_per_bin, bin_names
def _validate_input_discretiser(
        dataset: np.ndarray,
        categorical_indices: Optional[List[Index]] = None,
        feature_names: Optional[List[str]] = None) -> bool:
    """
    Checks the input parameters shared by the discretiser classes.

    The function either raises an exception describing the first problem that
    it encounters or returns ``True``.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be discretised.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    feature_names : List[strings], optional (default=None)
        A list of feature names in the order they appear in the ``dataset``
        array.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array.
    IndexError
        Some of the column indices given in the ``categorical_indices`` list
        are invalid for the input ``dataset``.
    TypeError
        The ``dataset`` is not of a base (numerical and/or string) type.
        The ``categorical_indices`` is neither a Python list nor ``None``.
        The ``feature_names`` is neither a Python list nor ``None`` or one of
        its elements (if it is a list) is not a string.
    ValueError
        The length of the ``feature_names`` list is different than the number
        of columns (features) in the input ``dataset``.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid; invalid input results in one of the
        above exceptions.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    if categorical_indices is not None:
        if not isinstance(categorical_indices, list):
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')

        bad_indices = fuat.get_invalid_indices(
            dataset, np.asarray(categorical_indices))
        if bad_indices.size:
            raise IndexError('The following indices are invalid for the '
                             'input dataset: {}.'.format(
                                 bad_indices.tolist()))

    if feature_names is not None:
        if not isinstance(feature_names, list):
            raise TypeError('The feature_names parameter must be a Python '
                            'list or None.')

        # Structured arrays keep their columns as named dtype fields.
        columns_number = (len(dataset.dtype.names)
                          if fuav.is_structured_array(dataset)
                          else dataset.shape[1])
        if len(feature_names) != columns_number:
            raise ValueError('The length of feature_names list must be '
                             'equal to the number of features (columns) '
                             'in the input dataset.')

        # Only the first offending name is reported.
        non_strings = [
            name for name in feature_names if not isinstance(name, str)
        ]
        if non_strings:
            raise TypeError('All of the feature_names must be '
                            'strings. The *{}* feature name is not a '
                            'string.'.format(non_strings[0]))

    return True