def __init__(self,
                 dataset: np.ndarray,
                 categorical_indices: Optional[List[Index]] = None,
                 feature_names: Optional[List[str]] = None) -> None:
        """
        Constructs a ``Discretiser`` abstract class.

        The constructor validates the input, records whether the data set is
        a structured array together with its dtype, splits the column indices
        into numerical and categorical ones and builds a map from column
        indices to feature names.

        Parameters
        ----------
        dataset : numpy.ndarray
            A data set whose columns are split into numerical and categorical
            features.
        categorical_indices : List[column index], optional (default=None)
            Indices of the columns to be treated as categorical. If ``None``,
            the columns reported as categorical by ``fuat.indices_by_type``
            are used. Columns reported as categorical by that function are
            always treated as categorical, even when they are missing from
            this list.
        feature_names : List[string], optional (default=None)
            Names of the features, paired positionally with the column
            indices. If ``None``, the string representation of every column
            index is used as its name.

        Warns
        -----
        UserWarning
            Some of the string-based columns were not listed in the
            ``categorical_indices`` parameter; they are treated as
            categorical regardless.
        """
        # Must be overwritten in children classes
        self.feature_value_names = {}  # type: Dict[Index, Dict[int, str]]
        self.feature_bin_boundaries = {}  # type: Dict[Index, np.ndarray]

        # The validation is called outside of the ``assert`` statement so
        # that it is still executed when assertions are stripped
        # (``python -O``); only the check of its result is an assertion.
        input_is_valid = _validate_input_discretiser(
            dataset,
            categorical_indices=categorical_indices,
            feature_names=feature_names)
        assert input_is_valid, 'Invalid input.'

        self.is_structured = fuav.is_structured_array(dataset)

        self.dataset_dtype = dataset.dtype

        # Sort out column indices
        indices_num, indices_cat = fuat.indices_by_type(dataset)
        num_indices = set(indices_num)
        cat_indices = set(indices_cat)
        all_indices = num_indices.union(cat_indices)

        if categorical_indices is None:
            categorical_indices = cat_indices  # type: ignore
            numerical_indices = num_indices
        else:
            # String-based columns can never be numerical, so they are added
            # to the user selection (with a warning) if they were left out.
            if cat_indices.difference(categorical_indices):
                msg = ('Some of the string-based columns in the input dataset '
                       'were not selected as categorical features via the '
                       'categorical_indices parameter. String-based columns '
                       'cannot be treated as numerical features, therefore '
                       'they will be also treated as categorical features '
                       '(in addition to the ones selected with the '
                       'categorical_indices parameter).')
                warnings.warn(msg, UserWarning)
                categorical_indices = cat_indices.union(  # type: ignore
                    categorical_indices)
            numerical_indices = all_indices.difference(categorical_indices)
        self.categorical_indices = sorted(categorical_indices)  # type: ignore
        self.numerical_indices = sorted(numerical_indices)

        self.features_number = len(all_indices)

        # Structured arrays are indexed with column names, classic arrays
        # with consecutive integers.
        if self.is_structured:
            indices = self.dataset_dtype.names
        else:
            indices = range(self.features_number)

        if feature_names is None:
            feature_names_map = {x: str(x) for x in indices}
        else:
            feature_names_map = dict(zip(indices, feature_names))
        self.feature_names_map = feature_names_map
Esempio n. 2
0
    def __init__(self,
                 data_set: np.ndarray,
                 categorical_indices: Optional[List[Index]] = None,
                 neighbours: int = 7,
                 distance_function: Optional[DistanceFunction] = None,
                 normalise_scores: bool = True) -> None:
        """
        Initialises the ``DensityCheck`` class.

        The constructor validates the input, chooses a distance function,
        splits the column indices into numerical and categorical ones,
        computes the distance matrix of the data set and, based on it, the
        scores of all the data points (optionally normalised).

        Parameters
        ----------
        data_set : numpy.ndarray
            A data set for which the distance matrix and the scores are
            computed.
        categorical_indices : List[column index], optional (default=None)
            Indices of the columns to be treated as categorical. If ``None``,
            the columns reported as categorical by ``fuat.indices_by_type``
            are used. Columns reported as categorical by that function are
            always treated as categorical, even when they are missing from
            this list.
        neighbours : integer, optional (default=7)
            Stored in the ``neighbours`` attribute; presumably the number of
            neighbours used by ``_compute_scores`` -- confirm against that
            method.
        distance_function : callable, optional (default=None)
            A distance function passed to ``fud.get_distance_matrix``. If
            ``None``, one of the built-in mixed distance methods is selected
            based on the ``_NUMPY_1_14`` flag and on whether the data set is
            a structured array.
        normalise_scores : boolean, optional (default=True)
            If ``True``, the scores are min-max normalised in place; when all
            of the scores are equal they are all set to 0 instead.

        Warns
        -----
        UserWarning
            Some of the string-based columns were not listed in the
            ``categorical_indices`` parameter; they are treated as
            categorical regardless.
        """
        # pylint: disable=too-many-arguments
        # The validation is called outside of the ``assert`` statement so
        # that it is still executed when assertions are stripped
        # (``python -O``); only the check of its result is an assertion.
        input_is_valid = _validate_input_dc(data_set, categorical_indices,
                                            neighbours, distance_function,
                                            normalise_scores)
        assert input_is_valid, 'Invalid input.'

        self.data_set = data_set
        self._is_structured = fuav.is_structured_array(self.data_set)
        #
        self.neighbours = neighbours
        if distance_function is None:
            if not _NUMPY_1_14 and self._is_structured:
                distance_function = self._mixed_distance_o  # pragma: nocover
            else:
                distance_function = self._mixed_distance_n
        self._distance_function = distance_function  # type: ignore
        #
        self.normalise_scores = normalise_scores

        # Sort out column indices
        feature_indices = fuat.indices_by_type(self.data_set)
        num_indices = set(feature_indices[0])
        cat_indices = set(feature_indices[1])
        all_indices = num_indices.union(cat_indices)
        if categorical_indices is None:
            _categorical_indices = cat_indices
            _numerical_indices = num_indices
        else:
            # String-based columns can never be numerical, so they are added
            # to the user selection (with a warning) if they were left out.
            if cat_indices.difference(categorical_indices):
                msg = ('Some of the string-based columns in the input data '
                       'set were not selected as categorical features via the '
                       'categorical_indices parameter. String-based columns '
                       'cannot be treated as numerical features, therefore '
                       'they will be also treated as categorical features '
                       '(in addition to the ones selected with the '
                       'categorical_indices parameter).')
                warnings.warn(msg, UserWarning)
                _categorical_indices = cat_indices.union(categorical_indices)
            else:
                _categorical_indices = categorical_indices  # type: ignore
            _numerical_indices = all_indices.difference(_categorical_indices)
        self._categorical_indices = sorted(_categorical_indices)
        self._numerical_indices = sorted(_numerical_indices)

        self._samples_number = self.data_set.shape[0]

        # The index attributes above are set before the distance matrix is
        # built, as the default distance methods may rely on them.
        self.distance_matrix = fud.get_distance_matrix(self.data_set,
                                                       self._distance_function)
        assert self._samples_number == self.distance_matrix.shape[0]
        assert self.distance_matrix.shape[0] == self.distance_matrix.shape[1]

        self.scores = self._compute_scores()
        assert self._samples_number == self.scores.shape[0]
        # The extrema are recorded before the (optional) normalisation.
        self.scores_min = self.scores.min()
        self.scores_max = self.scores.max()
        if self.normalise_scores:
            if self.scores_min == self.scores_max:
                # A zero range cannot be used as a divisor.
                assert (self.scores == self.scores_min).all(), \
                    'All distances/scores are equal.'
                self.scores[:] = 0
            else:
                self.scores -= self.scores_min
                self.scores /= self.scores_max - self.scores_min
Esempio n. 3
0
    def __init__(self,
                 dataset: np.ndarray,
                 ground_truth: Optional[np.ndarray] = None,
                 categorical_indices: Optional[np.ndarray] = None,
                 int_to_float: bool = True) -> None:
        """
        Constructs an ``Augmentation`` abstract class.

        The constructor validates the input, stores the data set and its
        ground truth, splits the column indices into numerical and
        categorical ones and works out the dtype of the sampled array.

        Parameters
        ----------
        dataset : numpy.ndarray
            A data set stored in the ``dataset`` attribute.
        ground_truth : numpy.ndarray, optional (default=None)
            Stored unchanged in the ``ground_truth`` attribute.
        categorical_indices : numpy.ndarray, optional (default=None)
            Indices of the columns to be treated as categorical. If ``None``,
            the columns reported as categorical by ``fuat.indices_by_type``
            are used. Columns reported as categorical by that function are
            always treated as categorical, even when they are missing from
            this collection.
        int_to_float : boolean, optional (default=True)
            Selects the dtype passed to ``fuat.generalise_dtype`` together
            with the dtype of every numerical column: ``numpy.float64`` if
            ``True`` and ``numpy.int64`` otherwise.

        Warns
        -----
        UserWarning
            Some of the string-based columns were not listed in the
            ``categorical_indices`` parameter; they are treated as
            categorical regardless.
        """
        # pylint: disable=too-many-locals
        # The validation is called outside of the ``assert`` statement so
        # that it is still executed when assertions are stripped
        # (``python -O``); only the check of its result is an assertion.
        input_is_valid = _validate_input(
            dataset,
            ground_truth=ground_truth,
            categorical_indices=categorical_indices,
            int_to_float=int_to_float)
        assert input_is_valid, 'Invalid input.'

        self.dataset = dataset
        self.data_points_number = dataset.shape[0]
        self.is_structured = fuav.is_structured_array(dataset)

        self.ground_truth = ground_truth

        # Sort out column indices
        indices = fuat.indices_by_type(dataset)
        num_indices = set(indices[0])
        cat_indices = set(indices[1])
        all_indices = num_indices.union(cat_indices)

        if categorical_indices is None:
            categorical_indices = cat_indices
            numerical_indices = num_indices
        else:
            # String-based columns can never be numerical, so they are added
            # to the user selection (with a warning) if they were left out.
            if cat_indices.difference(categorical_indices):
                msg = ('Some of the string-based columns in the input dataset '
                       'were not selected as categorical features via the '
                       'categorical_indices parameter. String-based columns '
                       'cannot be treated as numerical features, therefore '
                       'they will be also treated as categorical features '
                       '(in addition to the ones selected with the '
                       'categorical_indices parameter).')
                warnings.warn(msg, UserWarning)
                categorical_indices = cat_indices.union(categorical_indices)
            numerical_indices = all_indices.difference(categorical_indices)

        self.categorical_indices = sorted(categorical_indices)
        self.numerical_indices = sorted(numerical_indices)
        self.features_number = len(all_indices)

        # Sort out the dtype of the sampled array.
        ntype = np.dtype(np.float64) if int_to_float else np.dtype(np.int64)
        if self.is_structured:
            # Structured arrays get a per-column dtype: numerical columns are
            # generalised, categorical ones keep their original dtype.
            sample_dtype = []
            for column_name in self.dataset.dtype.names:
                if column_name in self.numerical_indices:
                    new_dtype = fuat.generalise_dtype(
                        self.dataset.dtype[column_name], ntype)
                    sample_dtype.append((column_name, new_dtype))
                elif column_name in self.categorical_indices:
                    sample_dtype.append(
                        (column_name, self.dataset.dtype[column_name]))
                else:
                    assert False, 'Unknown column name.'  # pragma: nocover
        else:
            # Classic arrays have a single dtype, which is only generalised
            # when the whole array is numerical.
            if fuav.is_numerical_array(self.dataset):
                sample_dtype = fuat.generalise_dtype(self.dataset.dtype, ntype)
            else:
                sample_dtype = self.dataset.dtype
        self.sample_dtype = sample_dtype
Esempio n. 4
0
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Fits the model by memorising the training data and their labels.

        For a classifier the unique labels, their counts, their frequencies
        and the majority label (the first one in the sorted order among the
        most frequent labels) are stored. For a regressor the mean of the
        target is stored as the "majority label" and the remaining label
        statistics are set to empty arrays.

        Parameters
        ----------
        X : numpy.ndarray
            The KNN training data.
        y : numpy.ndarray
            The KNN training labels.

        Raises
        ------
        IncorrectShapeError
            Either the ``X`` array is not 2-dimensional, the ``y`` array is not
            1-dimensional, the number of rows in ``X`` is not the same as the
            number of elements in ``y`` or the ``X`` array has 0 rows or 0
            columns.
        PrefittedModelError
            Trying to fit the model when it has already been fitted. Usually
            raised when calling the ``fit`` method for the second time without
            clearing the model first.
        TypeError
            Trying to fit a KNN predictor in a regressor mode with
            non-numerical target variable.
        """
        if self._is_fitted:
            raise PrefittedModelError('This model has already been fitted.')

        if not fuav.is_2d_array(X):
            msg = 'The training data must be a 2-dimensional array.'
            raise IncorrectShapeError(msg)
        if not fuav.is_1d_array(y):
            msg = 'The training data labels must be a 1-dimensional array.'
            raise IncorrectShapeError(msg)

        rows_number = X.shape[0]
        if rows_number == 0:
            msg = 'The data array has to have at least one data point.'
            raise IncorrectShapeError(msg)
        # For structured arrays the presence of at least one column is
        # already guaranteed by the fuav.is_2d_array check above.
        is_structured = fuav.is_structured_array(X)
        if not is_structured and X.shape[1] == 0:
            msg = 'The data array has to have at least one feature.'
            raise IncorrectShapeError(msg)
        if rows_number != y.shape[0]:
            msg = ('The number of samples in X must be the same as the number '
                   'of labels in y.')
            raise IncorrectShapeError(msg)

        if not (self._is_classifier or fuav.is_numerical_array(y)):
            msg = 'Regressor can only be fitted for a numerical target vector.'
            raise TypeError(msg)

        indices = fuat.indices_by_type(X)
        self._numerical_indices, self._categorical_indices = indices

        self._is_structured = is_structured
        self._X = X
        self._y = y

        if self._is_classifier:
            labels, label_counts = np.unique(self._y, return_counts=True)
            # Keep the labels (and their counts) in a lexicographical order.
            order = np.argsort(labels)
            self._unique_y = labels[order]
            self._unique_y_counts = label_counts[order]

            # Ties between the most frequent labels are broken by taking the
            # first of them in the sorted order.
            is_most_frequent = (
                self._unique_y_counts == self._unique_y_counts.max())
            tied_labels = np.sort(self._unique_y[is_most_frequent])
            self._majority_label = tied_labels[0]

            self._unique_y_probabilities = (
                self._unique_y_counts / self._y.shape[0])
        else:
            # A regressor has no label statistics, only the mean target.
            self._majority_label = self._y.mean()
            self._unique_y = np.ndarray((0, ))
            self._unique_y_counts = np.ndarray((0, ))
            self._unique_y_probabilities = np.ndarray((0, ))

        self._X_n = self._X.shape[0]
        self._is_fitted = True
Esempio n. 5
0
def describe_array(
        array: np.ndarray,
        include: Optional[Union[str, int, List[Union[str, int]]]] = None,
        exclude: Optional[Union[str, int, List[Union[str, int]]]] = None,
        **kwargs: bool
) -> Dict[Union[str, int],
          Union[str, int, float, bool, np.ndarray,
                Dict[str, Union[str, int, float, bool, np.ndarray]]]
          ]:  # yapf: disable
    """
    Describes numerical and categorical (textual) columns of the input array.

    Numerical columns are described with the
    :func:`fatf.transparency.data.describe_functions.describe_numerical_array`
    function and categorical columns with the
    :func:`fatf.transparency.data.describe_functions.\
describe_categorical_array` function; see their documentation for the details
    of the respective descriptions.

    The columns to be described can be selected with the ``include`` and
    ``exclude`` parameters. Each of them can be: a list of column indices; a
    single column index (a string or an integer); or one of the
    ``'numerical'`` and ``'categorical'`` keywords, which select all of the
    numerical or all of the categorical columns respectively. By default all
    of the columns are described.

    Parameters
    ----------
    array : numpy.ndarray
        The array to be described.
    include : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be included in the description. If
        ``None`` (the default value), all of the columns will be included.
        Alternatively, this can be a single index (either a string or an
        integer) to describe just this one column, or one of the
        ``'numerical'`` and ``'categorical'`` keywords to include only the
        numerical or only the categorical columns respectively.
    exclude : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be excluded from the description. If
        ``None`` (the default value), none of the columns will be excluded.
        Alternatively, this can be a single index (either a string or an
        integer) to exclude just this one column, or one of the
        ``'numerical'`` and ``'categorical'`` keywords to exclude either all
        of the numerical or all of the categorical columns respectively.
    **kwargs : bool
        Keyword arguments that are passed to the :func:`fatf.transparency.\
data.describe_functions.describe_numerical_array` function responsible for
        describing numerical arrays.

    Warns
    -----
    UserWarning
        The ``include`` or ``exclude`` parameter was used with a
        1-dimensional input array (in which case both of them are ignored).

    Raises
    ------
    IncorrectShapeError
        The input array is neither 1- nor 2-dimensional.
    RuntimeError
        None of the columns were selected to be described.
    ValueError
        The input array is not of a base type (textual and numerical elements).
        The input array has 0 columns.

    Returns
    -------
    description : Dict[Union[str, int], Dict[str, \
Union[str, int, float, bool, np.ndarray]]]
        For 2-dimensional arrays a dictionary describing every column under a
        key corresponding to its index in the input array. For a 1-dimensional
        input array a dictionary describing that array.
    """
    # pylint: disable=too-many-locals,too-many-branches
    is_1d = fuav.is_1d_like(array)
    # The 2-dimensional check is only needed for arrays that are not 1-D.
    is_2d = False if is_1d else fuav.is_2d_array(array)
    if is_1d:
        array = fuat.as_unstructured(array)

    if not (is_1d or is_2d):
        raise IncorrectShapeError(
            'The input array should be 1- or 2-dimensional.')

    if not fuav.is_base_array(array):
        raise ValueError(
            'The input array should be of a base type (a mixture of '
            'numerical and textual types).')

    if is_1d:
        # Column selection is meaningless for a single vector.
        if not (include is None and exclude is None):
            warnings.warn(
                'The input array is 1-dimensional. Ignoring include and '
                'exclude parameters.',
                category=UserWarning)

        if fuav.is_numerical_array(array):
            description = describe_numerical_array(array, **kwargs)
        elif fuav.is_textual_array(array):
            description = describe_categorical_array(array)
        else:  # pragma: no cover
            assert False, 'A base array should either be numerical or textual.'
    elif is_2d:
        numerical_indices, categorical_indices = fuat.indices_by_type(array)
        is_structured_array = fuav.is_structured_array(array)

        columns_number = (
            numerical_indices.shape[0] + categorical_indices.shape[0])
        if columns_number == 0:
            raise ValueError('The input array cannot have 0 columns.')

        numerical_set = set(numerical_indices)
        categorical_set = set(categorical_indices)
        every_index = categorical_set.union(numerical_set)

        # Indices to be included
        categorical_set, numerical_set = _filter_include_indices(
            categorical_set, numerical_set, include, every_index)

        # Indices to be excluded
        categorical_set, numerical_set = _filter_exclude_indices(
            categorical_set, numerical_set, exclude, every_index)

        if not numerical_set.union(categorical_set):
            raise RuntimeError(
                'None of the columns were selected to be described.')

        def get_column(index):
            """Extracts a column from a structured or a classic array."""
            if is_structured_array:
                return array[index]
            return array[:, index]

        description = {}
        for idx in numerical_set:
            description[idx] = describe_numerical_array(  # type: ignore
                get_column(idx), **kwargs)
        for idx in categorical_set:
            description[idx] = describe_categorical_array(  # type: ignore
                get_column(idx))
    else:  # pragma: no cover
        assert False, 'The input array can only be 1- or 2-dimensional.'

    return description  # type: ignore
def test_indices_by_type():
    """
    Tests :func:`fatf.utils.array.tools.indices_by_type` function.

    First the error cases are checked (a non-array input, an array that is
    not 2-dimensional and an array of an unsupported type); then the split
    into numerical and categorical indices is checked for a range of classic
    and structured arrays.
    """
    # pylint: disable=too-many-locals,too-many-statements
    def check_split(array, expected_numerical, expected_categorical):
        """Asserts that ``array`` is split into the expected indices."""
        i_n, i_c = fuat.indices_by_type(array)
        assert np.array_equal(expected_numerical, i_n)
        assert np.array_equal(expected_categorical, i_c)

    # Test any object and shape
    type_error = 'The input should be a numpy array-like.'
    incorrect_shape_error = 'The input array should be 2-dimensional.'
    value_error = ('indices_by_type only supports input arrays that hold '
                   'base numpy types, i.e. numerical and string-like -- '
                   'numpy void and object-like types are not allowed.')
    failure_cases = [
        (TypeError, None, type_error),
        (IncorrectShapeError, np.empty((0, )), incorrect_shape_error),
        (ValueError, NOT_NUMERICAL_NP_ARRAY, value_error)
    ]
    for error_type, faulty_input, error_message in failure_cases:
        with pytest.raises(error_type) as exin:
            fuat.indices_by_type(faulty_input)
        assert str(exin.value) == error_message

    # Empty array
    check_split(np.empty((22, 0)), [], [])

    # All numerical array
    check_split(
        np.ones((22, 4)),
        np.array([0, 1, 2, 3]),
        np.array([], dtype=int))

    # All categorical -- single type -- array
    check_split(
        np.ones((22, 4), dtype='U4'),
        np.array([]),
        np.array([0, 1, 2, 3]))

    # Structured array with textual columns only
    all_textual_dtype = [('a', 'U4'), ('b', 'U4'), ('c', 'U4'), ('d', 'U4')]
    check_split(
        np.ones((22, ), dtype=all_textual_dtype),
        np.array([]),
        np.array(['a', 'b', 'c', 'd'], dtype='U1'))

    # Structured array with a mixture of textual and numerical columns
    mixed_dtype = [('a', 'U4'), ('b', 'f'), ('c', 'U4'), ('d', int)]
    check_split(
        np.ones((22, ), dtype=mixed_dtype),
        np.array(['b', 'd'], dtype='U1'),
        np.array(['a', 'c'], dtype='U1'))

    # Module-level test arrays
    check_split(NUMERICAL_NP_ARRAY, np.array([0, 1]), np.array([]))
    #
    check_split(WIDE_NP_ARRAY, np.array([0, 1, 2]), np.array([]))
    #
    check_split(
        NUMERICAL_STRUCTURED_ARRAY,
        np.array(['numbers', 'complex']),
        np.array([]))
    #
    check_split(
        NOT_NUMERICAL_STRUCTURED_ARRAY,
        np.array(['numerical']),
        np.array(['categorical']))
    #
    check_split(
        WIDE_STRUCTURED_ARRAY,
        np.array(['numbers', 'complex', 'anybody']),
        np.array([]))