Example #1
def test_discretizer(cats_and_percentiles):
    cat, perc = cats_and_percentiles
    disc = Discretizer(x, cat, feature_names, perc)
    to_disc = list(disc.names.keys())
    assert len(to_disc) == (x.shape[1] - len(cat))
    x_disc = disc.discretize(x)
    for k, v in disc.names.items():
        assert len(v) <= len(perc) + 1
        assert callable(disc.lambdas[k])
        assert x_disc[:, k].min() == 0
        assert x_disc[:, k].max() == len(perc)

    for i in range(x.shape[1]):
        if i not in to_disc:
            assert (x_disc[:, i] == x[:, i]).all()
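The test above relies on module-level fixtures (x, feature_names) and the parametrized cats_and_percentiles fixture, none of which are shown. A minimal sketch of what they could look like; the shapes, names and parameter values are illustrative assumptions, not the actual test setup:

import numpy as np
import pytest

x = np.random.rand(100, 4)  # hypothetical data: 100 rows, 4 features
feature_names = ['f0', 'f1', 'f2', 'f3']

@pytest.fixture(params=[([0], [25, 50, 75])])
def cats_and_percentiles(request):
    # (indices of categorical columns, percentiles used for binning)
    return request.param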
Example #2
import random
from typing import List

import numpy as np

# Bunch, Discretizer, abdm, multidim_scaling and ohe2ord are helpers assumed
# to come from the surrounding library (alibi-detect utilities).
def inject_outlier_categorical(X: np.ndarray,
                               cols: List[int],
                               perc_outlier: int,
                               y: np.ndarray = None,
                               cat_perturb: dict = None,
                               X_fit: np.ndarray = None,
                               disc_perc: list = [25, 50, 75],
                               smooth: float = 1.) -> Bunch:
    """
    Inject outliers in categorical variables of tabular data.

    Parameters
    ----------
    X
        Tabular data with categorical variables to perturb (inject outliers).
    cols
        Columns of X that are categorical and can be perturbed.
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multiple categorical features,
        the percentage is evenly split across those features.
    y
        Outlier labels; if None, all instances are initially labelled normal.
    cat_perturb
        Dictionary mapping each category in the categorical variables to its furthest neighbour.
    X_fit
        Optional data used to infer pairwise distances from; defaults to X.
    disc_perc
        List of percentiles used to bin the numerical features; the binned features are needed
        for the 'abdm' pairwise distance measure.
    smooth
        Smoothing exponent between 0 and 1 for the distances.
        Lower values will smooth the difference in distance metric between different features.

    Returns
    -------
    Bunch object with the perturbed tabular data, outlier labels and
    a dictionary used to map categories to their furthest neighbour.
    """
    if cat_perturb is None:
        # transform the categorical variables into numerical ones via
        # pairwise distances computed with abdm and multidim scaling
        X_fit = X.copy() if X_fit is None else X_fit

        # find number of categories for each categorical variable
        cat_vars = {k: None for k in cols}
        for k in cols:
            cat_vars[k] = len(np.unique(X_fit[:, k]))  # type: ignore

        # TODO: extend method for OHE
        ohe = False
        if ohe:
            X_ord, cat_vars_ord = ohe2ord(X, cat_vars)
        else:
            X_ord, cat_vars_ord = X, cat_vars

        # bin numerical features to compute the pairwise distance matrices
        n_ord = X_ord.shape[1]
        if len(cols) != n_ord:
            fnames = [str(_) for _ in range(n_ord)]
            disc = Discretizer(X_ord, cols, fnames, percentiles=disc_perc)
            X_bin = disc.discretize(X_ord)
            cat_vars_bin = {
                k: len(disc.names[k])
                for k in range(n_ord) if k not in cols
            }
        else:
            X_bin = X_ord
            cat_vars_bin = {}

        # pairwise distances for categorical variables
        d_pair = abdm(X_bin, cat_vars_ord, cat_vars_bin)

        # multidim scaling
        feature_range = (np.ones((1, n_ord)) * -1e10,
                         np.ones((1, n_ord)) * 1e10)
        d_abs = multidim_scaling(d_pair,
                                 n_components=2,
                                 use_metric=True,
                                 standardize_cat_vars=True,
                                 smooth=smooth,
                                 feature_range=feature_range,
                                 update_feature_range=False)[0]

        # find furthest category away for each category in the categorical variables
        cat_perturb = {k: np.zeros(len(v)) for k, v in d_abs.items()}
        for k, v in d_abs.items():
            for i in range(len(v)):
                cat_perturb[k][i] = np.argmax(np.abs(v[i] - v))
    else:
        d_abs = None

    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples, n_features = X.shape
    X_outlier = X.astype(np.float32).copy()
    if y is None:
        is_outlier = np.zeros(n_samples)
    else:
        is_outlier = y
    n_cols = len(cols)

    # distribute outliers evenly over different columns
    n_outlier = int(n_samples * perc_outlier * .01 / n_cols)
    for col in cols:
        outlier_idx = np.sort(random.sample(range(n_samples), n_outlier))
        col_cat = X_outlier[outlier_idx, col].astype(int)
        # look up, for each sampled instance, the furthest neighbour of its current category
        col_map = np.tile(cat_perturb[col], (n_outlier, 1))
        X_outlier[outlier_idx, col] = np.diag(col_map.T[col_cat])
        is_outlier[outlier_idx] = 1
    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier,
                 target=is_outlier,
                 cat_perturb=cat_perturb,
                 d_abs=d_abs,
                 target_names=['normal', 'outlier'])
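A minimal usage sketch, assuming the function above and its library helpers are in scope; the synthetic data is an illustrative assumption:

rng = np.random.default_rng(0)
# two categorical columns (3 categories each) followed by two numerical columns
X = np.concatenate([rng.integers(0, 3, size=(100, 2)),
                    rng.normal(size=(100, 2))], axis=1).astype(np.float32)

bunch = inject_outlier_categorical(X, cols=[0, 1], perc_outlier=10)
print(bunch.data.shape, int(bunch.target.sum()))  # perturbed data and nb of outliers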
Example #3
    def fit(self,
            X: np.ndarray,
            y: np.ndarray = None,
            d_type: str = 'abdm',
            w: float = None,
            disc_perc: list = [25, 50, 75],
            standardize_cat_vars: bool = True,
            feature_range: tuple = (-1e10, 1e10),
            smooth: float = 1.,
            center: bool = True
            ) -> None:
        """
        If categorical variables are present, transform them to numerical values.
        This step is not needed in the absence of categorical variables.

        Parameters
        ----------
        X
            Batch of instances used to infer distances between categories from.
        y
            Model class predictions or ground truth labels for X.
            Used for 'mvdm' and 'abdm-mvdm' pairwise distance metrics.
            Note that this is only compatible with classification problems. For regression problems,
            use the 'abdm' distance metric.
        d_type
            Pairwise distance metric used for categorical variables. Currently, 'abdm', 'mvdm' and 'abdm-mvdm'
            are supported. 'abdm' infers context from the other variables while 'mvdm' uses the model predictions.
            'abdm-mvdm' is a weighted combination of the two metrics.
        w
            Weight (between 0. and 1.) on the 'abdm' distance if d_type equals 'abdm-mvdm'.
        disc_perc
            List of percentiles used to bin the numerical features; the binned features are needed
            for the 'abdm' and 'abdm-mvdm' pairwise distance measures.
        standardize_cat_vars
            Standardize numerical values of categorical variables if True.
        feature_range
            Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be
            floats or numpy arrays with dimension (1 x number of features) for feature-wise ranges.
        smooth
            Smoothing exponent between 0 and 1 for the distances. Lower values will smooth the
            difference in distance metric between different features.
        center
            Whether to center the scaled distance measures. If False, the min distance for each feature
            except for the feature with the highest raw max distance will be the lower bound of the
            feature range, but the upper bound will be below the max feature range.
        """
        if self.cat_vars is None:
            raise TypeError('No categorical variables specified in the "cat_vars" argument.')

        if d_type not in ['abdm', 'mvdm', 'abdm-mvdm']:
            raise ValueError('d_type needs to be "abdm", "mvdm" or "abdm-mvdm". '
                             '{} is not supported.'.format(d_type))

        if self.ohe:
            X_ord, cat_vars_ord = ohe2ord(X, self.cat_vars)
        else:
            X_ord, cat_vars_ord = X, self.cat_vars

        # bin numerical features to compute the pairwise distance matrices
        cat_keys = list(cat_vars_ord.keys())
        n_ord = X_ord.shape[1]
        if d_type in ['abdm', 'abdm-mvdm'] and len(cat_keys) != n_ord:
            fnames = [str(_) for _ in range(n_ord)]
            disc = Discretizer(X_ord, cat_keys, fnames, percentiles=disc_perc)
            X_bin = disc.discretize(X_ord)
            cat_vars_bin = {k: len(disc.names[k]) for k in range(n_ord) if k not in cat_keys}
        else:
            X_bin = X_ord
            cat_vars_bin = {}

        # pairwise distances for categorical variables
        if d_type == 'abdm':
            d_pair = abdm(X_bin, cat_vars_ord, cat_vars_bin)
        elif d_type == 'mvdm':
            d_pair = mvdm(X_ord, y, cat_vars_ord, alpha=1)

        if (type(feature_range[0]) == type(feature_range[1]) and  # noqa
                type(feature_range[0]) in [int, float]):
            feature_range = (np.ones((1, n_ord)) * feature_range[0],
                             np.ones((1, n_ord)) * feature_range[1])

        if d_type == 'abdm-mvdm':
            # pairwise distances
            d_abdm = abdm(X_bin, cat_vars_ord, cat_vars_bin)
            d_mvdm = mvdm(X_ord, y, cat_vars_ord, alpha=1)

            # multidim scaled distances
            d_abs_abdm = multidim_scaling(d_abdm, n_components=2, use_metric=True,
                                          feature_range=feature_range,
                                          standardize_cat_vars=standardize_cat_vars,
                                          smooth=smooth, center=center,
                                          update_feature_range=False)[0]

            d_abs_mvdm = multidim_scaling(d_mvdm, n_components=2, use_metric=True,
                                          feature_range=feature_range,
                                          standardize_cat_vars=standardize_cat_vars,
                                          smooth=smooth, center=center,
                                          update_feature_range=False)[0]

            # combine abdm and mvdm
            for k, v in d_abs_abdm.items():
                self.d_abs[k] = v * w + d_abs_mvdm[k] * (1 - w)
                if center:  # center the numerical feature values
                    self.d_abs[k] -= .5 * (self.d_abs[k].max() + self.d_abs[k].min())
        else:
            self.d_abs = multidim_scaling(d_pair, n_components=2, use_metric=True,
                                          feature_range=feature_range,
                                          standardize_cat_vars=standardize_cat_vars,
                                          smooth=smooth, center=center,
                                          update_feature_range=False)[0]
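This fit method reads self.cat_vars and self.ohe and stores the scaled numerical category values in self.d_abs. A minimal usage sketch, assuming the method belongs to an outlier detector such as alibi-detect's Mahalanobis (the constructor arguments shown are assumptions):

import numpy as np
from alibi_detect.od import Mahalanobis  # assumed host class for this fit method

# column 0 is categorical with 3 categories; the remaining columns are numerical
X = np.concatenate([np.random.randint(0, 3, size=(100, 1)),
                    np.random.normal(size=(100, 3))], axis=1).astype(np.float32)

od = Mahalanobis(threshold=10., cat_vars={0: 3}, ohe=False)
od.fit(X, d_type='abdm')  # infer a numerical value for each of column 0's categories
print(od.d_abs[0])        # array with one scaled value per category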