def test_discretizer(cats_and_percentiles):
    """
    Check that `Discretizer` bins only the non-categorical columns of `x`.

    `cats_and_percentiles` supplies a pair: the categorical column indices
    and the percentiles used for binning. The data `x` and `feature_names`
    are taken from the enclosing scope (defined outside this function).
    """
    categorical_cols, percentiles = cats_and_percentiles
    n_cols = x.shape[1]
    discretizer = Discretizer(x, categorical_cols, feature_names, percentiles)

    # one entry in `names` is expected per column that is not categorical
    binned_cols = list(discretizer.names.keys())
    assert len(binned_cols) == (n_cols - len(categorical_cols))

    x_binned = discretizer.discretize(x)
    for col, bin_names in discretizer.names.items():
        binned_values = x_binned[:, col]
        # at most len(percentiles) + 1 bins, each column has a callable binning rule
        assert len(bin_names) <= len(percentiles) + 1
        assert callable(discretizer.lambdas[col])
        # bin ids are expected to span 0 .. len(percentiles)
        assert (binned_values.min() == 0).all()
        assert (binned_values.max() == len(percentiles)).all()

    # columns without bin names must pass through discretize() unchanged
    untouched_cols = [col for col in range(n_cols) if col not in binned_cols]
    for col in untouched_cols:
        assert (x_binned[:, col] == x[:, col]).all()
def inject_outlier_categorical(X: np.ndarray,
                               cols: List[int],
                               perc_outlier: int,
                               y: np.ndarray = None,
                               cat_perturb: dict = None,
                               X_fit: np.ndarray = None,
                               disc_perc: list = [25, 50, 75],
                               smooth: float = 1.) -> Bunch:
    """
    Inject outliers in categorical variables of tabular data.

    Parameters
    ----------
    X
        Tabular data with categorical variables to perturb (inject outliers).
    cols
        Columns of X that are categorical and can be perturbed.
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multiple categorical
        features, the percentage is evenly split across the features.
    y
        Outlier labels. The array is copied; the caller's array is not modified.
    cat_perturb
        Dictionary mapping each category in the categorical variables to their furthest neighbour.
        If None, the mapping is inferred from the data.
    X_fit
        Optional data used to infer pairwise distances from. Defaults to a copy of X.
        Only used when `cat_perturb` is None.
    disc_perc
        List with percentiles used in binning of numerical features used for the 'abdm'
        pairwise distance measure.
    smooth
        Smoothing exponent between 0 and 1 for the distances.
        Lower values will smooth the difference in distance metric between different features.

    Returns
    -------
    Bunch object with the perturbed tabular data (`data`), outlier labels (`target`), the dictionary
    used to map categories to their furthest neighbour (`cat_perturb`) and the scaled distances
    (`d_abs`, None when `cat_perturb` was passed in).

    Raises
    ------
    ValueError
        If `cols` is empty, since there is no column to perturb.
    """
    n_cols = len(cols)
    if n_cols == 0:
        raise ValueError('At least one categorical column needs to be specified in "cols".')

    if cat_perturb is None:
        # transform the categorical variables into numerical ones via
        # pairwise distances computed with abdm and multidim scaling
        X_fit = X.copy() if X_fit is None else X_fit

        # find number of categories for each categorical variable
        cat_vars = {k: len(np.unique(X_fit[:, k])) for k in cols}

        # TODO: extend method for OHE
        # Distances are inferred from X_fit so that the data matches the
        # category counts computed above (X_fit equals X when not supplied).
        X_ord, cat_vars_ord = X_fit, cat_vars

        # bin numerical features to compute the pairwise distance matrices
        n_ord = X_ord.shape[1]
        if n_cols != n_ord:
            fnames = [str(_) for _ in range(n_ord)]
            # pass a copy so the shared default list can never be altered downstream
            disc = Discretizer(X_ord, cols, fnames, percentiles=list(disc_perc))
            X_bin = disc.discretize(X_ord)
            cat_vars_bin = {k: len(disc.names[k]) for k in range(n_ord) if k not in cols}
        else:
            X_bin = X_ord
            cat_vars_bin = {}

        # pairwise distances for categorical variables
        d_pair = abdm(X_bin, cat_vars_ord, cat_vars_bin)

        # multidim scaling
        feature_range = (np.ones((1, n_ord)) * -1e10, np.ones((1, n_ord)) * 1e10)
        d_abs = multidim_scaling(d_pair,
                                 n_components=2,
                                 use_metric=True,
                                 standardize_cat_vars=True,
                                 smooth=smooth,
                                 feature_range=feature_range,
                                 update_feature_range=False)[0]

        # find furthest category away for each category in the categorical variables
        cat_perturb = {k: np.zeros(len(v)) for k, v in d_abs.items()}
        for k, v in d_abs.items():
            for i in range(len(v)):
                cat_perturb[k][i] = np.argmax(np.abs(v[i] - v))
    else:
        d_abs = None

    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples = X.shape[0]
    # astype returns a new array, so X itself is left untouched
    X_outlier = X.astype(np.float32)
    # copy the labels: they are updated in place below
    is_outlier = np.zeros(n_samples) if y is None else np.array(y)

    # distribute outliers evenly over different columns
    n_outlier = int(n_samples * perc_outlier * .01 / n_cols)
    for col in cols:
        # explicit integer dtype keeps the index valid when n_outlier is 0
        outlier_idx = np.sort(np.asarray(random.sample(range(n_samples), n_outlier), dtype=int))
        col_cat = X_outlier[outlier_idx, col].astype(int)
        # look up the furthest category of each selected row directly
        X_outlier[outlier_idx, col] = np.asarray(cat_perturb[col])[col_cat]
        is_outlier[outlier_idx] = 1

    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier,
                 target=is_outlier,
                 cat_perturb=cat_perturb,
                 d_abs=d_abs,
                 target_names=['normal', 'outlier'])
def fit(self, X: np.ndarray, y: np.ndarray = None, d_type: str = 'abdm', w: float = None,
        disc_perc: list = [25, 50, 75], standardize_cat_vars: bool = True,
        feature_range: tuple = (-1e10, 1e10), smooth: float = 1., center: bool = True
        ) -> None:
    """
    If categorical variables are present, then transform those to numerical values.
    This step is not necessary in the absence of categorical variables.

    The resulting numerical values are stored in `self.d_abs`.

    Parameters
    ----------
    X
        Batch of instances used to infer distances between categories from.
    y
        Model class predictions or ground truth labels for X.
        Used for 'mvdm' and 'abdm-mvdm' pairwise distance metrics.
        Note that this is only compatible with classification problems. For regression problems,
        use the 'abdm' distance metric.
    d_type
        Pairwise distance metric used for categorical variables. Currently, 'abdm', 'mvdm' and 'abdm-mvdm'
        are supported. 'abdm' infers context from the other variables while 'mvdm' uses the model predictions.
        'abdm-mvdm' is a weighted combination of the two metrics.
    w
        Weight on 'abdm' (between 0. and 1.) distance if d_type equals 'abdm-mvdm'.
        Required for that metric.
    disc_perc
        List with percentiles used in binning of numerical features used for the 'abdm'
        and 'abdm-mvdm' pairwise distance measures.
    standardize_cat_vars
        Standardize numerical values of categorical variables if True.
    feature_range
        Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
        numpy arrays with dimension (1x nb of features) for feature-wise ranges.
    smooth
        Smoothing exponent between 0 and 1 for the distances. Lower values will smooth the difference in
        distance metric between different features.
    center
        Whether to center the scaled distance measures. If False, the min distance for each feature
        except for the feature with the highest raw max distance will be the lower bound of the
        feature range, but the upper bound will be below the max feature range.

    Raises
    ------
    TypeError
        If no categorical variables were specified, or if `d_type` is 'abdm-mvdm' and `w` is None.
    ValueError
        If `d_type` is not one of the supported metrics.
    """
    if self.cat_vars is None:
        raise TypeError('No categorical variables specified in the "cat_vars" argument.')

    if d_type not in ['abdm', 'mvdm', 'abdm-mvdm']:
        raise ValueError('d_type needs to be "abdm", "mvdm" or "abdm-mvdm". '
                         '{} is not supported.'.format(d_type))

    # fail before any distance computation instead of on `v * None` at the very end
    if d_type == 'abdm-mvdm' and w is None:
        raise TypeError('The weight "w" needs to be specified if d_type equals "abdm-mvdm".')

    if self.ohe:
        X_ord, cat_vars_ord = ohe2ord(X, self.cat_vars)
    else:
        X_ord, cat_vars_ord = X, self.cat_vars

    # bin numerical features to compute the pairwise distance matrices
    cat_keys = list(cat_vars_ord.keys())
    n_ord = X_ord.shape[1]
    if d_type in ['abdm', 'abdm-mvdm'] and len(cat_keys) != n_ord:
        fnames = [str(_) for _ in range(n_ord)]
        # pass a copy so the shared default list can never be altered downstream
        disc = Discretizer(X_ord, cat_keys, fnames, percentiles=list(disc_perc))
        X_bin = disc.discretize(X_ord)
        cat_vars_bin = {k: len(disc.names[k]) for k in range(n_ord) if k not in cat_keys}
    else:
        X_bin = X_ord
        cat_vars_bin = {}

    # broadcast scalar bounds to feature-wise (1 x n_ord) arrays; isinstance also
    # covers mixed int/float pairs and NumPy scalars, which an exact type match misses
    scalar_types = (int, float, np.integer, np.floating)
    range_min, range_max = feature_range[0], feature_range[1]
    if isinstance(range_min, scalar_types) and isinstance(range_max, scalar_types):
        feature_range = (np.ones((1, n_ord)) * range_min,
                         np.ones((1, n_ord)) * range_max)

    # settings shared by every multidimensional scaling call below
    scaling_kwargs = dict(n_components=2, use_metric=True,
                          feature_range=feature_range,
                          standardize_cat_vars=standardize_cat_vars,
                          smooth=smooth, center=center,
                          update_feature_range=False)

    if d_type == 'abdm-mvdm':
        # pairwise distances
        d_abdm = abdm(X_bin, cat_vars_ord, cat_vars_bin)
        d_mvdm = mvdm(X_ord, y, cat_vars_ord, alpha=1)

        # multidim scaled distances
        d_abs_abdm = multidim_scaling(d_abdm, **scaling_kwargs)[0]
        d_abs_mvdm = multidim_scaling(d_mvdm, **scaling_kwargs)[0]

        # combine abdm and mvdm
        for k, v in d_abs_abdm.items():
            combined = v * w + d_abs_mvdm[k] * (1 - w)
            if center:
                # center the numerical feature values
                combined -= .5 * (combined.max() + combined.min())
            self.d_abs[k] = combined
    else:
        # pairwise distances for categorical variables
        if d_type == 'abdm':
            d_pair = abdm(X_bin, cat_vars_ord, cat_vars_bin)
        else:
            d_pair = mvdm(X_ord, y, cat_vars_ord, alpha=1)
        self.d_abs = multidim_scaling(d_pair, **scaling_kwargs)[0]