import logging
import os
import random
from io import BytesIO
from typing import List, Tuple, Union

import dill
import numpy as np
import pandas as pd
import requests
from requests import RequestException
from sklearn.datasets import fetch_kddcup99
from sklearn.utils import Bunch

# NOTE (assumption): the import paths below are a best guess for the
# project-internal helpers used in this module; `load_url_arff`,
# `load_genome_npz` and `corruption_types_cifar10c` are assumed to be
# defined elsewhere in this module or package.
from alibi_detect.utils.discretizer import Discretizer
from alibi_detect.utils.distance import abdm, multidim_scaling
from alibi_detect.utils.mapping import ohe2ord

logger = logging.getLogger(__name__)


def inject_outlier_ts(X: np.ndarray,
                      perc_outlier: int,
                      perc_window: int = 10,
                      n_std: float = 2.,
                      min_std: float = 1.) -> Bunch:
    """
    Inject outliers in both univariate and multivariate time series data.

    Parameters
    ----------
    X
        Time series data to perturb (inject outliers).
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multivariate data,
        the percentage is evenly split across the individual time series.
    perc_window
        Percentage of the observations used to compute the standard deviation used in the perturbation.
    n_std
        Number of standard deviations in the window used to perturb the original data.
    min_std
        Minimum number of standard deviations away from the current observation. This is included
        because of the stochastic nature of the perturbation, which could otherwise lead to
        minimal perturbations without a floor.

    Returns
    -------
    Bunch object with the perturbed time series and the outlier labels.
    """
    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples, n_ts = X.shape
    X_outlier = X.copy()
    is_outlier = np.zeros(n_samples)
    # one-sided window used to compute mean and stdev from
    window = int(perc_window * n_samples * .5 / 100)
    # distribute outliers evenly over the different time series
    n_outlier = int(n_samples * perc_outlier * .01 / n_ts)
    if n_outlier == 0:
        return Bunch(data=X_outlier, target=is_outlier, target_names=['normal', 'outlier'])
    for s in range(n_ts):
        outlier_idx = np.sort(random.sample(range(n_samples), n_outlier))
        window_idx = [
            np.maximum(outlier_idx - window, 0),
            np.minimum(outlier_idx + window, n_samples)
        ]
        stdev = np.array([
            X_outlier[window_idx[0][i]:window_idx[1][i], s].std()
            for i in range(len(outlier_idx))
        ])
        rnd = np.random.normal(size=n_outlier)
        X_outlier[outlier_idx, s] += np.sign(rnd) * np.maximum(np.abs(rnd * n_std), min_std) * stdev
        is_outlier[outlier_idx] = 1
    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier, target=is_outlier, target_names=['normal', 'outlier'])
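
# Example usage (illustrative sketch): perturb a synthetic univariate sine wave.
# The series and all parameter values below are made up for demonstration.
#
#   ts = np.sin(np.linspace(0, 10 * np.pi, 1000))
#   bunch = inject_outlier_ts(ts, perc_outlier=10, perc_window=10, n_std=2., min_std=1.)
#   X_perturbed, y_outlier = bunch.data, bunch.target  # exactly 100 of the 1000 labels equal 1
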
def inject_outlier_tabular(X: np.ndarray,
                           cols: List[int],
                           perc_outlier: int,
                           y: np.ndarray = None,
                           n_std: float = 2.,
                           min_std: float = 1.) -> Bunch:
    """
    Inject outliers in numerical tabular data.

    Parameters
    ----------
    X
        Tabular data to perturb (inject outliers).
    cols
        Columns of X that are numerical and can be perturbed.
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multiple numerical features,
        the percentage is evenly split across the features.
    y
        Outlier labels.
    n_std
        Number of feature-wise standard deviations used to perturb the original data.
    min_std
        Minimum number of standard deviations away from the current observation. This is included
        because of the stochastic nature of the perturbation, which could otherwise lead to
        minimal perturbations without a floor.

    Returns
    -------
    Bunch object with the perturbed tabular data and the outlier labels.
    """
    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples, n_features = X.shape
    X_outlier = X.astype(np.float32).copy()
    if y is None:
        is_outlier = np.zeros(n_samples)
    else:
        is_outlier = y
    n_cols = len(cols)

    # distribute outliers evenly over the different columns
    n_outlier = int(n_samples * perc_outlier * .01 / n_cols)
    if n_outlier == 0:
        return Bunch(data=X_outlier, target=is_outlier, target_names=['normal', 'outlier'])

    # add perturbations
    stdev = X_outlier.std(axis=0)
    for col in cols:
        outlier_idx = np.sort(random.sample(range(n_samples), n_outlier))
        rnd = np.random.normal(size=n_outlier)
        X_outlier[outlier_idx, col] += np.sign(rnd) * np.maximum(np.abs(rnd * n_std), min_std) * stdev[col]
        is_outlier[outlier_idx] = 1
    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier, target=is_outlier, target_names=['normal', 'outlier'])
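
# Example usage (illustrative sketch): perturb two numerical columns of random
# tabular data; the data and parameters are invented for demonstration.
#
#   X = np.random.randn(1000, 4)
#   bunch = inject_outlier_tabular(X, cols=[0, 2], perc_outlier=10)
#   X_perturbed, y_outlier = bunch.data, bunch.target
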
def fetch_ecg(return_X_y: bool = False) \
        -> Union[Bunch, Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]:
    """
    Fetch ECG5000 data. The dataset contains 5000 ECGs, originally obtained from Physionet
    (https://archive.physionet.org/cgi-bin/atm/ATM) under the name
    "BIDMC Congestive Heart Failure Database (chfdb)", record "chf07".

    Parameters
    ----------
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Train and test datasets with labels.
    (train data, train target), (test data, test target)
        Tuple of tuples if 'return_X_y' equals True.
    """
    Xy_train = load_url_arff('https://storage.googleapis.com/seldon-datasets/ecg/ECG5000_TRAIN.arff')
    X_train, y_train = Xy_train[:, :-1], Xy_train[:, -1]
    Xy_test = load_url_arff('https://storage.googleapis.com/seldon-datasets/ecg/ECG5000_TEST.arff')
    X_test, y_test = Xy_test[:, :-1], Xy_test[:, -1]
    if return_X_y:
        return (X_train, y_train), (X_test, y_test)
    return Bunch(data_train=X_train,
                 data_test=X_test,
                 target_train=y_train,
                 target_test=y_test)
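
# Example usage (illustrative sketch; requires internet access):
#
#   (X_train, y_train), (X_test, y_test) = fetch_ecg(return_X_y=True)
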
def fetch_cifar10c(corruption: Union[str, List[str]], severity: int, return_X_y: bool = False) \
        -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    Fetch CIFAR-10-C data. Originally obtained from https://zenodo.org/record/2535967#.XkKh2XX7Qts and
    introduced in "Hendrycks, D and Dietterich, T.G. Benchmarking Neural Network Robustness to Common
    Corruptions and Perturbations. In 7th International Conference on Learning Representations, 2019.".

    Parameters
    ----------
    corruption
        Corruption type. Options can be checked with `corruption_types_cifar10c()`.
        Alternatively, specify 'all' for all corruptions at a severity level.
    severity
        Severity level of corruption (1-5).
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Corrupted dataset with labels.
    (corrupted data, target)
        Tuple if 'return_X_y' equals True.
    """
    url = 'https://storage.googleapis.com/seldon-datasets/cifar10c/'
    n = 10000  # instances per corrupted test set
    istart, iend = (severity - 1) * n, severity * n  # idx for the relevant severity level
    corruption_list = corruption_types_cifar10c()  # get all possible corruption types

    # convert input to list
    if isinstance(corruption, str) and corruption != 'all':
        corruption = [corruption]
    elif corruption == 'all':
        corruption = corruption_list

    # check values in corruptions
    for corr in corruption:
        if corr not in corruption_list:
            raise ValueError(f'{corr} is not a valid corruption type.')

    # get corrupted data; URLs are built with string concatenation instead of
    # os.path.join so the separators are correct on all platforms
    shape = (len(corruption) * n, 32, 32, 3)
    X = np.zeros(shape)
    for i, corr in enumerate(corruption):
        url_corruption = url + corr + '.npy'
        resp = requests.get(url_corruption)
        X_corr = np.load(BytesIO(resp.content))[istart:iend].astype('float32')
        X[i * n:(i + 1) * n] = X_corr

    # get labels
    url_labels = url + 'labels.npy'
    resp = requests.get(url_labels)
    y = np.load(BytesIO(resp.content))[istart:iend].astype('int64')
    if X.shape[0] != y.shape[0]:  # repeat labels for each corruption type fetched
        repeat = X.shape[0] // y.shape[0]
        y = np.tile(y, (repeat,))

    if return_X_y:
        return (X, y)
    return Bunch(data=X, target=y)
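
# Example usage (illustrative sketch; requires internet access). 'gaussian_noise'
# is one of the standard CIFAR-10-C corruption types; any value returned by
# `corruption_types_cifar10c()` would work.
#
#   X_corr, y_corr = fetch_cifar10c(corruption='gaussian_noise', severity=3, return_X_y=True)
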
def fetch_genome(return_X_y: bool = False, return_labels: bool = False) -> Union[Bunch, tuple]:
    """
    Load genome data, including the genome labels and whether the genomes are outliers.
    More details about the data can be found in the readme on
    https://console.cloud.google.com/storage/browser/seldon-datasets/genome/. The original data
    can be found here: https://drive.google.com/drive/folders/1Ht9xmzyYPbDouUTl_KQdLTJQYX2CuclR.

    Parameters
    ----------
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.
    return_labels
        Whether to return the genome labels, which are detailed in the `label_dict` key
        of the returned Bunch object.

    Returns
    -------
    Bunch
        Training, validation and test data, whether they are outliers and optionally the genome
        labels, with the label mapping available as a dictionary under the `label_dict` key.
    (data, outlier) or (data, outlier, target)
        Tuple for the train, validation and test set with either the data and whether they are
        outliers or the data, outlier flag and labels for the genomes if 'return_X_y' equals True.
    """
    data_train = load_genome_npz('train_in', return_labels=return_labels)
    data_val_in = load_genome_npz('val_in', return_labels=return_labels)
    data_val_ood = load_genome_npz('val_ood', return_labels=return_labels)
    data_val = (np.concatenate([data_val_in[0], data_val_ood[0]]),
                np.concatenate([data_val_in[1], data_val_ood[1]]))
    data_test_in = load_genome_npz('test_in', return_labels=return_labels)
    data_test_ood = load_genome_npz('test_ood', return_labels=return_labels)
    data_test = (np.concatenate([data_test_in[0], data_test_ood[0]]),
                 np.concatenate([data_test_in[1], data_test_ood[1]]))
    if return_labels:
        data_val += (np.concatenate([data_val_in[2], data_val_ood[2]]),)  # type: ignore
        data_test += (np.concatenate([data_test_in[2], data_test_ood[2]]),)  # type: ignore
    if return_X_y:
        return data_train, data_val, data_test

    resp = requests.get('https://storage.googleapis.com/seldon-datasets/genome/label_dict.json')
    label_dict = resp.json()
    bunch = Bunch(data_train=data_train[0],
                  data_val=data_val[0],
                  data_test=data_test[0],
                  outlier_train=data_train[1],
                  outlier_val=data_val[1],
                  outlier_test=data_test[1],
                  label_dict=label_dict)
    if not return_labels:
        return bunch
    bunch['target_train'] = data_train[2]  # type: ignore
    bunch['target_val'] = data_val[2]  # type: ignore
    bunch['target_test'] = data_test[2]  # type: ignore
    return bunch
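
# Example usage (illustrative sketch; requires internet access):
#
#   (X_train, out_train), (X_val, out_val), (X_test, out_test) = fetch_genome(return_X_y=True)
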
def fetch_attack(dataset: str, model: str, attack: str, return_X_y: bool = False) \
        -> Union[Bunch, Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]:
    """
    Load adversarial instances for a given dataset, model and attack type.

    Parameters
    ----------
    dataset
        Dataset under attack.
    model
        Model under attack.
    attack
        Attack name.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Adversarial instances with original labels.
    (train data, train target), (test data, test target)
        Tuple of tuples if 'return_X_y' equals True.
    """
    # define paths; the URL is built with string formatting instead of
    # os.path.join so the separators are correct on all platforms
    url = 'https://storage.googleapis.com/seldon-datasets/'
    path_attack = f'{url}{dataset}/attacks/{model}/{attack}'
    path_data = path_attack + '.npz'
    path_meta = path_attack + '_meta.pickle'

    # get adversarial instances and labels
    try:
        resp = requests.get(path_data, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise
    data = np.load(BytesIO(resp.content))
    X_train, X_test = data['X_train_adv'], data['X_test_adv']
    y_train, y_test = data['y_train'], data['y_test']
    if return_X_y:
        return (X_train, y_train), (X_test, y_test)

    # get metadata
    try:
        resp = requests.get(path_meta, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise
    meta = dill.load(BytesIO(resp.content))
    return Bunch(data_train=X_train,
                 data_test=X_test,
                 target_train=y_train,
                 target_test=y_test,
                 meta=meta)
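
# Example usage (illustrative sketch; requires internet access). The dataset,
# model and attack names below are assumptions for illustration only; valid
# values depend on what is hosted in the seldon-datasets bucket.
#
#   (X_train_adv, y_train), (X_test_adv, y_test) = fetch_attack(
#       'cifar10', 'resnet56', 'cw', return_X_y=True)
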
def fetch_nab(ts: str, return_X_y: bool = False) -> Union[Bunch, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Get a time series in a DataFrame from the Numenta Anomaly Benchmark:
    https://github.com/numenta/NAB.

    Parameters
    ----------
    ts
        Name of the time series in the NAB repository, including its folder
        but excluding the .csv extension, e.g. 'realKnownCause/nyc_taxi'.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Dataset and outlier labels (0 means 'normal' and 1 means 'outlier') in DataFrames
        with timestamps.
    (data, target)
        Tuple if 'return_X_y' equals True.
    """
    url_labels = 'https://raw.githubusercontent.com/numenta/NAB/master/labels/combined_labels.json'
    try:
        resp = requests.get(url_labels, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise
    labels_json = resp.json()
    outliers = labels_json[ts + '.csv']
    if not outliers:
        logger.warning('The dataset does not contain any outliers.')
    url = 'https://raw.githubusercontent.com/numenta/NAB/master/data/' + ts + '.csv'
    df = pd.read_csv(url, header=0, index_col=0)
    labels = np.zeros(df.shape[0])
    for outlier in outliers:
        outlier_id = np.where(df.index == outlier)[0][0]
        labels[outlier_id] = 1
    df.index = pd.to_datetime(df.index)
    df_labels = pd.DataFrame(data={'is_outlier': labels}, index=df.index)
    if return_X_y:
        return df, df_labels
    return Bunch(data=df, target=df_labels, target_names=['normal', 'outlier'])
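
# Example usage (illustrative sketch; requires internet access).
# 'realKnownCause/nyc_taxi' is one of the time series in the NAB repository.
#
#   df, df_labels = fetch_nab('realKnownCause/nyc_taxi', return_X_y=True)
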
def fetch_kdd(target: list = ['dos', 'r2l', 'u2r', 'probe'],
              keep_cols: list = [
                  'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
                  'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
                  'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
                  'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                  'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
                  'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                  'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
              ],
              percent10: bool = True,
              return_X_y: bool = False) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    KDD Cup '99 dataset. Detect computer network intrusions.

    Parameters
    ----------
    target
        List with attack types to detect.
    keep_cols
        List with columns to keep. Defaults to continuous features.
    percent10
        Bool, whether to only return 10% of the data.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Dataset and outlier labels (0 means 'normal' and 1 means 'outlier').
    (data, target)
        Tuple if 'return_X_y' equals True.
    """
    # fetch raw data
    data_raw = fetch_kddcup99(subset=None, data_home=None, percent10=percent10)

    # specify columns
    cols = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate'
    ]

    # create dataframe
    data = pd.DataFrame(data=data_raw['data'], columns=cols)

    # add target to dataframe
    data['attack_type'] = data_raw['target']

    # specify and map attack types to their categories
    attack_list = np.unique(data['attack_type'])
    attack_category = [
        'dos', 'u2r', 'r2l', 'r2l', 'r2l', 'probe', 'dos', 'u2r', 'r2l', 'dos',
        'probe', 'normal', 'u2r', 'r2l', 'dos', 'probe', 'u2r', 'probe', 'dos',
        'r2l', 'dos', 'r2l', 'r2l'
    ]
    attack_types = dict(zip(attack_list, attack_category))

    # use .loc instead of chained indexing to avoid assignments
    # silently failing on a copy in newer versions of pandas
    data['attack_category'] = 'normal'
    for k, v in attack_types.items():
        data.loc[data['attack_type'] == k, 'attack_category'] = v

    # define target
    data['target'] = 0
    for t in target:
        data.loc[data['attack_category'] == t, 'target'] = 1
    is_outlier = data['target'].values

    # drop the columns that are not kept
    drop_cols = [col for col in data.columns.values if col not in keep_cols]
    if drop_cols:
        data.drop(columns=drop_cols, inplace=True)

    if return_X_y:
        return data.values, is_outlier

    return Bunch(data=data.values,
                 target=is_outlier,
                 target_names=['normal', 'outlier'],
                 feature_names=keep_cols)
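
# Example usage (illustrative sketch; downloads the dataset via scikit-learn):
#
#   X, y_outlier = fetch_kdd(target=['dos'], percent10=True, return_X_y=True)
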
def inject_outlier_categorical(X: np.ndarray,
                               cols: List[int],
                               perc_outlier: int,
                               y: np.ndarray = None,
                               cat_perturb: dict = None,
                               X_fit: np.ndarray = None,
                               disc_perc: list = [25, 50, 75],
                               smooth: float = 1.) -> Bunch:
    """
    Inject outliers in categorical variables of tabular data.

    Parameters
    ----------
    X
        Tabular data with categorical variables to perturb (inject outliers).
    cols
        Columns of X that are categorical and can be perturbed.
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multiple categorical features,
        the percentage is evenly split across the features.
    y
        Outlier labels.
    cat_perturb
        Dictionary mapping each category in the categorical variables to their furthest neighbour.
    X_fit
        Optional data used to infer pairwise distances from.
    disc_perc
        List with percentiles used in binning of numerical features used for the 'abdm'
        pairwise distance measure.
    smooth
        Smoothing exponent between 0 and 1 for the distances.
        Lower values will smooth the difference in distance metric between different features.

    Returns
    -------
    Bunch object with the perturbed tabular data, outlier labels and
    a dictionary used to map categories to their furthest neighbour.
    """
    if cat_perturb is None:
        # transform the categorical variables into numerical ones via
        # pairwise distances computed with abdm and multidim scaling
        X_fit = X.copy() if X_fit is None else X_fit

        # find number of categories for each categorical variable
        cat_vars = {k: None for k in cols}
        for k in cols:
            cat_vars[k] = len(np.unique(X_fit[:, k]))  # type: ignore

        # TODO: extend method for OHE
        ohe = False
        if ohe:
            X_ord, cat_vars_ord = ohe2ord(X, cat_vars)
        else:
            X_ord, cat_vars_ord = X, cat_vars

        # bin numerical features to compute the pairwise distance matrices
        n_ord = X_ord.shape[1]
        if len(cols) != n_ord:
            fnames = [str(_) for _ in range(n_ord)]
            disc = Discretizer(X_ord, cols, fnames, percentiles=disc_perc)
            X_bin = disc.discretize(X_ord)
            cat_vars_bin = {k: len(disc.names[k]) for k in range(n_ord) if k not in cols}
        else:
            X_bin = X_ord
            cat_vars_bin = {}

        # pairwise distances for categorical variables
        d_pair = abdm(X_bin, cat_vars_ord, cat_vars_bin)

        # multidim scaling
        feature_range = (np.ones((1, n_ord)) * -1e10, np.ones((1, n_ord)) * 1e10)
        d_abs = multidim_scaling(d_pair,
                                 n_components=2,
                                 use_metric=True,
                                 standardize_cat_vars=True,
                                 smooth=smooth,
                                 feature_range=feature_range,
                                 update_feature_range=False)[0]

        # find the furthest category away for each category in the categorical variables
        cat_perturb = {k: np.zeros(len(v)) for k, v in d_abs.items()}
        for k, v in d_abs.items():
            for i in range(len(v)):
                cat_perturb[k][i] = np.argmax(np.abs(v[i] - v))
    else:
        d_abs = None

    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples, n_features = X.shape
    X_outlier = X.astype(np.float32).copy()
    if y is None:
        is_outlier = np.zeros(n_samples)
    else:
        is_outlier = y
    n_cols = len(cols)

    # distribute outliers evenly over the different columns
    n_outlier = int(n_samples * perc_outlier * .01 / n_cols)
    for col in cols:
        outlier_idx = np.sort(random.sample(range(n_samples), n_outlier))
        col_cat = X_outlier[outlier_idx, col].astype(int)
        col_map = np.tile(cat_perturb[col], (n_outlier, 1))
        X_outlier[outlier_idx, col] = np.diag(col_map.T[col_cat])
        is_outlier[outlier_idx] = 1
    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier,
                 target=is_outlier,
                 cat_perturb=cat_perturb,
                 d_abs=d_abs,
                 target_names=['normal', 'outlier'])
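
# Example usage (illustrative sketch): perturb one integer-encoded categorical
# column (5 categories, encoded 0-4) alongside two numerical columns; the data
# and parameters are invented for demonstration.
#
#   X = np.concatenate([np.random.randint(0, 5, size=(1000, 1)),
#                       np.random.randn(1000, 2)], axis=1)
#   bunch = inject_outlier_categorical(X, cols=[0], perc_outlier=10)
#   X_perturbed, y_outlier = bunch.data, bunch.target
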