def preprocess_drift(x: Union[np.ndarray, list], model: tf.keras.Model, preprocess_batch_fn: Callable = None,
                     tokenizer: Callable = None, max_len: int = None, batch_size: int = int(1e10),
                     dtype: np.dtype = np.float32) -> Union[np.ndarray, tf.Tensor]:
    """
    Prediction function used for the preprocessing step of a drift detector.

    Parameters
    ----------
    x
        Batch of instances.
    model
        Model used for preprocessing.
    preprocess_batch_fn
        Optional batch preprocessing function. For example to convert a list of objects
        to a batch which can be processed by the TensorFlow model.
    tokenizer
        Optional tokenizer for text drift.
    max_len
        Optional max token length for text drift.
    batch_size
        Batch size.
    dtype
        Model output type, e.g. np.float32 or tf.float32.

    Returns
    -------
    Numpy array or tensor with predictions.
    """
    if tokenizer is None:
        return predict_batch(x, model, batch_size=batch_size, preprocess_fn=preprocess_batch_fn, dtype=dtype)
    else:
        return predict_batch_transformer(x, model, tokenizer, max_len, batch_size=batch_size, dtype=dtype)

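# Usage sketch (illustrative, not part of the library source above): freeze
# `preprocess_drift` into a one-argument function with `functools.partial` and
# pass it to a detector's `preprocess_fn`. The untrained `encoder` below is a
# hypothetical stand-in for a real dimensionality-reducing model.
from functools import partial

import numpy as np
import tensorflow as tf
from alibi_detect.cd import KSDrift

encoder = tf.keras.Sequential([tf.keras.layers.Flatten(), tf.keras.layers.Dense(32)])
preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=512)
x_ref = np.random.randn(500, 8, 8, 1).astype(np.float32)
cd = KSDrift(x_ref, p_val=.05, preprocess_fn=preprocess_fn)
preds = cd.predict(np.random.randn(100, 8, 8, 1).astype(np.float32))  # preds['data']['is_drift']
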
def score(self, X: np.ndarray, outlier_perc: float = 100., batch_size: int = int(1e10)) \
        -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute feature and instance level outlier scores.

    Parameters
    ----------
    X
        Univariate or multivariate time series.
    outlier_perc
        Percentage of sorted feature level outlier scores used to predict instance level outlier.
    batch_size
        Batch size used when making predictions with the seq2seq model.

    Returns
    -------
    Feature and instance level outlier scores.
    """
    # use the seq2seq model to reconstruct instances
    orig_shape = X.shape
    if len(orig_shape) == 2:
        X = X.reshape(self.shape)
    X_recon, threshold_est = predict_batch(X, self.seq2seq.decode_seq, batch_size=batch_size)
    if len(orig_shape) == 2:  # reshape back to original shape
        X = X.reshape(orig_shape)
        X_recon = X_recon.reshape(orig_shape)
        threshold_est = threshold_est.reshape(orig_shape)

    # compute feature and instance level scores
    fscore = self.feature_score(X, X_recon, threshold_est)
    iscore = self.instance_score(fscore, outlier_perc=outlier_perc)
    return fscore, iscore

def score(self, X: np.ndarray, outlier_perc: float = 100., batch_size: int = int(1e10)) \
        -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute feature and instance level outlier scores.

    Parameters
    ----------
    X
        Batch of instances.
    outlier_perc
        Percentage of sorted feature level outlier scores used to predict instance level outlier.
    batch_size
        Batch size used when making predictions with the VAE.

    Returns
    -------
    Feature and instance level outlier scores.
    """
    # sample reconstructed instances
    X_samples = np.repeat(X, self.samples, axis=0)
    X_recon = predict_batch(X_samples, self.vae, batch_size=batch_size)

    # compute feature and instance level scores
    fscore = self.feature_score(X_samples, X_recon)
    iscore = self.instance_score(fscore, outlier_perc=outlier_perc)
    return fscore, iscore

def score(self, X: np.ndarray, outlier_perc: float = 100., batch_size: int = int(1e10)) \
        -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute feature and instance level outlier scores.

    Parameters
    ----------
    X
        Batch of instances.
    outlier_perc
        Percentage of sorted feature level outlier scores used to predict instance level outlier.
    batch_size
        Batch size used when making predictions with the autoencoder.

    Returns
    -------
    Feature and instance level outlier scores.
    """
    # reconstruct instances
    X_recon = predict_batch(X, self.ae, batch_size=batch_size)

    # compute feature and instance level scores
    fscore = self.feature_score(X, X_recon)  # type: ignore[arg-type]
    iscore = self.instance_score(fscore, outlier_perc=outlier_perc)
    return fscore, iscore

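# Hedged sketch of the `feature_score` / `instance_score` pattern referenced
# above: feature scores as per-feature squared reconstruction errors, instance
# scores as the mean over the top `outlier_perc` percent of sorted feature
# scores. Illustrative only; the library's implementations may differ in detail.
import numpy as np

def _feature_score_sketch(X: np.ndarray, X_recon: np.ndarray) -> np.ndarray:
    return (X - X_recon) ** 2  # squared error per feature

def _instance_score_sketch(fscore: np.ndarray, outlier_perc: float = 100.) -> np.ndarray:
    fscore_flat = fscore.reshape(fscore.shape[0], -1)  # flatten feature dims
    n_score = int(np.ceil(.01 * outlier_perc * fscore_flat.shape[1]))  # nb of scores kept
    sorted_fscore = np.sort(fscore_flat, axis=1)[:, -n_score:]  # largest scores last
    return np.mean(sorted_fscore, axis=1)
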
def logp_alt(self, model: tf.keras.Model, X: np.ndarray, return_per_feature: bool = False,
             batch_size: int = int(1e10)) -> np.ndarray:
    """
    Compute log probability of a batch of instances using the log_prob function
    defined by the user.

    Parameters
    ----------
    model
        Trained model.
    X
        Batch of instances.
    return_per_feature
        Return log probability per feature.
    batch_size
        Batch size for the generative model evaluations.

    Returns
    -------
    Log probabilities.
    """
    if self.sequential:
        y, X = X[:, 1:], X[:, :-1]
    else:
        y = X.copy()
    y_preds = predict_batch(X, model, batch_size=batch_size)
    logp = self.log_prob(y, y_preds).numpy()
    if return_per_feature:
        return logp
    else:
        axis = tuple(np.arange(len(logp.shape))[1:])
        return np.mean(logp, axis=axis)

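# The `y, X = X[:, 1:], X[:, :-1]` split above is the standard autoregressive
# shift: inputs are all tokens but the last, targets all tokens but the first,
# so the model predicts token t from tokens < t. Tiny worked example:
import numpy as np

seq = np.array([[101, 7, 12, 9, 102]])
inp, tgt = seq[:, :-1], seq[:, 1:]
# inp == [[101, 7, 12, 9]]; tgt == [[7, 12, 9, 102]]
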
def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> np.ndarray:
    """
    Compute outlier scores.

    Parameters
    ----------
    X
        Batch of instances to analyze.
    batch_size
        Batch size used when making predictions with the VAEGMM.

    Returns
    -------
    Array with outlier scores for each instance in the batch.
    """
    # draw samples from latent space
    X_samples = np.repeat(X, self.samples, axis=0)
    _, z, _ = predict_batch(X_samples, self.vaegmm, batch_size=batch_size)

    # compute average energy for samples
    energy, _ = gmm_energy(z, self.phi, self.mu, self.cov, self.L, self.log_det_cov, return_mean=False)
    energy_samples = energy.numpy().reshape((-1, self.samples))
    iscore = np.mean(energy_samples, axis=-1)
    return iscore

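# The repeat-then-reshape pattern above averages the energy over `samples`
# Monte Carlo draws per instance: `np.repeat` with axis=0 keeps the copies of
# each instance contiguous, so `reshape((-1, samples))` groups the draws that
# belong to the same instance into one row. Shape check (illustrative):
import numpy as np

samples = 3
X_demo = np.arange(2).reshape(2, 1)                 # 2 instances
X_rep = np.repeat(X_demo, samples, axis=0)          # shape (6, 1): 0,0,0,1,1,1
energies = X_rep[:, 0].astype(float)                # stand-in for per-sample energies
iscore_demo = energies.reshape(-1, samples).mean(axis=-1)  # shape (2,): [0., 1.]
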
def score(self, X: np.ndarray, batch_size: int = int(1e10), return_predictions: bool = False) \
        -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """
    Compute adversarial scores.

    Parameters
    ----------
    X
        Batch of instances to analyze.
    batch_size
        Batch size used when computing scores.
    return_predictions
        Whether to return the predictions of the classifier on the original and reconstructed instances.

    Returns
    -------
    Array with adversarial scores for each instance in the batch.
    """
    # reconstructed instances
    X_recon = predict_batch(X, self.ae, batch_size=batch_size)

    # model predictions
    y = predict_batch(X, self.model, batch_size=batch_size)
    y_recon = predict_batch(X_recon, self.model, batch_size=batch_size)
    y = cast(np.ndarray, y)  # help mypy out
    y_recon = cast(np.ndarray, y_recon)  # help mypy out

    # scale predictions
    if self.temperature != 1.:
        y = y ** (1 / self.temperature)
        y = (y / tf.reshape(tf.reduce_sum(y, axis=-1), (-1, 1))).numpy()

    adv_score = kld(y, y_recon).numpy()

    # hidden layer predictions
    if isinstance(self.model_hl, list):
        for m, w in zip(self.model_hl, self.w_model_hl):
            h = predict_batch(X, m, batch_size=batch_size)
            h_recon = predict_batch(X_recon, m, batch_size=batch_size)
            adv_score += w * kld(h, h_recon).numpy()

    if return_predictions:
        return adv_score, y, y_recon
    else:
        return adv_score

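# The temperature scaling above sharpens (temperature < 1) or flattens
# (temperature > 1) the predicted distribution before the KL divergence is
# taken. Worked example with temperature = 0.5, i.e. squaring then renormalising:
import numpy as np

probs = np.array([[.6, .3, .1]])
scaled = probs ** (1 / .5)                      # [[.36, .09, .01]]
scaled /= scaled.sum(axis=-1, keepdims=True)    # ~[[.783, .196, .022]]
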
def score(self, X: np.ndarray, batch_size: int = int(1e10), return_predictions: bool = False) \
        -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """
    Compute adversarial scores.

    Parameters
    ----------
    X
        Batch of instances to analyze.
    batch_size
        Batch size used when computing scores.
    return_predictions
        Whether to return the predictions of the classifier on the original and reconstructed instances.

    Returns
    -------
    Array with adversarial scores for each instance in the batch.
    """
    # model predictions
    y = predict_batch(X, self.model, batch_size=batch_size)
    y_distilled = predict_batch(X, self.distilled_model, batch_size=batch_size)

    # scale predictions
    if self.temperature != 1.:
        y = y ** (1 / self.temperature)  # type: ignore
        y = (y / tf.reshape(tf.reduce_sum(y, axis=-1), (-1, 1))).numpy()

    if self.loss_type == 'kld':
        score = kld(y, y_distilled).numpy()
    elif self.loss_type == 'xent':
        score = categorical_crossentropy(y, y_distilled).numpy()
    else:
        raise NotImplementedError

    if return_predictions:
        return score, y, y_distilled
    else:
        return score

def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> np.ndarray:
    """
    Compute outlier scores.

    Parameters
    ----------
    X
        Batch of instances to analyze.
    batch_size
        Batch size used when making predictions with the AEGMM.

    Returns
    -------
    Array with outlier scores for each instance in the batch.
    """
    _, z, _ = predict_batch(X, self.aegmm, batch_size=batch_size)
    energy, _ = gmm_energy(z, self.phi, self.mu, self.cov, self.L, self.log_det_cov, return_mean=False)
    return energy.numpy()

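# Hedged sketch of the sample energy computed by `gmm_energy` above, following
# the DAGMM formulation E(z) = -log(sum_k phi_k * N(z; mu_k, cov_k)): low
# energy means the latent code z is well covered by the fitted mixture. This
# dense scipy version is illustrative only; the library works with Cholesky
# factors (`self.L`) and log-determinants for numerical stability.
import numpy as np
from scipy.stats import multivariate_normal

def _gmm_energy_sketch(z: np.ndarray, phi: np.ndarray, mu: np.ndarray, cov: np.ndarray) -> np.ndarray:
    density = sum(p * multivariate_normal.pdf(z, mean=m, cov=c)
                  for p, m, c in zip(phi, mu, cov))
    return -np.log(density)
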
def logp(self, dist, X: np.ndarray, return_per_feature: bool = False, batch_size: int = int(1e10)) \
        -> np.ndarray:
    """
    Compute log probability of a batch of instances under the generative model.

    Parameters
    ----------
    dist
        Distribution of the model.
    X
        Batch of instances.
    return_per_feature
        Return log probability per feature.
    batch_size
        Batch size for the generative model evaluations.

    Returns
    -------
    Log probabilities.
    """
    logp_fn = partial(dist.log_prob, return_per_feature=return_per_feature)
    return predict_batch(X, logp_fn, batch_size=batch_size)

def preprocess_drift(x: np.ndarray, model: tf.keras.Model, tokenizer=None, max_len: int = None,
                     batch_size: int = int(1e10), dtype: type = np.float32) -> Union[np.ndarray, tf.Tensor]:
    """
    Prediction function used for the preprocessing step of a drift detector.

    Parameters
    ----------
    x
        Batch of instances.
    model
        Model used for preprocessing.
    tokenizer
        Optional tokenizer for text drift.
    max_len
        Optional max token length for text drift.
    batch_size
        Batch size.
    dtype
        Model output type, e.g. np.float32 or tf.float32.

    Returns
    -------
    Numpy array or tensor with predictions.
    """
    if tokenizer is None:
        return predict_batch(x, model, batch_size=batch_size, dtype=dtype)
    else:
        return predict_batch_transformer(x, model, tokenizer, max_len, batch_size=batch_size, dtype=dtype)

def logp(self, dist, X: np.ndarray, return_per_feature: bool = False, batch_size: int = int(1e10)) \
        -> np.ndarray:
    """
    Compute log probability of a batch of instances under the generative model.

    Parameters
    ----------
    dist
        Distribution of the model.
    X
        Batch of instances.
    return_per_feature
        Return log probability per feature.
    batch_size
        Batch size for the generative model evaluations.

    Returns
    -------
    Log probabilities.
    """
    logp_fn = partial(dist.log_prob, return_per_feature=return_per_feature)
    # TODO: TBD: can this be any of the other types from predict_batch? i.e. tf.Tensor or tuple
    return predict_batch(X, logp_fn, batch_size=batch_size)  # type: ignore[return-value]

def fit(self,
        X: np.ndarray,
        mutate_fn: Callable = mutate_categorical,
        mutate_fn_kwargs: dict = {'rate': .2, 'seed': 0, 'feature_range': (0, 255)},
        mutate_batch_size: int = int(1e10),
        loss_fn: tf.keras.losses = None,
        loss_fn_kwargs: dict = None,
        optimizer: tf.keras.optimizers = tf.keras.optimizers.Adam(learning_rate=1e-3),
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
        log_metric: Tuple[str, "tf.keras.metrics"] = None,
        callbacks: tf.keras.callbacks = None) -> None:
    """
    Train semantic and background generative models.

    Parameters
    ----------
    X
        Training batch.
    mutate_fn
        Mutation function used to generate the background dataset.
    mutate_fn_kwargs
        Kwargs for the mutation function used to generate the background dataset.
        Default values set for an image dataset.
    mutate_batch_size
        Batch size used to generate the mutations for the background dataset.
    loss_fn
        Loss function used for training.
    loss_fn_kwargs
        Kwargs for loss function.
    optimizer
        Optimizer used for training.
    epochs
        Number of training epochs.
    batch_size
        Batch size used for training.
    verbose
        Whether to print training progress.
    log_metric
        Additional metrics whose progress will be displayed if verbose equals True.
    callbacks
        Callbacks used during training.
    """
    input_shape = X.shape[1:]

    # training arguments
    kwargs = {'epochs': epochs, 'batch_size': batch_size, 'verbose': verbose, 'callbacks': callbacks}

    # create background data
    mutate_fn = partial(mutate_fn, **mutate_fn_kwargs)
    X_back = predict_batch(X, mutate_fn, batch_size=mutate_batch_size, dtype=X.dtype)

    # prepare sequential data
    if self.sequential and not self.has_log_prob:
        y, y_back = X[:, 1:], X_back[:, 1:]  # type: ignore
        X, X_back = X[:, :-1], X_back[:, :-1]  # type: ignore
    else:
        y, y_back = None, None

    # check if model needs to be built
    use_build = self.has_log_prob and not isinstance(self.dist_s, tf.keras.Model)

    if use_build:
        # build and train semantic model
        self.model_s = build_model(self.dist_s, input_shape)[0]
        self.model_s.compile(optimizer=optimizer)
        self.model_s.fit(X, **kwargs)
        # build and train background model
        self.model_b = build_model(self.dist_b, input_shape)[0]
        self.model_b.compile(optimizer=optimizer)
        self.model_b.fit(X_back, **kwargs)
    else:
        # update training arguments
        kwargs.update({'optimizer': optimizer, 'loss_fn_kwargs': loss_fn_kwargs, 'log_metric': log_metric})

        # train semantic model
        args = [self.dist_s, loss_fn, X]
        kwargs.update({'y_train': y})
        trainer(*args, **kwargs)

        # train background model
        args = [self.dist_b, loss_fn, X_back]
        kwargs.update({'y_train': y_back})
        trainer(*args, **kwargs)

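# Hedged sketch of what a mutation function such as `mutate_categorical` does:
# resample a fraction `rate` of the entries uniformly within `feature_range`,
# yielding the semantically corrupted "background" dataset required by the
# likelihood-ratio method. Illustrative only; the library's implementation may
# differ (e.g. in how the seed and categorical levels are handled).
import numpy as np

def _mutate_sketch(x: np.ndarray, rate: float = .2, seed: int = 0,
                   feature_range: tuple = (0, 255)) -> np.ndarray:
    rng = np.random.default_rng(seed)
    mask = rng.random(x.shape) < rate  # entries selected for mutation
    noise = rng.integers(feature_range[0], feature_range[1] + 1, size=x.shape)
    return np.where(mask, noise, x).astype(x.dtype)
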
def __init__(self,
             x_ref: np.ndarray,
             p_val: float = .05,
             preprocess_fn: Optional[Callable] = None,
             kernel: Optional[tf.keras.Model] = None,
             n_diffs: int = 1,
             initial_diffs: Optional[np.ndarray] = None,
             l1_reg: float = 0.01,
             binarize_preds: bool = False,
             train_size: Optional[float] = .75,
             n_folds: Optional[int] = None,
             retrain_from_scratch: bool = True,
             seed: int = 0,
             optimizer: tf.keras.optimizers = tf.keras.optimizers.Adam,
             learning_rate: float = 1e-3,
             batch_size: int = 32,
             preprocess_batch_fn: Optional[Callable] = None,
             epochs: int = 3,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
             dataset: Callable = TFDataset,
             data_type: Optional[str] = None) -> None:
    """
    Classifier-based drift detector with a classifier of form
    y = a + b_1*k(x,w_1) + ... + b_J*k(x,w_J),
    where k is a kernel and w_1,...,w_J are learnable test locations. If drift has occurred
    the test locations learn to be more/less (given by sign of b_i) similar to test instances
    than reference instances. The test locations are regularised to be close to the average
    reference instance such that the **difference** is then interpretable as the transformation
    required for each feature to make the average instance more/less like a test instance than
    a reference instance.

    The classifier is trained on a fraction of the combined reference and test data and drift
    is detected on the remaining data. To use all the data to detect drift, a stratified
    cross-validation scheme can be chosen.

    Parameters
    ----------
    x_ref
        Data used as reference distribution.
    p_val
        p-value used for the significance of the test.
    preprocess_fn
        Function to preprocess the data before computing the data drift metrics.
    kernel
        Differentiable TensorFlow model used to define similarity between instances,
        defaults to Gaussian RBF.
    n_diffs
        The number of test locations to use, each corresponding to an interpretable difference.
    initial_diffs
        Array used to initialise the diffs that will be learned. Defaults to Gaussian
        for each feature with equal variance to that of reference data.
    l1_reg
        Strength of l1 regularisation to apply to the differences.
    binarize_preds
        Whether to test for discrepancy on soft (e.g. probs/logits) model predictions directly
        with a K-S test or binarise to 0-1 prediction errors and apply a binomial test.
    train_size
        Optional fraction (float between 0 and 1) of the dataset used to train the classifier.
        The drift is detected on `1 - train_size`. Cannot be used in combination with `n_folds`.
    n_folds
        Optional number of stratified folds used for training. The model preds are then
        calculated on all the out-of-fold instances. This allows leveraging all the reference
        and test data for drift detection at the expense of longer computation. If both
        `train_size` and `n_folds` are specified, `n_folds` is prioritized.
    retrain_from_scratch
        Whether the classifier should be retrained from scratch for each set of test data
        or whether it should instead continue training from where it left off on the previous set.
    seed
        Optional random seed for fold selection.
    optimizer
        Optimizer used during training of the classifier.
    learning_rate
        Learning rate used by optimizer.
    batch_size
        Batch size used during training of the classifier.
    preprocess_batch_fn
        Optional batch preprocessing function. For example to convert a list of objects
        to a batch which can be processed by the model.
    epochs
        Number of training epochs for the classifier for each (optional) fold.
    verbose
        Verbosity level during the training of the classifier. 0 is silent, 1 a progress bar.
    train_kwargs
        Optional additional kwargs when fitting the classifier.
    dataset
        Dataset object used during training.
    data_type
        Optionally specify the data type (tabular, image or time-series). Added to metadata.
    """
    if preprocess_fn is not None and preprocess_batch_fn is not None:
        raise ValueError("SpotTheDiffDrift detector only supports preprocess_fn or preprocess_batch_fn, not both.")
    if n_folds is not None and n_folds > 1:
        logger.warning("When using multiple folds the returned diffs will correspond to the final fold only.")

    if preprocess_fn is not None:
        x_ref_proc = preprocess_fn(x_ref)
    elif preprocess_batch_fn is not None:
        x_ref_proc = predict_batch(
            x_ref, lambda x: x, preprocess_fn=preprocess_batch_fn, batch_size=batch_size
        )
    else:
        x_ref_proc = x_ref

    if kernel is None:
        kernel = GaussianRBF(trainable=True)
    if initial_diffs is None:
        initial_diffs = np.random.normal(size=(n_diffs,) + x_ref_proc.shape[1:]) * x_ref_proc.std(0)
    else:
        if len(initial_diffs) != n_diffs:
            raise ValueError("Should have initial_diffs.shape[0] == n_diffs")

    model = SpotTheDiffDriftTF.InterpretableClf(kernel, x_ref_proc, initial_diffs)
    reg_loss_fn = (lambda model: tf.reduce_mean(tf.abs(model.diffs)) * l1_reg)

    self._detector = ClassifierDriftTF(
        x_ref=x_ref,
        model=model,
        p_val=p_val,
        preprocess_x_ref=True,
        update_x_ref=None,
        preprocess_fn=preprocess_fn,
        preds_type='logits',
        binarize_preds=binarize_preds,
        reg_loss_fn=reg_loss_fn,
        train_size=train_size,
        n_folds=n_folds,
        retrain_from_scratch=retrain_from_scratch,
        seed=seed,
        optimizer=optimizer,
        learning_rate=learning_rate,
        batch_size=batch_size,
        preprocess_batch_fn=preprocess_batch_fn,
        epochs=epochs,
        verbose=verbose,
        train_kwargs=train_kwargs,
        dataset=dataset,
        data_type=data_type
    )
    self.meta = self._detector.meta
    self.meta['params']['name'] = 'SpotTheDiffDrift'
    self.meta['params']['n_diffs'] = n_diffs
    self.meta['params']['l1_reg'] = l1_reg
    self.meta['params']['initial_diffs'] = initial_diffs

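# Usage sketch (illustrative): the public wrapper is typically instantiated via
# `alibi_detect.cd.SpotTheDiffDrift` with `backend='tensorflow'`, which routes
# to the backend class above; the learned, interpretable diffs come back in the
# prediction data. The random arrays below are hypothetical stand-ins.
import numpy as np
from alibi_detect.cd import SpotTheDiffDrift

x_ref = np.random.randn(500, 10).astype(np.float32)
cd = SpotTheDiffDrift(x_ref, backend='tensorflow', p_val=.05, n_diffs=1, l1_reg=1e-3)
preds = cd.predict(np.random.randn(500, 10).astype(np.float32))
is_drift, diffs = preds['data']['is_drift'], preds['data']['diffs']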