def __init__(
        self,
        x_ref: Union[np.ndarray, list],
        ert: float,
        window_size: int,
        preprocess_fn: Optional[Callable] = None,
        n_bootstraps: int = 1000,
        verbose: bool = True,
        input_shape: Optional[tuple] = None,
        data_type: Optional[str] = None,
) -> None:
    """
    Base class for online drift detectors.

    Parameters
    ----------
    x_ref
        Data used as reference distribution.
    ert
        The expected run-time (ERT) in the absence of drift.
    window_size
        The size of the sliding test-window used to compute the test-statistic.
        Smaller windows focus on responding quickly to severe drift, larger windows
        focus on the ability to detect slight drift.
    preprocess_fn
        Function to preprocess the data before computing the data drift metrics.
    n_bootstraps
        The number of bootstrap simulations used to configure the thresholds. The larger
        this is, the more accurately the desired ERT will be targeted. Should ideally be
        at least an order of magnitude larger than the ERT.
    verbose
        Whether or not to print progress during configuration.
    input_shape
        Shape of input data.
    data_type
        Optionally specify the data type (tabular, image or time-series). Added to metadata.
    """
    super().__init__()

    if ert is None:
        logger.warning(
            'No expected run-time set for the drift threshold. Need to set it to detect data drift.'
        )
    self.ert = ert
    self.fpr = 1 / ert
    self.window_size = window_size

    # Preprocess reference data
    if isinstance(preprocess_fn, Callable):  # type: ignore
        self.x_ref = preprocess_fn(x_ref)
    else:
        self.x_ref = x_ref

    # Other attributes
    self.preprocess_fn = preprocess_fn
    self.n = len(x_ref)  # type: ignore
    self.n_bootstraps = n_bootstraps  # nb of samples used to estimate thresholds
    self.verbose = verbose

    # store input shape for save and load functionality
    self.input_shape = get_input_shape(input_shape, x_ref)

    # set metadata
    self.meta['detector_type'] = 'online'
    self.meta['data_type'] = data_type
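
# --- Usage sketch (illustrative): this base class is not used directly. The
# example assumes a concrete subclass such as alibi-detect's MMDDriftOnline;
# the import path and `predict` return format are assumptions.
import numpy as np
from alibi_detect.cd import MMDDriftOnline  # assumed concrete subclass

x_ref = np.random.randn(500, 10)  # reference data: 500 instances, 10 features

# ert=200 targets on average one false alarm per 200 time steps in the absence
# of drift, since the detector sets fpr = 1 / ert; n_bootstraps is kept at
# least an order of magnitude larger than the ERT
cd = MMDDriftOnline(x_ref, ert=200., window_size=20, n_bootstraps=5000)

# online detectors consume one instance per time step
for x_t in np.random.randn(1000, 10):
    pred = cd.predict(x_t)
    if pred['data']['is_drift']:  # assumed return format
        break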
def __init__(
        self,
        x_ref: Union[np.ndarray, list],
        p_val: float = .05,
        preprocess_x_ref: bool = True,
        update_x_ref: Optional[Dict[str, int]] = None,
        preprocess_fn: Optional[Callable] = None,
        correction: str = 'bonferroni',
        n_features: Optional[int] = None,
        input_shape: Optional[tuple] = None,
        data_type: Optional[str] = None
) -> None:
    """
    Generic drift detector component which serves as a base class for methods using
    univariate tests with multivariate correction.

    Parameters
    ----------
    x_ref
        Data used as reference distribution.
    p_val
        p-value used for significance of the statistical test for each feature.
        If the FDR correction method is used, this corresponds to the acceptable q-value.
    preprocess_x_ref
        Whether to already preprocess and store the reference data.
    update_x_ref
        Reference data can optionally be updated to the last n instances seen by the detector
        or via reservoir sampling with size n. For the former, the parameter equals {'last': n}
        while for reservoir sampling {'reservoir_sampling': n} is passed.
    preprocess_fn
        Function to preprocess the data before computing the data drift metrics.
        Typically a dimensionality reduction technique.
    correction
        Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False Discovery Rate).
    n_features
        Number of features used in the statistical test. No need to pass it if no preprocessing
        takes place. In case of a preprocessing step, this can also be inferred automatically
        but could be more expensive to compute.
    input_shape
        Shape of input data. Needs to be provided for text data.
    data_type
        Optionally specify the data type (tabular, image or time-series). Added to metadata.
    """
    super().__init__()

    if p_val is None:
        logger.warning('No p-value set for the drift threshold. Need to set it to detect data drift.')

    # optionally already preprocess reference data
    self.p_val = p_val
    if preprocess_x_ref and isinstance(preprocess_fn, Callable):  # type: ignore
        self.x_ref = preprocess_fn(x_ref)
    else:
        self.x_ref = x_ref
    self.preprocess_x_ref = preprocess_x_ref
    self.update_x_ref = update_x_ref
    self.preprocess_fn = preprocess_fn
    self.correction = correction
    self.n = len(x_ref)  # type: ignore

    # store input shape for save and load functionality
    self.input_shape = get_input_shape(input_shape, x_ref)

    # compute number of features for the univariate tests
    if isinstance(n_features, int):
        self.n_features = n_features
    elif not isinstance(preprocess_fn, Callable) or preprocess_x_ref:
        # infer features from preprocessed reference data
        self.n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
    else:
        # infer number of features after applying preprocessing step
        x = self.preprocess_fn(x_ref[0:1])
        self.n_features = x.reshape(x.shape[0], -1).shape[-1]

    if correction not in ['bonferroni', 'fdr'] and self.n_features > 1:
        raise ValueError('Only `bonferroni` and `fdr` are acceptable for multivariate correction.')

    # set metadata
    self.meta['detector_type'] = 'offline'  # offline refers to fitting the CDF for K-S
    self.meta['data_type'] = data_type
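
# --- Illustration (hypothetical helper, not the detector's internal code):
# how the two multivariate corrections decide drift from per-feature p-values.
import numpy as np

def aggregate_p_vals(p_vals: np.ndarray, p_val: float = .05, correction: str = 'bonferroni') -> bool:
    """Return whether drift is flagged given one p-value per feature."""
    n_features = len(p_vals)
    if correction == 'bonferroni':
        # every feature is tested at the stricter threshold p_val / n_features
        return bool((p_vals < p_val / n_features).any())
    elif correction == 'fdr':
        # Benjamini-Hochberg: compare sorted p-values against an increasing threshold
        thresholds = p_val * np.arange(1, n_features + 1) / n_features
        return bool((np.sort(p_vals) <= thresholds).any())
    raise ValueError('Only `bonferroni` and `fdr` are acceptable for multivariate correction.')

aggregate_p_vals(np.array([.001, .2, .8]))  # True: .001 < .05 / 3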
def __init__(
        self,
        x_ref: Union[np.ndarray, list],
        p_val: float = .05,
        preprocess_x_ref: bool = True,
        update_x_ref: Optional[Dict[str, int]] = None,
        preprocess_fn: Optional[Callable] = None,
        sigma: Optional[np.ndarray] = None,
        n_permutations: int = 100,
        n_kernel_centers: Optional[int] = None,
        lambda_rd_max: float = 0.2,
        input_shape: Optional[tuple] = None,
        data_type: Optional[str] = None
) -> None:
    """
    Least-squares Density Difference (LSDD) base data drift detector using a permutation test.

    Parameters
    ----------
    x_ref
        Data used as reference distribution.
    p_val
        p-value used for the significance of the permutation test.
    preprocess_x_ref
        Whether to already preprocess and store the reference data.
    update_x_ref
        Reference data can optionally be updated to the last n instances seen by the detector
        or via reservoir sampling with size n. For the former, the parameter equals {'last': n}
        while for reservoir sampling {'reservoir_sampling': n} is passed.
    preprocess_fn
        Function to preprocess the data before computing the data drift metrics.
    sigma
        Optionally set the bandwidth of the Gaussian kernel used in estimating the LSDD.
        Can also pass multiple bandwidth values as an array. The kernel evaluation is then
        averaged over those bandwidths. If `sigma` is not specified, the 'median heuristic'
        is adopted whereby `sigma` is set as the median pairwise distance between reference samples.
    n_permutations
        Number of permutations used in the permutation test.
    n_kernel_centers
        The number of reference samples to use as centers in the Gaussian kernel model
        used to estimate LSDD. Defaults to 1/20th of the reference data.
    lambda_rd_max
        The maximum relative difference between two estimates of LSDD that the regularization
        parameter lambda is allowed to cause. Defaults to 0.2 as in the paper.
    input_shape
        Shape of input data.
    data_type
        Optionally specify the data type (tabular, image or time-series). Added to metadata.
    """
    super().__init__()

    if p_val is None:
        logger.warning('No p-value set for the drift threshold. Need to set it to detect data drift.')

    # optionally already preprocess reference data
    self.p_val = p_val
    if preprocess_x_ref and isinstance(preprocess_fn, Callable):  # type: ignore
        self.x_ref = preprocess_fn(x_ref)
    else:
        self.x_ref = x_ref
    self.sigma = sigma
    self.preprocess_x_ref = preprocess_x_ref
    self.update_x_ref = update_x_ref
    self.preprocess_fn = preprocess_fn
    self.n = len(x_ref)  # type: ignore
    self.n_permutations = n_permutations  # nb of iterations through permutation test
    self.n_kernel_centers = n_kernel_centers or max(self.n // 20, 1)
    self.lambda_rd_max = lambda_rd_max

    # store input shape for save and load functionality
    self.input_shape = get_input_shape(input_shape, x_ref)

    # set metadata
    self.meta.update({'detector_type': 'offline', 'data_type': data_type})
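
# --- Illustration (hypothetical helper): the 'median heuristic' adopted when
# `sigma` is not specified. The library's exact variant may differ slightly
# (e.g. operating on squared distances), so treat this as a sketch.
import numpy as np

def median_heuristic_sigma(x_ref: np.ndarray) -> float:
    """Set the bandwidth to the median pairwise distance between reference samples."""
    x = np.asarray(x_ref).reshape(len(x_ref), -1)
    d2 = ((x[:, None, :] - x[None, :, :]) ** 2).sum(-1)  # pairwise squared distances
    i, j = np.triu_indices(len(x), k=1)  # distinct pairs only
    return float(np.median(np.sqrt(d2[i, j])))

sigma = median_heuristic_sigma(np.random.randn(200, 5))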
def __init__(
        self,
        x_ref: Union[np.ndarray, list],
        p_val: float = .05,
        preprocess_x_ref: bool = True,
        update_x_ref: Optional[Dict[str, int]] = None,
        preprocess_fn: Optional[Callable] = None,
        sigma: Optional[np.ndarray] = None,
        configure_kernel_from_x_ref: bool = True,
        n_permutations: int = 100,
        input_shape: Optional[tuple] = None,
        data_type: Optional[str] = None
) -> None:
    """
    Maximum Mean Discrepancy (MMD) base data drift detector using a permutation test.

    Parameters
    ----------
    x_ref
        Data used as reference distribution.
    p_val
        p-value used for the significance of the permutation test.
    preprocess_x_ref
        Whether to already preprocess and store the reference data.
    update_x_ref
        Reference data can optionally be updated to the last n instances seen by the detector
        or via reservoir sampling with size n. For the former, the parameter equals {'last': n}
        while for reservoir sampling {'reservoir_sampling': n} is passed.
    preprocess_fn
        Function to preprocess the data before computing the data drift metrics.
    sigma
        Optionally set the Gaussian RBF kernel bandwidth. Can also pass multiple bandwidth
        values as an array. The kernel evaluation is then averaged over those bandwidths.
    configure_kernel_from_x_ref
        Whether to already configure the kernel bandwidth from the reference data.
    n_permutations
        Number of permutations used in the permutation test.
    input_shape
        Shape of input data.
    data_type
        Optionally specify the data type (tabular, image or time-series). Added to metadata.
    """
    super().__init__()

    if p_val is None:
        logger.warning('No p-value set for the drift threshold. Need to set it to detect data drift.')

    self.infer_sigma = configure_kernel_from_x_ref
    if configure_kernel_from_x_ref and isinstance(sigma, np.ndarray):
        self.infer_sigma = False
        logger.warning('`sigma` is specified for the kernel and `configure_kernel_from_x_ref` '
                       'is set to True. `sigma` argument takes priority over '
                       '`configure_kernel_from_x_ref` (set to False).')

    # optionally already preprocess reference data
    self.p_val = p_val
    if preprocess_x_ref and isinstance(preprocess_fn, Callable):  # type: ignore
        self.x_ref = preprocess_fn(x_ref)
    else:
        self.x_ref = x_ref
    self.preprocess_x_ref = preprocess_x_ref
    self.update_x_ref = update_x_ref
    self.preprocess_fn = preprocess_fn
    self.n = len(x_ref)  # type: ignore
    self.n_permutations = n_permutations  # nb of iterations through permutation test

    # store input shape for save and load functionality
    self.input_shape = get_input_shape(input_shape, x_ref)

    # set metadata
    self.meta.update({'detector_type': 'offline', 'data_type': data_type})
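
# --- Illustration (hypothetical helper): how a permutation test turns the MMD
# statistic into a p-value. A biased MMD^2 estimate with a Gaussian RBF kernel
# is used here for brevity; the detector's estimator may differ.
import numpy as np

def rbf_gram(x: np.ndarray, y: np.ndarray, sigma: float) -> np.ndarray:
    d2 = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * sigma ** 2))

def mmd2(x: np.ndarray, y: np.ndarray, sigma: float) -> float:
    return (rbf_gram(x, x, sigma).mean() + rbf_gram(y, y, sigma).mean()
            - 2 * rbf_gram(x, y, sigma).mean())

def permutation_p_val(x: np.ndarray, y: np.ndarray, sigma: float = 1.,
                      n_permutations: int = 100) -> float:
    stat = mmd2(x, y, sigma)
    z, n = np.concatenate([x, y]), len(x)
    perm_stats = []
    for _ in range(n_permutations):
        idx = np.random.permutation(len(z))  # reshuffle the pooled sample under the no-drift null
        perm_stats.append(mmd2(z[idx[:n]], z[idx[n:]], sigma))
    # p-value: fraction of permuted statistics at least as extreme as the observed one
    return float((np.array(perm_stats) >= stat).mean())

permutation_p_val(np.random.randn(100, 2), np.random.randn(100, 2) + 1.)  # ~0.0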
def __init__(
        self,
        x_ref: Union[np.ndarray, list],
        c_ref: np.ndarray,
        p_val: float = .05,
        preprocess_x_ref: bool = True,
        update_ref: Optional[Dict[str, int]] = None,
        preprocess_fn: Optional[Callable] = None,
        x_kernel: Optional[Callable] = None,
        c_kernel: Optional[Callable] = None,
        n_permutations: int = 1000,
        prop_c_held: float = 0.25,
        n_folds: int = 5,
        batch_size: Optional[int] = 256,
        input_shape: Optional[tuple] = None,
        data_type: Optional[str] = None,
        verbose: bool = False
) -> None:
    """
    Maximum Mean Discrepancy (MMD) based context aware drift detector.

    Parameters
    ----------
    x_ref
        Data used as reference distribution.
    c_ref
        Context for the reference distribution.
    p_val
        p-value used for the significance of the permutation test.
    preprocess_x_ref
        Whether to already preprocess and store the reference data `x_ref`.
    update_ref
        Reference data can optionally be updated to the last N instances seen by the detector.
        The parameter should be passed as a dictionary *{'last': N}*.
    preprocess_fn
        Function to preprocess the data before computing the data drift metrics.
    x_kernel
        Kernel defined on the input data, defaults to Gaussian RBF kernel.
    c_kernel
        Kernel defined on the context data, defaults to Gaussian RBF kernel.
    n_permutations
        Number of permutations used in the permutation test.
    prop_c_held
        Proportion of contexts held out to condition on.
    n_folds
        Number of cross-validation folds used when tuning the regularisation parameters.
    batch_size
        If not None, then compute batches of MMDs at a time (rather than all at once).
    input_shape
        Shape of input data.
    data_type
        Optionally specify the data type (tabular, image or time-series). Added to metadata.
    verbose
        Whether or not to print progress during configuration.
    """
    super().__init__()

    if p_val is None:
        logger.warning('No p-value set for the drift threshold. Need to set it to detect data drift.')

    # optionally already preprocess reference data
    self.p_val = p_val
    if preprocess_x_ref and isinstance(preprocess_fn, Callable):  # type: ignore[arg-type]
        self.x_ref = preprocess_fn(x_ref)
    else:
        self.x_ref = x_ref
    self.preprocess_x_ref = preprocess_x_ref
    self.preprocess_fn = preprocess_fn
    self.n = len(x_ref)
    self.n_permutations = n_permutations  # nb of iterations through permutation test
    self.x_kernel = x_kernel
    self.c_kernel = c_kernel

    if len(c_ref) == self.n:
        self.c_ref = c_ref
    else:
        raise ValueError('x_ref and c_ref should contain the same number of instances.')

    # store input shape for save and load functionality
    self.input_shape = get_input_shape(input_shape, x_ref)

    # Regularisation parameter tuning settings
    if n_folds > 1:
        self.n_folds = n_folds
    else:
        raise ValueError('The `n_folds` parameter must be > 1.')
    self.lams = None

    # Update ref attribute. Disallow reservoir sampling.
    self.update_ref = update_ref
    if update_ref is not None:
        if 'reservoir_sampling' in update_ref.keys():
            raise ValueError("The BaseContextMMDDrift detector doesn't currently support the "
                             "`reservoir_sampling` option in `update_ref`.")

    # Other attributes
    self.prop_c_held = prop_c_held
    self.batch_size = batch_size
    self.verbose = verbose

    # set metadata
    self.meta.update({'detector_type': 'offline', 'data_type': data_type})
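
# --- Usage sketch (illustrative): assumes the concrete ContextMMDDrift
# detector built on this base, as in alibi-detect; the import path and the
# `predict` signature/return format are assumptions.
import numpy as np
from alibi_detect.cd import ContextMMDDrift  # assumed concrete subclass

x_ref = np.random.randn(500, 10)
c_ref = np.random.rand(500, 1)  # context, e.g. time of day; same length as x_ref

# reservoir sampling is not supported here, but {'last': N} is
cd = ContextMMDDrift(x_ref, c_ref, p_val=.05, update_ref={'last': 500})

# test instances are scored together with their contexts
x, c = np.random.randn(200, 10), np.random.rand(200, 1)
pred = cd.predict(x, c)
pred['data']['is_drift']  # assumed return format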
def __init__(
        self,
        x_ref: Union[np.ndarray, list],
        ert: float,
        window_sizes: List[int],
        preprocess_fn: Optional[Callable] = None,
        n_bootstraps: int = 1000,
        n_features: Optional[int] = None,
        verbose: bool = True,
        input_shape: Optional[tuple] = None,
        data_type: Optional[str] = None,
) -> None:
    """
    Base class for univariate online drift detectors. If n_features > 1, a multivariate
    correction is used to aggregate p-values during threshold configuration, thus allowing
    the requested expected run-time (ERT) to be targeted. The multivariate correction assumes
    independence between the features.

    Parameters
    ----------
    x_ref
        Data used as reference distribution.
    ert
        The expected run-time (ERT) in the absence of drift. For the univariate detectors,
        the ERT is defined as the expected run-time after the smallest window is full,
        i.e. the run-time from t=min(window_sizes)-1.
    window_sizes
        The sizes of the sliding test-windows used to compute the test-statistic.
        Smaller windows focus on responding quickly to severe drift, larger windows
        focus on the ability to detect slight drift.
    preprocess_fn
        Function to preprocess the data before computing the data drift metrics.
    n_bootstraps
        The number of bootstrap simulations used to configure the thresholds. The larger
        this is, the more accurately the desired ERT will be targeted. Should ideally be
        at least an order of magnitude larger than the ERT.
    n_features
        Number of features used in the statistical test. No need to pass it if no preprocessing
        takes place. In case of a preprocessing step, this can also be inferred automatically
        but could be more expensive to compute.
    verbose
        Whether or not to print progress during configuration.
    input_shape
        Shape of input data.
    data_type
        Optionally specify the data type (tabular, image or time-series). Added to metadata.
    """
    super().__init__()

    if ert is None:
        logger.warning(
            'No expected run-time set for the drift threshold. Need to set it to detect data drift.'
        )
    self.ert = ert
    self.fpr = 1 / ert

    # Window sizes
    self.window_sizes = window_sizes
    self.max_ws = np.max(self.window_sizes)
    self.min_ws = np.min(self.window_sizes)

    # Preprocess reference data
    if isinstance(preprocess_fn, Callable):  # type: ignore[arg-type]
        self.x_ref = preprocess_fn(x_ref)
    else:
        self.x_ref = x_ref
    # Check the (optionally preprocessed) x_ref data is a 2D ndarray
    self.x_ref = self._check_x(self.x_ref, x_ref=True)

    # Other attributes
    self.preprocess_fn = preprocess_fn
    self.n = len(x_ref)
    self.n_bootstraps = n_bootstraps  # nb of samples used to estimate thresholds
    self.verbose = verbose

    # compute number of features for the univariate tests
    if isinstance(n_features, int):
        self.n_features = n_features
    elif not isinstance(preprocess_fn, Callable):
        # infer features from preprocessed reference data
        self.n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
    else:
        # infer number of features after applying preprocessing step
        x = self.preprocess_fn(x_ref[0:1])
        self.n_features = x.reshape(x.shape[0], -1).shape[-1]

    # store input shape for save and load functionality
    self.input_shape = get_input_shape(input_shape, x_ref)

    # set metadata
    self.meta['detector_type'] = 'online'
    self.meta['data_type'] = data_type
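
# --- Usage sketch (illustrative): assumes a concrete univariate subclass such
# as alibi-detect's CVMDriftOnline; the import path and `predict` return
# format are assumptions.
import numpy as np
from alibi_detect.cd import CVMDriftOnline  # assumed concrete subclass

x_ref = np.random.randn(1000, 3)

# the small window reacts quickly to severe drift, the large one picks up
# slight drift; with n_features=3 the per-feature p-values are aggregated via
# the multivariate correction so the requested ERT is still targeted
cd = CVMDriftOnline(x_ref, ert=500., window_sizes=[10, 100], n_bootstraps=5000)

for x_t in np.random.randn(2000, 3):
    pred = cd.predict(x_t)
    if pred['data']['is_drift']:  # assumed return format
        break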