Esempio n. 1
0
    def predict(self, X: Union[np.ndarray, list], drift_type: str = 'batch',
                return_p_val: bool = True, return_distance: bool = True) \
            -> Dict[str, Dict[str, Union[str, np.ndarray, int, float]]]:
        """
        Predict whether a batch of data has drifted from the reference data.

        Besides returning the prediction, this call updates the detector state:
        the reference data `self.X_ref` is refreshed via `update_reference` and
        the instance counter `self.n` is increased by the batch size.

        Parameters
        ----------
        X
            Batch of instances.
        drift_type
            Predict drift at the 'feature' or 'batch' level. For 'batch', the feature level
            p-values are tested against a threshold obtained with the Bonferroni or
            False Discovery Rate correction, depending on `self.correction`.
        return_p_val
            Whether to return feature level p-values together with the threshold they
            were compared against.
        return_distance
            Whether to return the K-S statistic between the features of the new batch and reference data.

        Returns
        -------
        Dictionary containing 'meta' and 'data' dictionaries.
        'meta' has the model's metadata.
        'data' contains the drift prediction under 'is_drift' and optionally the feature level
        p-values ('p_val'), the threshold used ('threshold', after multivariate correction
        for batch level predictions) and the K-S statistics ('distance').

        Raises
        ------
        ValueError
            If `drift_type` is neither 'feature' nor 'batch', or if batch level drift is
            requested while `self.correction` is neither 'bonferroni' nor 'fdr'.
        """
        # compute drift scores
        p_vals, dist = self.score(X)

        # values below p-value threshold are drift
        if drift_type == 'feature':
            # no multivariate correction: each feature is tested on its own
            threshold = self.p_val
            drift_pred = (p_vals < threshold).astype(int)
        elif drift_type == 'batch':
            if self.correction == 'bonferroni':
                threshold = self.p_val / self.n_features
                # a single feature below the corrected threshold flags the batch
                drift_pred = int((p_vals < threshold).any())
            elif self.correction == 'fdr':
                drift_pred, threshold = fdr(p_vals, q_val=self.p_val)
            else:
                # report the setting that is actually wrong instead of blaming `drift_type`
                raise ValueError(
                    '`correction` needs to be either `bonferroni` or `fdr`.')
        else:
            raise ValueError(
                '`drift_type` needs to be either `feature` or `batch`.')

        # update reference dataset
        # NOTE(review): X is preprocessed first presumably because the stored reference
        # data is already preprocessed when `preprocess_X_ref` is set - confirm.
        if (isinstance(self.update_X_ref, dict)
                and self.preprocess_fn is not None and self.preprocess_X_ref):
            X = self.preprocess_fn(X)
        self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
        # used for reservoir sampling
        self.n += X.shape[0]  # type: ignore

        # populate drift dict
        cd = concept_drift_dict()
        cd['meta'] = self.meta
        cd['data']['is_drift'] = drift_pred
        if return_p_val:
            cd['data']['p_val'] = p_vals
            cd['data']['threshold'] = threshold
        if return_distance:
            cd['data']['distance'] = dist
        return cd
Esempio n. 2
0
def test_fdr(fdr_params):
    """Check that `fdr` reproduces the expected below-threshold outcome.

    `fdr_params` unpacks into the q-value and a mapping holding the p-values
    under 'p_val' and the expected outcome under 'is_below'.
    """
    q_val, case = fdr_params
    expect_below = bool(case['is_below'])
    raw_p_vals = case['p_val']

    if raw_p_vals.max() == 0:
        # largest p-value is exactly 0: shift all values to land just below
        # q_val when a positive outcome is expected, or onto q_val otherwise
        if expect_below:
            p_val = raw_p_vals + q_val - 1e-5
        else:
            p_val = raw_p_vals + q_val
    else:
        # use the provided p-values as they are, on a private copy
        p_val = raw_p_vals.copy()

    outcome = fdr(p_val, q_val)
    assert outcome == case['is_below']
Esempio n. 3
0
def test_fdr(fdr_params):
    """Check the outcome and the threshold type returned by `fdr`.

    `fdr_params` unpacks into the q-value and a mapping holding the p-values
    under 'p_val' and the expected outcome under 'is_below'.
    """
    q_val, case = fdr_params
    expect_below = bool(case['is_below'])
    raw_p_vals = case['p_val']

    if raw_p_vals.max() == 0:
        # largest p-value is exactly 0: shift all values to land just below
        # q_val when a positive outcome is expected, or onto q_val otherwise
        if expect_below:
            p_val = raw_p_vals + q_val - 1e-5
        else:
            p_val = raw_p_vals + q_val
    else:
        # use the provided p-values as they are, on a private copy
        p_val = raw_p_vals.copy()

    outcome, thresholds = fdr(p_val, q_val)
    assert outcome == case['is_below']
    # the second return value must be an array or a plain float
    assert isinstance(thresholds, (np.ndarray, float))