Esempio n. 1
0
    def predict(self, X: Union[np.ndarray, list], drift_type: str = 'batch',
                return_p_val: bool = True, return_distance: bool = True) \
            -> Dict[Dict[str, str], Dict[str, Union[np.ndarray, int, float]]]:
        """
        Predict whether a batch of data has drifted from the reference data.

        Parameters
        ----------
        X
            Batch of instances.
        drift_type
            Predict drift at the 'feature' or 'batch' level. For 'batch', the feature level
            p-values are combined using the detector's `correction` setting, which has to be
            'bonferroni' or 'fdr'.
        return_p_val
            Whether to return feature level p-values together with the threshold used.
        return_distance
            Whether to return the K-S statistic between the features of the new batch and reference data.

        Returns
        -------
        Dictionary containing 'meta' and 'data' dictionaries.
        'meta' has the model's metadata.
        'data' contains the drift prediction and optionally the feature level p-values,
        threshold after multivariate correction if needed and K-S statistics.

        Raises
        ------
        ValueError
            If `drift_type` is neither 'feature' nor 'batch', or if 'batch' is requested while the
            detector's `correction` is neither 'bonferroni' nor 'fdr'.
        """
        # compute drift scores
        p_vals, dist = self.score(X)

        # values below p-value threshold are drift
        if drift_type == 'feature':
            threshold = self.p_val
            drift_pred = (p_vals < threshold).astype(int)
        elif drift_type == 'batch':
            if self.correction == 'bonferroni':
                threshold = self.p_val / self.n_features
                drift_pred = int((p_vals < threshold).any())
            elif self.correction == 'fdr':
                drift_pred, threshold = fdr(p_vals, q_val=self.p_val)
            else:
                # report the setting that is actually wrong instead of blaming `drift_type`
                raise ValueError(
                    '`correction` needs to be either `bonferroni` or `fdr` '
                    'for `batch` level drift predictions.')
        else:
            raise ValueError(
                '`drift_type` needs to be either `feature` or `batch`.')

        # update reference dataset
        if (isinstance(self.update_X_ref, dict)
                and self.preprocess_fn is not None and self.preprocess_X_ref):
            X = self.preprocess_fn(X)
        self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
        # used for reservoir sampling; `len` also works for the list inputs the signature allows
        self.n += len(X)  # type: ignore

        # populate drift dict
        cd = concept_drift_dict()
        cd['meta'] = self.meta
        cd['data']['is_drift'] = drift_pred
        if return_p_val:
            cd['data']['p_val'] = p_vals
            cd['data']['threshold'] = threshold
        if return_distance:
            cd['data']['distance'] = dist
        return cd
Esempio n. 2
0
    def predict(self, x: Union[np.ndarray, list], return_p_val: bool = True,
                return_distance: bool = True, return_probs: bool = True, return_model: bool = True) \
            -> Dict[str, Dict[str, Union[str, int, float, Callable]]]:
        """
        Score a batch against the reference data and report whether it has drifted.

        Parameters
        ----------
        x
            Batch of instances.
        return_p_val
            If True, include the p-value of the test and the threshold it was compared against.
        return_distance
            If True, include a notion of strength of the drift
            (K-S test stat if binarize_preds=False, otherwise relative error reduction).
        return_probs
            If True, include the instance level classifier probabilities for the reference and
            test data (0=reference data, 1=test data).
        return_model
            If True, include the model trained to discriminate reference and test instances.

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift' and, depending on the flags, 'p_val' and 'threshold',
        'distance', 'probs_ref' and 'probs_test', and 'model'.
        """
        p_val, dist, probs_ref, probs_test = self.score(x)
        is_drift = int(p_val < self.p_val)

        # The batch is only preprocessed here when reference updating is configured via a dict,
        # a preprocessing function exists and `preprocess_x_ref` is set.
        needs_preprocessing = (
            isinstance(self.update_x_ref, dict)
            and self.preprocess_fn is not None
            and self.preprocess_x_ref
        )
        if needs_preprocessing:
            x = self.preprocess_fn(x)

        # TODO: TBD: can `x` still be a `list` at this point? `update_reference` and downstream
        # functions don't support list inputs, hence the arg-type ignore for mypy.
        self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)  # type: ignore[arg-type]
        # running instance count, used for reservoir sampling
        self.n += len(x)

        # assemble the result
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = is_drift
        if return_p_val:
            data['p_val'] = p_val
            data['threshold'] = self.p_val
        if return_distance:
            data['distance'] = dist
        if return_probs:
            data['probs_ref'] = probs_ref
            data['probs_test'] = probs_test
        if return_model:
            data['model'] = self.model
        return result
Esempio n. 3
0
    def predict(self,  # type: ignore[override]
                x: Union[np.ndarray, list], c: np.ndarray,
                return_p_val: bool = True, return_distance: bool = True, return_coupling: bool = False) \
            -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Score a batch, given its context, against the reference data and report whether it has drifted.

        Parameters
        ----------
        x
            Batch of instances.
        c
            Context associated with batch of instances.
        return_p_val
            If True, include the p-value of the permutation test and the threshold it was compared against.
        return_distance
            If True, include the conditional MMD test statistic between the new batch and reference
            data, together with the distance threshold returned by `score`.
        return_coupling
            If True, include the coupling matrices.

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift' and, depending on the flags, 'p_val' and 'threshold',
        'distance' and 'distance_threshold', and 'coupling_xx', 'coupling_yy' and 'coupling_xy'.
        """
        p_val, dist, distance_threshold, coupling = self.score(x, c)
        is_drift = int(p_val < self.p_val)

        # The batch is only preprocessed here when reference updating is configured via a dict,
        # a preprocessing function exists and `preprocess_x_ref` is set.
        needs_preprocessing = (
            isinstance(self.update_ref, dict)
            and self.preprocess_fn is not None
            and self.preprocess_x_ref
        )
        if needs_preprocessing:
            x = self.preprocess_fn(x)

        # instances and contexts are updated with the same counter and update rule
        self.x_ref = update_reference(self.x_ref, x, self.n, self.update_ref)  # type: ignore[arg-type]
        self.c_ref = update_reference(self.c_ref, c, self.n, self.update_ref)  # type: ignore[arg-type]
        # running instance count, used for reservoir sampling
        self.n += len(x)

        # assemble the result
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = is_drift
        if return_p_val:
            data['p_val'] = p_val
            data['threshold'] = self.p_val
        if return_distance:
            data['distance'] = dist
            data['distance_threshold'] = distance_threshold
        if return_coupling:
            # `coupling` is indexed positionally: 0 -> xx, 1 -> yy, 2 -> xy
            for position, key in enumerate(('coupling_xx', 'coupling_yy', 'coupling_xy')):
                data[key] = coupling[position]
        return result
Esempio n. 4
0
 def predict(
     self,
     X: np.ndarray,
     drift_type: str = "batch",
     return_p_val: bool = True
 ) -> Dict[Dict[str, str], Dict[str, np.ndarray]]:
     """
     Return a drift dictionary whose 'is_drift' entry is `self.expect_return_is_drift`.

     None of the arguments are inspected; the result depends only on the instance attribute.
     """
     result = concept_drift_dict()
     result["data"].update({"is_drift": self.expect_return_is_drift})
     return result
 def predict(
     self,
     X: np.ndarray,
     drift_type: str = "batch",
     return_p_val: bool = True
 ) -> Dict[Dict[str, str], Dict[str, np.ndarray]]:
     """
     Return a drift dictionary with a configurable 'is_drift' and fixed score entries.

     None of the arguments are inspected. 'is_drift' is taken from
     `self.expect_return_is_drift`; 'distance', 'p_val' and 'threshold' are constants.
     """
     result = concept_drift_dict()
     result["data"].update({
         "is_drift": self.expect_return_is_drift,
         "distance": [0.1, 0.2, 0.3],
         "p_val": [0.1, 0.2, 0.3],
         "threshold": 0.1,
     })
     return result
Esempio n. 6
0
    def predict(self, x: Union[np.ndarray, list], return_p_val: bool = True, return_distance: bool = True) \
            -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Score a batch against the reference data and report whether it has drifted.

        Parameters
        ----------
        x
            Batch of instances.
        return_p_val
            If True, include the p-value of the permutation test and the threshold it was compared against.
        return_distance
            If True, include the LSDD metric between the new batch and reference data, together
            with the distance threshold derived from the permutation distances.

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift' and, depending on the flags, 'p_val' and 'threshold',
        and 'distance' and 'distance_threshold'.
        """
        p_val, dist, dist_permutations = self.score(x)
        is_drift = int(p_val < self.p_val)

        # distance threshold: the permutation distance found at position int(p_val * n_permutations)
        # once the permutation distances are sorted in descending order
        cutoff = int(self.p_val * len(dist_permutations))
        descending = np.sort(dist_permutations)[::-1]
        distance_threshold = descending[cutoff]

        # Bring the batch into the form used for the reference update. Nothing is done when a
        # preprocessing function exists but `preprocess_x_ref` is not set.
        if isinstance(self.update_x_ref, dict):
            has_preprocess_fn = self.preprocess_fn is not None
            if has_preprocess_fn and self.preprocess_x_ref:
                x = self.preprocess_fn(x)
                x = self._normalize(x)  # type: ignore
            elif not has_preprocess_fn:
                x = self._normalize(x)  # type: ignore
        self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)
        # running instance count, used for reservoir sampling
        self.n += len(x)  # type: ignore

        # assemble the result
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = is_drift
        if return_p_val:
            data['p_val'] = p_val
            data['threshold'] = self.p_val
        if return_distance:
            data['distance'] = dist
            data['distance_threshold'] = distance_threshold
        return result
Esempio n. 7
0
    def predict(self, x: Union[np.ndarray, list], return_p_val: bool = True,
                return_distance: bool = True, return_kernel: bool = True) \
            -> Dict[Dict[str, str], Dict[str, Union[int, float, Callable]]]:
        """
        Score a batch against the reference data and report whether it has drifted.

        Parameters
        ----------
        x
            Batch of instances.
        return_p_val
            If True, include the p-value of the permutation test and the threshold it was compared against.
        return_distance
            If True, include the MMD metric between the new batch and reference data, together
            with the distance threshold returned by `score`.
        return_kernel
            If True, include the kernel trained to discriminate reference and test instances.

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift' and, depending on the flags, 'p_val' and 'threshold',
        'distance' and 'distance_threshold', and 'kernel'.
        """
        p_val, dist, distance_threshold = self.score(x)
        is_drift = int(p_val < self.p_val)

        # The batch is only preprocessed here when reference updating is configured via a dict,
        # a preprocessing function exists and `preprocess_x_ref` is set.
        needs_preprocessing = (
            isinstance(self.update_x_ref, dict)
            and self.preprocess_fn is not None
            and self.preprocess_x_ref
        )
        if needs_preprocessing:
            x = self.preprocess_fn(x)
        self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)  # type: ignore[arg-type]
        # running instance count, used for reservoir sampling
        self.n += len(x)

        # assemble the result
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = is_drift
        if return_p_val:
            data['p_val'] = p_val
            data['threshold'] = self.p_val
        if return_distance:
            data['distance'] = dist
            data['distance_threshold'] = distance_threshold
        if return_kernel:
            data['kernel'] = self.kernel
        return result
Esempio n. 8
0
    def predict(
        self,
        x_t: Union[np.ndarray, Any],
        return_test_stat: bool = True,
    ) -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Add one instance to the test-window and report whether the window has drifted.

        Parameters
        ----------
        x_t
            A single instance to be added to the test-window.
        return_test_stat
            If True, include the test statistic and the threshold it was compared against.

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift', 'time' and 'ert' and, if requested, 'test_stat'
        and 'threshold'.
        """
        # advance the time step before anything else
        self.t += 1

        # The preprocessing function is fed a batch of one: arrays get a leading axis, anything
        # else is wrapped in a list. The first element of its output is used from here on.
        if isinstance(self.preprocess_fn, Callable):  # type: ignore
            if isinstance(x_t, np.ndarray):
                single_batch = x_t[None, :]
            else:
                single_batch = [x_t]
            x_t = self.preprocess_fn(single_batch)[0]  # type: ignore

        # score the instance, then look up the threshold for the current time step
        test_stat = self.score(x_t)
        threshold = self.get_threshold(self.t)
        # a missing test statistic is treated as "no drift"
        if test_stat is None:
            drift_pred = 0
        else:
            drift_pred = int(test_stat > threshold)

        # append to the stored history of statistics and predictions
        latest_stat = np.array([test_stat])
        latest_pred = np.array([drift_pred])
        self.test_stats = np.concatenate([self.test_stats, latest_stat])
        self.drift_preds = np.concatenate([self.drift_preds, latest_pred])

        # assemble the result
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = drift_pred
        data['time'] = self.t
        data['ert'] = self.ert
        if return_test_stat:
            data['test_stat'] = test_stat
            data['threshold'] = threshold

        return result
Esempio n. 9
0
    def predict(
        self,
        x_t: Union[np.ndarray, Any],
        return_test_stat: bool = True,
    ) -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Add one instance to the test-window(s) and report whether they have drifted.

        Parameters
        ----------
        x_t
            A single instance to be added to the test-window(s).
        return_test_stat
            If True, include the test statistics and the thresholds they were compared against.

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift', 'time' and 'ert' and, if requested, 'test_stat'
        and 'threshold'.
        """
        test_stats = self.score(x_t)
        # thresholds are looked up at t-1, as we wish to use the unconditional thresholds
        thresholds = self.get_threshold(self.t - 1)
        drift_pred = self._check_drift(test_stats, thresholds)

        # append to the stored history; the statistics get a new leading axis so they stack
        # onto the existing record
        stacked_stats = test_stats[None, :, :]
        latest_pred = np.array([drift_pred])
        self.test_stats = np.concatenate([self.test_stats, stacked_stats])
        self.drift_preds = np.concatenate([self.drift_preds, latest_pred])

        # assemble the result
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = drift_pred
        data['time'] = self.t
        data['ert'] = self.ert
        if return_test_stat:
            data['test_stat'] = test_stats
            data['threshold'] = thresholds

        return result
Esempio n. 10
0
    def predict(self, x: np.ndarray,  return_p_val: bool = True,
                return_distance: bool = True) -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Score a batch against the reference data and report whether it has drifted.

        Parameters
        ----------
        x
            Batch of instances.
        return_p_val
            If True, include the p-value of the test and the threshold it was compared against.
        return_distance
            If True, include a notion of strength of the drift
            (K-S test stat if binarize_preds=False, otherwise relative error reduction).

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift' and, depending on the flags, 'p_val' and 'threshold',
        and 'distance'.
        """
        p_val, dist = self.score(x)
        is_drift = int(p_val < self.p_val)

        # The batch is only preprocessed here when reference updating is configured via a dict,
        # a preprocessing function exists and `preprocess_x_ref` is set.
        needs_preprocessing = (
            isinstance(self.update_x_ref, dict)
            and self.preprocess_fn is not None
            and self.preprocess_x_ref
        )
        if needs_preprocessing:
            x = self.preprocess_fn(x)
        self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)
        # running instance count, used for reservoir sampling
        self.n += x.shape[0]  # type: ignore

        # assemble the result
        # TODO: add instance level feedback
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = is_drift
        if return_p_val:
            data['p_val'] = p_val
            data['threshold'] = self.p_val
        if return_distance:
            data['distance'] = dist
        return result
Esempio n. 11
0
    def predict(self, X: Union[np.ndarray, list], return_p_val: bool = True,
                return_distance: bool = True) -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Predict whether a batch of data has drifted from the reference data.

        Parameters
        ----------
        X
            Batch of instances.
        return_p_val
            Whether to return the p-value of the permutation test and the threshold it was
            compared against.
        return_distance
            Whether to return the MMD metric between the new batch and reference data.

        Returns
        -------
        Dictionary containing 'meta' and 'data' dictionaries.
        'meta' has the model's metadata.
        'data' contains the drift prediction and optionally the p-value, threshold and MMD metric.
        """
        # compute drift scores
        p_val, dist = self.score(X)
        drift_pred = int(p_val < self.p_val)

        # update reference dataset
        if (isinstance(self.update_X_ref, dict) and self.preprocess_fn is not None
                and self.preprocess_X_ref):
            X = self.preprocess_fn(X)
        self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
        # used for reservoir sampling; `len` also works for the list inputs the signature allows,
        # whereas `X.shape[0]` would raise an AttributeError for them
        self.n += len(X)  # type: ignore

        # populate drift dict
        cd = concept_drift_dict()
        cd['meta'] = self.meta
        cd['data']['is_drift'] = drift_pred
        if return_p_val:
            cd['data']['p_val'] = p_val
            cd['data']['threshold'] = self.p_val
        if return_distance:
            cd['data']['distance'] = dist
        return cd
Esempio n. 12
0
    def predict(
        self,
        x_t: Union[np.ndarray, Any],
        return_test_stat: bool = True,
    ) -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Add one instance to the test-window and report whether the window has drifted.

        Parameters
        ----------
        x_t
            A single instance to be added to the test-window.
        return_test_stat
            If True, include the test statistic and the threshold it was compared against.

        Returns
        -------
        Dictionary with a 'meta' entry holding the detector's metadata and a 'data' entry.
        'data' always holds 'is_drift', 'time' and 'ert' and, if requested, 'test_stat'
        and 'threshold'.
        """
        # score the instance, then look up the threshold for the current time step
        test_stat = self.score(x_t)
        threshold = self.get_threshold(self.t)
        drift_pred = int(test_stat > threshold)

        # append to the stored history of statistics and predictions
        latest_stat = np.array([test_stat])
        latest_pred = np.array([drift_pred])
        self.test_stats = np.concatenate([self.test_stats, latest_stat])
        self.drift_preds = np.concatenate([self.drift_preds, latest_pred])

        # assemble the result
        result = concept_drift_dict()
        result['meta'] = self.meta
        data = result['data']
        data['is_drift'] = drift_pred
        data['time'] = self.t
        data['ert'] = self.ert
        if return_test_stat:
            data['test_stat'] = test_stat
            data['threshold'] = threshold

        return result
Esempio n. 13
0
    def predict(self, X: Union[np.ndarray, list], return_metric: bool = True) \
            -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
        """
        Predict whether a batch of data has drifted from the reference data.

        Parameters
        ----------
        X
            Batch of instances.
        return_metric
            Whether to return the drift metric from the detector, stored under the detector's
            `metric_name`, together with the threshold it was compared against.

        Returns
        -------
        Dictionary containing 'meta' and 'data' dictionaries.
        'meta' has the model's metadata.
        'data' contains the drift prediction and optionally the drift metric and threshold.
        """
        # compute drift scores; drift is flagged when the metric exceeds the threshold
        drift_metric = self.score(X)
        drift_pred = int(drift_metric > self.threshold)

        # update reference dataset
        if isinstance(self.update_X_ref, dict) and self.preprocess_fn is not None and self.preprocess_X_ref:
            X = self.preprocess_fn(X)
        self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
        # used for reservoir sampling; `len` also works for the list inputs the signature allows,
        # whereas `X.shape[0]` would raise an AttributeError for them
        self.n += len(X)  # type: ignore

        # populate drift dict
        cd = concept_drift_dict()
        cd['meta'] = self.meta
        cd['data']['is_drift'] = drift_pred
        if return_metric:
            cd['data'][self.metric_name] = drift_metric
            cd['data']['threshold'] = self.threshold
        return cd