def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        batch_size: int,
        eps_multiplier: float,
        ub_pct_poison,
        nb_classes: int,
    ) -> None:
        """
        Create an :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param batch_size: Size of batches.
        :param eps_multiplier: The multiplier applied to the expected poison percentage when setting the outlier
                               score cutoff.
        :param ub_pct_poison: Upper bound on the percentage of poisoned data expected in the dataset (between 0 and 1).
        :param nb_classes: Number of classes.
        """
        super().__init__(classifier, x_train, y_train)
        self.batch_size = batch_size
        self.eps_multiplier = eps_multiplier
        self.ub_pct_poison = ub_pct_poison
        self.nb_classes = nb_classes
        self.y_train_sparse = np.argmax(y_train, axis=1)
        self.evaluator = GroundTruthEvaluator()
        self._check_params()
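
# A minimal sketch (with made-up labels) of the label handling above: `np.argmax(y_train, axis=1)`
# turns one-hot label rows back into integer class indices, which is what the per-class splitting
# in this defence relies on.
import numpy as np

y_train_example = np.array([[0, 1, 0],   # class 1
                            [1, 0, 0],   # class 0
                            [0, 0, 1]])  # class 2
y_sparse_example = np.argmax(y_train_example, axis=1)  # -> array([1, 0, 2])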
Example 2
    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        expected_pp_poison: float = 0.33,
        batch_size: int = 128,
        eps_multiplier: float = 1.5,
    ) -> None:
        """
        Create an :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param expected_pp_poison: The expected percentage of poison in the dataset
        :param batch_size: The batch size for predictions
        :param eps_multiplier: The multiplier applied to the expected poison percentage when setting the score
                               cutoff. Values greater than one may increase the false positive rate but can detect
                               more poison samples
        """
        super().__init__(classifier, x_train, y_train)
        self.classifier: "CLASSIFIER_NEURALNETWORK_TYPE" = classifier
        self.batch_size = batch_size
        self.eps_multiplier = eps_multiplier
        self.expected_pp_poison = expected_pp_poison
        self.y_train = y_train
        self.evaluator = GroundTruthEvaluator()
        self._check_params()
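
# A small sketch of how the two defaults above combine into the score cutoff used later in
# detect_poison(): scores above the (1 - eps_multiplier * expected_pp_poison) quantile are
# flagged as suspicious. The score array here is made up; only the arithmetic is illustrated.
import numpy as np

eps_multiplier = 1.5
expected_pp_poison = 0.33
quantile = max(1 - eps_multiplier * expected_pp_poison, 0.0)   # 0.505 with the defaults above

scores = np.random.rand(1000)            # hypothetical per-sample outlier scores
score_cutoff = np.quantile(scores, quantile)
keep = scores < score_cutoff             # roughly the lowest ~50% are treated as clean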
Example 3
    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: Optional[np.ndarray],
        y_train: Optional[np.ndarray],
        generator: Optional[DataGenerator] = None,
    ) -> None:
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: A dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param generator: A data generator to be used instead of `x_train` and `y_train`.
        """
        super().__init__(classifier, x_train, y_train)
        self.nb_clusters = 2
        self.clustering_method = "KMeans"
        self.nb_dims = 10
        self.reduce = "PCA"
        self.cluster_analysis = "smaller"
        self.generator = generator
        self.activations_by_class: List[np.ndarray] = []
        self.clusters_by_class: List[np.ndarray] = []
        self.assigned_clean_by_class: List[np.ndarray] = []
        self.is_clean_by_class: List[np.ndarray] = []
        self.errors_by_class: List[np.ndarray] = []
        self.red_activations_by_class: List[np.ndarray] = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst: List[int] = []
        self.confidence_level: List[float] = []
        self.poisonous_clusters: List[List[np.ndarray]] = []
        self.clusterer = MiniBatchKMeans(n_clusters=self.nb_clusters)
        self._check_params()
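
# A hedged construction sketch for the generator-based variant above: when an ART DataGenerator
# is supplied, x_train and y_train may be passed as None and the defence pulls batches from the
# generator instead. `classifier` and `data_generator` are placeholders assumed to already exist;
# they are not defined in this snippet.
defence = ActivationDefence(classifier, None, None, generator=data_generator)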
Example 4
    def __init__(
        self,
        classifier: "CLASSIFIER_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        x_val: np.ndarray,
        y_val: np.ndarray,
        perf_func: Union[str, Callable] = "accuracy",
        pp_cal: float = 0.2,
        pp_quiz: float = 0.2,
        calibrated: bool = True,
        eps: float = 0.1,
    ):
        """
        Create an :class:`.RONIDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param x_val: Trusted data points.
        :param y_val: Trusted data labels.
        :param perf_func: Performance function to use.
        :param pp_cal: Percent of training data used for calibration.
        :param pp_quiz: Percent of training data used for quiz set.
        :param calibrated: True if using the calibrated form of RONI.
        :param eps: performance threshold if using uncalibrated RONI.
        """
        super().__init__(classifier, x_train, y_train)
        n_points = len(x_train)
        quiz_idx = np.random.randint(n_points, size=int(pp_quiz * n_points))
        self.calibrated = calibrated
        self.x_quiz = np.copy(self.x_train[quiz_idx])
        self.y_quiz = np.copy(self.y_train[quiz_idx])
        if self.calibrated:
            _, self.x_cal, _, self.y_cal = train_test_split(self.x_train,
                                                            self.y_train,
                                                            test_size=pp_cal,
                                                            shuffle=True)
        self.eps = eps
        self.evaluator = GroundTruthEvaluator()
        self.x_val = x_val
        self.y_val = y_val
        self.perf_func = perf_func
        self.is_clean_lst: List[int] = []
        self._check_params()
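
# A short sketch (hypothetical sizes) of the subsets built in the constructor above: pp_quiz of
# the training points are sampled (with replacement) as the quiz set and, when calibrated, pp_cal
# of the training data is held out as the calibration set.
import numpy as np
from sklearn.model_selection import train_test_split

n_points = 1000
pp_quiz, pp_cal = 0.2, 0.2
quiz_idx = np.random.randint(n_points, size=int(pp_quiz * n_points))   # 200 indices, possibly repeated

x_train_example = np.random.rand(n_points, 4)
y_train_example = np.eye(2)[np.random.randint(2, size=n_points)]
_, x_cal, _, y_cal = train_test_split(x_train_example, y_train_example, test_size=pp_cal, shuffle=True)
# x_cal / y_cal hold the 200 calibration points; the quiz set is x_train_example[quiz_idx].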
    def __init__(
        self,
        classifier: "Classifier",
        x_train: np.ndarray,
        y_train: np.ndarray,
        p_train: np.ndarray,
        x_val: Optional[np.ndarray] = None,
        y_val: Optional[np.ndarray] = None,
        eps: float = 0.2,
        perf_func: str = "accuracy",
        pp_valid: float = 0.2,
        **kwargs
    ) -> None:
        """
        Create an :class:`.ProvenanceDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: dataset used to train the classifier.
        :param y_train: labels used to train the classifier.
        :param p_train: provenance features for each training data point as one hot vectors.
        :param x_val: Validation data for defense.
        :param y_val: Validation labels for defense.
        :param eps: Threshold for performance shift in suspicious data.
        :param perf_func: performance function used to evaluate effectiveness of defense.
        :param pp_valid: The percent of training data to use as validation data (for defense without validation data).
        """
        super(ProvenanceDefense, self).__init__(classifier, x_train, y_train)
        self.p_train = p_train
        self.num_devices = self.p_train.shape[1]
        self.x_val = x_val
        self.y_val = y_val
        self.eps = eps
        self.perf_func = perf_func
        self.pp_valid = pp_valid
        self.assigned_clean_by_device: List[np.ndarray] = []
        self.is_clean_by_device: List[np.ndarray] = []
        self.errors_by_device: Optional[np.ndarray] = None
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst: Optional[np.ndarray] = None
        self._check_params()
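
# A minimal sketch (hypothetical data) of the provenance encoding expected above: p_train has one
# row per training point, one-hot over the device that supplied it, so the number of devices is
# simply p_train.shape[1].
import numpy as np

device_ids = np.array([0, 0, 1, 2, 2, 1])            # which device produced each training point
p_train_example = np.eye(3)[device_ids]              # shape (6, 3), one-hot provenance features
num_devices_example = p_train_example.shape[1]       # -> 3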
class SpectralSignatureDefense(PoisonFilteringDefence):
    """
    Method from Tran et al., 2018 performing poisoning detection based on Spectral Signatures
    """

    defence_params = PoisonFilteringDefence.defence_params + [
        "x_train",
        "y_train",
        "batch_size",
        "eps_multiplier",
        "ub_pct_poison",
        "nb_classes",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        batch_size: int,
        eps_multiplier: float,
        ub_pct_poison,
        nb_classes: int,
    ) -> None:
        """
        Create an :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param batch_size: Size of batches.
        :param eps_multiplier: The multiplier applied to the expected poison percentage when setting the outlier
                               score cutoff.
        :param ub_pct_poison: Upper bound on the percentage of poisoned data expected in the dataset (between 0 and 1).
        :param nb_classes: Number of classes.
        """
        super().__init__(classifier, x_train, y_train)
        self.batch_size = batch_size
        self.eps_multiplier = eps_multiplier
        self.ub_pct_poison = ub_pct_poison
        self.nb_classes = nb_classes
        self.y_train_sparse = np.argmax(y_train, axis=1)
        self.evaluator = GroundTruthEvaluator()
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError(
                "is_clean was not provided while invoking evaluate_defence.")
        is_clean_by_class = SpectralSignatureDefense.split_by_class(
            is_clean, self.y_train_sparse, self.nb_classes)
        _, predicted_clean = self.detect_poison()
        predicted_clean_by_class = SpectralSignatureDefense.split_by_class(
            predicted_clean, self.y_train_sparse, self.nb_classes)

        _, conf_matrix_json = self.evaluator.analyze_correctness(
            predicted_clean_by_class, is_clean_by_class)

        return conf_matrix_json

    def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
        """
        Returns poison detected and a report.

        :return: (report, is_clean_lst):
                where report is a dictionary whose keys are the indices of suspected poison samples and whose values
                are their outlier scores, and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is
                clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        self.set_params(**kwargs)

        nb_layers = len(self.classifier.layer_names)
        features_x_poisoned = self.classifier.get_activations(
            self.x_train, layer=nb_layers - 1, batch_size=self.batch_size)

        features_split = SpectralSignatureDefense.split_by_class(
            features_x_poisoned, self.y_train_sparse, self.nb_classes)
        score_by_class, keep_by_class = [], []
        for idx, feature in enumerate(features_split):
            score = SpectralSignatureDefense.spectral_signature_scores(feature)
            score_cutoff = np.quantile(
                score, max(1 - self.eps_multiplier * self.ub_pct_poison, 0.0))
            score_by_class.append(score)
            keep_by_class.append(score < score_cutoff)

        base_indices_by_class = SpectralSignatureDefense.split_by_class(
            np.arange(self.y_train_sparse.shape[0]),
            self.y_train_sparse,
            self.nb_classes,
        )
        is_clean_lst = np.zeros_like(self.y_train_sparse, dtype=int)
        report = {}

        for keep_booleans, all_scores, indices in zip(keep_by_class,
                                                      score_by_class,
                                                      base_indices_by_class):
            for keep_boolean, all_score, idx in zip(keep_booleans, all_scores,
                                                    indices):
                if keep_boolean:
                    is_clean_lst[idx] = 1
                else:
                    report[idx] = all_score[0]
        return report, is_clean_lst

    @staticmethod
    def spectral_signature_scores(matrix_r: np.ndarray) -> np.ndarray:
        """
        :param matrix_r: Matrix of feature representations.
        :return: Outlier scores for each observation based on spectral signature.
        """
        matrix_m = matrix_r - np.mean(matrix_r, axis=0)
        # Following Algorithm #1 in paper, use SVD of centered features, not of covariance
        _, _, matrix_v = np.linalg.svd(matrix_m, full_matrices=False)
        eigs = matrix_v[:1]
        score = np.matmul(matrix_m, np.transpose(eigs))**2
        return score

    @staticmethod
    def split_by_class(data: np.ndarray, labels: np.ndarray,
                       num_classes: int) -> List[np.ndarray]:
        """
        :param data: Features.
        :param labels: Labels, not in one-hot representations.
        :param num_classes: Number of classes of labels.
        :return: List of numpy arrays of features split by labels.
        """
        split: List[List[int]] = [[] for _ in range(num_classes)]
        for idx, label in enumerate(labels):
            split[int(label)].append(data[idx])
        return [np.asarray(dat) for dat in split]

    def _check_params(self) -> None:
        if self.batch_size < 0:
            raise ValueError(
                "Batch size must be positive integer. Unsupported batch size: "
                + str(self.batch_size))
        if self.eps_multiplier < 0:
            raise ValueError(
                "eps_multiplier must be positive. Unsupported value: " +
                str(self.eps_multiplier))
        if self.ub_pct_poison < 0 or self.ub_pct_poison > 1:
            raise ValueError(
                "ub_pct_poison must be between 0 and 1. Unsupported value: " +
                str(self.ub_pct_poison))
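
# A hedged end-to-end usage sketch for the class above. `classifier`, `x_train`, and `y_train`
# are placeholders assumed to already exist (an ART neural-network classifier wrapper and a
# one-hot-labelled training set); only the constructor signature and methods shown above are used.
defence = SpectralSignatureDefense(
    classifier,
    x_train,
    y_train,
    batch_size=128,
    eps_multiplier=1.5,
    ub_pct_poison=0.1,   # assumed upper bound on the fraction of poisoned samples
    nb_classes=10,
)
report, is_clean_lst = defence.detect_poison()
# With ground truth available, a JSON confusion matrix can be requested:
# conf_matrix_json = defence.evaluate_defence(is_clean=ground_truth)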
Example 7
class RONIDefense(PoisonFilteringDefence):
    """
    Close implementation of the Reject on Negative Impact (RONI) defense, based on the description in Nelson,
    'Behavior of Machine Learning Algorithms in Adversarial Environments', Ch. 4.4.

    | Textbook link: https://people.eecs.berkeley.edu/~adj/publications/paper-files/EECS-2010-140.pdf
    """

    defence_params = [
        "classifier",
        "x_train",
        "y_train",
        "x_val",
        "y_val",
        "perf_func",
        "calibrated",
        "eps",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        x_val: np.ndarray,
        y_val: np.ndarray,
        perf_func: Union[str, Callable] = "accuracy",
        pp_cal: float = 0.2,
        pp_quiz: float = 0.2,
        calibrated: bool = True,
        eps: float = 0.1,
    ):
        """
        Create an :class:`.RONIDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param x_val: Trusted data points.
        :param y_val: Trusted data labels.
        :param perf_func: Performance function to use.
        :param pp_cal: Percent of training data used for calibration.
        :param pp_quiz: Percent of training data used for quiz set.
        :param calibrated: True if using the calibrated form of RONI.
        :param eps: performance threshold if using uncalibrated RONI.
        """
        super().__init__(classifier, x_train, y_train)
        n_points = len(x_train)
        quiz_idx = np.random.randint(n_points, size=int(pp_quiz * n_points))
        self.calibrated = calibrated
        self.x_quiz = np.copy(self.x_train[quiz_idx])
        self.y_quiz = np.copy(self.y_train[quiz_idx])
        if self.calibrated:
            _, self.x_cal, _, self.y_cal = train_test_split(self.x_train,
                                                            self.y_train,
                                                            test_size=pp_cal,
                                                            shuffle=True)
        self.eps = eps
        self.evaluator = GroundTruthEvaluator()
        self.x_val = x_val
        self.y_val = y_val
        self.perf_func = perf_func
        self.is_clean_lst: List[int] = []
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        self.set_params(**kwargs)
        if len(self.is_clean_lst) == 0:
            self.detect_poison()

        if is_clean is None or len(is_clean) != len(self.is_clean_lst):
            raise ValueError("Invalid value for is_clean.")

        _, conf_matrix = self.evaluator.analyze_correctness(
            [self.is_clean_lst], [is_clean])  # type: ignore
        return conf_matrix

    def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :return: (report, is_clean_lst):
                where report is a dict object that contains information specified by the RONI detection method,
                and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        self.set_params(**kwargs)

        x_suspect = self.x_train
        y_suspect = self.y_train
        x_trusted = self.x_val
        y_trusted = self.y_val

        self.is_clean_lst = [1 for _ in range(len(x_suspect))]
        report = {}

        before_classifier = deepcopy(self.classifier)
        before_classifier.fit(x_suspect, y_suspect)

        for idx in np.random.permutation(len(x_suspect)):
            x_i = x_suspect[idx]
            y_i = y_suspect[idx]

            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=np.vstack([x_trusted, x_i]),
                                 y=np.vstack([y_trusted, y_i]))
            acc_shift = performance_diff(
                before_classifier,
                after_classifier,
                self.x_quiz,
                self.y_quiz,
                perf_function=self.perf_func,
            )
            if self.is_suspicious(before_classifier, acc_shift):
                self.is_clean_lst[idx] = 0
                report[idx] = acc_shift
            else:
                before_classifier = after_classifier
                x_trusted = np.vstack([x_trusted, x_i])
                y_trusted = np.vstack([y_trusted, y_i])

        return report, self.is_clean_lst

    def is_suspicious(self, before_classifier: "CLASSIFIER_TYPE",
                      perf_shift: float) -> bool:
        """
        Returns True if a given performance shift is suspicious

        :param before_classifier: The classifier without untrusted data.
        :param perf_shift: A shift in performance.
        :return: True if a given performance shift is suspicious, false otherwise.
        """
        if self.calibrated:
            median, std_dev = self.get_calibration_info(before_classifier)
            return perf_shift < median - 3 * std_dev

        return perf_shift < -self.eps

    def get_calibration_info(
            self, before_classifier: "CLASSIFIER_TYPE"
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Calculate the median and standard deviation of the accuracy shifts caused
        by the calibration set.

        :param before_classifier: The classifier trained without suspicious point.
        :return: A tuple consisting of `(median, std_dev)`.
        """
        accs = []

        for x_c, y_c in zip(self.x_cal, self.y_cal):
            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=np.vstack([self.x_val, x_c]),
                                 y=np.vstack([self.y_val, y_c]))
            accs.append(
                performance_diff(
                    before_classifier,
                    after_classifier,
                    self.x_quiz,
                    self.y_quiz,
                    perf_function=self.perf_func,
                ))

        return np.median(accs), np.std(accs)

    def _check_params(self) -> None:
        if len(self.x_train) != len(self.y_train):
            raise ValueError("`x_train` and `y_train` do not match shape.")

        if self.eps < 0:
            raise ValueError("Value of `eps` must be at least 0.")
Example 8
class SpectralSignatureDefense(PoisonFilteringDefence):
    """
    Method from Tran et al., 2018 performing poisoning detection based on Spectral Signatures
    """

    defence_params = PoisonFilteringDefence.defence_params + [
        "x_train",
        "y_train",
        "batch_size",
        "eps_multiplier",
        "expected_pp_poison",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        expected_pp_poison: float = 0.33,
        batch_size: int = 128,
        eps_multiplier: float = 1.5,
    ) -> None:
        """
        Create an :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param expected_pp_poison: The expected percentage of poison in the dataset
        :param batch_size: The batch size for predictions
        :param eps_multiplier: The multiplier applied to the expected poison percentage when setting the score
                               cutoff. Values greater than one may increase the false positive rate but can detect
                               more poison samples
        """
        super().__init__(classifier, x_train, y_train)
        self.classifier: "CLASSIFIER_NEURALNETWORK_TYPE" = classifier
        self.batch_size = batch_size
        self.eps_multiplier = eps_multiplier
        self.expected_pp_poison = expected_pp_poison
        self.y_train = y_train
        self.evaluator = GroundTruthEvaluator()
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError(
                "is_clean was not provided while invoking evaluate_defence.")
        is_clean_by_class = segment_by_class(is_clean, self.y_train,
                                             self.classifier.nb_classes)
        _, predicted_clean = self.detect_poison()
        predicted_clean_by_class = segment_by_class(predicted_clean,
                                                    self.y_train,
                                                    self.classifier.nb_classes)

        _, conf_matrix_json = self.evaluator.analyze_correctness(
            predicted_clean_by_class, is_clean_by_class)

        return conf_matrix_json

    def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
        """
        Returns poison detected and a report.

        :return: (report, is_clean_lst):
                where report is a dictionary whose keys are the indices of suspected poison samples and whose values
                are their outlier scores, and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is
                clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        self.set_params(**kwargs)

        if self.classifier.layer_names is not None:
            nb_layers = len(self.classifier.layer_names)
        else:
            raise ValueError("No layer names identified.")
        features_x_poisoned = self.classifier.get_activations(
            self.x_train, layer=nb_layers - 1, batch_size=self.batch_size)
        if not isinstance(features_x_poisoned, np.ndarray):
            raise ValueError("Wrong type detected.")

        if features_x_poisoned is not None:
            features_split = segment_by_class(features_x_poisoned,
                                              self.y_train,
                                              self.classifier.nb_classes)
        else:
            raise ValueError("Activation are `None`.")
        score_by_class = []
        keep_by_class = []

        for idx, feature in enumerate(features_split):
            # Check for empty list
            if len(feature):  # pylint: disable=C1801
                score = SpectralSignatureDefense.spectral_signature_scores(
                    np.vstack(feature))  # type: ignore
                score_cutoff = np.quantile(
                    score,
                    max(1 - self.eps_multiplier * self.expected_pp_poison,
                        0.0))
                score_by_class.append(score)
                keep_by_class.append(score < score_cutoff)
            else:
                score_by_class.append([0])  # type: ignore
                keep_by_class.append([True])

        base_indices_by_class = segment_by_class(
            np.arange(self.y_train.shape[0]),
            self.y_train,
            self.classifier.nb_classes,
        )
        is_clean_lst = [0] * self.y_train.shape[0]
        report = {}

        for keep_booleans, all_scores, indices in zip(keep_by_class,
                                                      score_by_class,
                                                      base_indices_by_class):
            for keep_boolean, all_score, idx in zip(keep_booleans, all_scores,
                                                    indices):
                if keep_boolean:
                    is_clean_lst[idx] = 1
                else:
                    report[idx] = all_score[0]

        return report, is_clean_lst

    def _check_params(self) -> None:
        if self.batch_size < 0:
            raise ValueError(
                "Batch size must be positive integer. Unsupported batch size: "
                + str(self.batch_size))
        if self.eps_multiplier < 0:
            raise ValueError(
                "eps_multiplier must be positive. Unsupported value: " +
                str(self.eps_multiplier))
        if self.expected_pp_poison < 0 or self.expected_pp_poison > 1:
            raise ValueError(
                "expected_pp_poison must be between 0 and 1. Unsupported value: "
                + str(self.expected_pp_poison))

    @staticmethod
    def spectral_signature_scores(matrix_r: np.ndarray) -> np.ndarray:
        """
        :param matrix_r: Matrix of feature representations.
        :return: Outlier scores for each observation based on spectral signature.
        """
        matrix_m = matrix_r - np.mean(matrix_r, axis=0)
        # Following Algorithm #1 in paper, use SVD of centered features, not of covariance
        _, _, matrix_v = np.linalg.svd(matrix_m, full_matrices=False)
        eigs = matrix_v[:1]
        corrs = np.matmul(eigs, np.transpose(matrix_r))
        score = np.expand_dims(np.linalg.norm(corrs, axis=0), axis=1)
        return score
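
# A standalone numeric sketch of the idea behind spectral_signature_scores above: after centering
# the feature matrix, the squared projection onto its top right singular vector is large for
# outlying (potentially poisoned) rows. The data here is synthetic and purely illustrative.
import numpy as np

rng = np.random.default_rng(0)
clean = rng.normal(0.0, 1.0, size=(95, 8))
poison = rng.normal(6.0, 1.0, size=(5, 8))          # a shifted cluster standing in for poison
features = np.vstack([clean, poison])

centered = features - features.mean(axis=0)
_, _, v_t = np.linalg.svd(centered, full_matrices=False)
scores = (centered @ v_t[0]) ** 2                    # squared projection on the top direction
print(np.argsort(scores)[-5:])                       # the five shifted rows get the top scores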
Example 9
class ActivationDefence(PoisonFilteringDefence):
    """
    Method from Chen et al., 2018 performing poisoning detection based on activations clustering.

    | Paper link: https://arxiv.org/abs/1811.03728

    | Please keep in mind the limitations of defences. For more information on the limitations of this
        defence, see https://arxiv.org/abs/1905.13409 . For details on how to evaluate classifier security
        in general, see https://arxiv.org/abs/1902.06705
    """

    defence_params = ["nb_clusters", "clustering_method", "nb_dims", "reduce", "cluster_analysis", "generator"]
    valid_clustering = ["KMeans"]
    valid_reduce = ["PCA", "FastICA", "TSNE"]
    valid_analysis = ["smaller", "distance", "relative-size", "silhouette-scores"]

    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to print a warning when activations are not enough

    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: Optional[np.ndarray],
        y_train: Optional[np.ndarray],
        generator: Optional[DataGenerator] = None,
    ) -> None:
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: A dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param generator: A data generator to be used instead of `x_train` and `y_train`.
        """
        super().__init__(classifier, x_train, y_train)
        self.nb_clusters = 2
        self.clustering_method = "KMeans"
        self.nb_dims = 10
        self.reduce = "PCA"
        self.cluster_analysis = "smaller"
        self.generator = generator
        self.activations_by_class: List[np.ndarray] = []
        self.clusters_by_class: List[np.ndarray] = []
        self.assigned_clean_by_class: List[np.ndarray] = []
        self.is_clean_by_class: List[np.ndarray] = []
        self.errors_by_class: List[np.ndarray] = []
        self.red_activations_by_class: List[np.ndarray] = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst: List[int] = []
        self.confidence_level: List[float] = []
        self.poisonous_clusters: List[List[np.ndarray]] = []
        self.clusterer = MiniBatchKMeans(n_clusters=self.nb_clusters)
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        self.set_params(**kwargs)

        if not self.activations_by_class and self.generator is None:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        (self.clusters_by_class, self.red_activations_by_class,) = self.cluster_activations()
        _, self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        if self.generator is not None:
            batch_size = self.generator.batch_size
            num_samples = self.generator.size
            num_classes = self.classifier.nb_classes
            self.is_clean_by_class = [np.empty(0, dtype=int) for _ in range(num_classes)]

            # calculate is_clean_by_class for each batch
            for batch_idx in range(num_samples // batch_size):  # type: ignore
                _, y_batch = self.generator.get_batch()
                is_clean_batch = is_clean[batch_idx * batch_size : batch_idx * batch_size + batch_size]
                clean_by_class_batch = self._segment_by_class(is_clean_batch, y_batch)
                self.is_clean_by_class = [
                    np.append(self.is_clean_by_class[class_idx], clean_by_class_batch[class_idx])
                    for class_idx in range(num_classes)
                ]

        else:
            self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_class, self.is_clean_by_class
        )
        return conf_matrix_json

    # pylint: disable=W0221
    def detect_poison(self, **kwargs) -> Tuple[Dict[str, Any], List[int]]:
        """
        Returns poison detected and a report.

        :param clustering_method: clustering algorithm to be used. Currently `KMeans` is the only method supported
        :type clustering_method: `str`
        :param nb_clusters: number of clusters to find. This value needs to be greater or equal to one
        :type nb_clusters: `int`
        :param reduce: method used to reduce dimensionality of the activations. Supported methods include  `PCA`,
                       `FastICA` and `TSNE`
        :type reduce: `str`
        :param nb_dims: number of dimensions to be reduced
        :type nb_dims: `int`
        :param cluster_analysis: heuristic to automatically determine if a cluster contains poisonous data. Supported
                                 methods include `smaller` and `distance`. The `smaller` method defines as poisonous the
                                 cluster with less number of data points, while the `distance` heuristic uses the
                                 distance between the clusters.
        :type cluster_analysis: `str`
        :return: (report, is_clean_lst):
                where report is a dict object that contains information specified by the clustering analysis
                technique, and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        old_nb_clusters = self.nb_clusters
        self.set_params(**kwargs)
        if self.nb_clusters != old_nb_clusters:
            self.clusterer = MiniBatchKMeans(n_clusters=self.nb_clusters)

        if self.generator is not None:
            self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
            report, self.assigned_clean_by_class = self.analyze_clusters()

            batch_size = self.generator.batch_size
            num_samples = self.generator.size
            self.is_clean_lst = []

            # loop through the generator to generate a report
            for _ in range(num_samples // batch_size):  # type: ignore
                _, y_batch = self.generator.get_batch()
                indices_by_class = self._segment_by_class(np.arange(batch_size), y_batch)
                is_clean_lst = [0] * batch_size
                for class_idx, idxs in enumerate(indices_by_class):
                    for idx_in_class, idx in enumerate(idxs):
                        is_clean_lst[idx] = self.assigned_clean_by_class[class_idx][idx_in_class]
                self.is_clean_lst += is_clean_lst
            return report, self.is_clean_lst

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)
        (self.clusters_by_class, self.red_activations_by_class,) = self.cluster_activations()
        report, self.assigned_clean_by_class = self.analyze_clusters()
        # Here, assigned_clean_by_class[i][j] is 1 if the jth data point in the ith class was
        # determined to be clean by activation cluster

        # Build an array that matches the original indexes of x_train
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train), self.y_train)
        self.is_clean_lst = [0] * n_train

        for assigned_clean, indices_dp in zip(self.assigned_clean_by_class, indices_by_class):
            for assignment, index_dp in zip(assigned_clean, indices_dp):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return report, self.is_clean_lst

    def cluster_activations(self, **kwargs) -> Tuple[List[List[int]], List[List[int]]]:
        """
        Clusters activations and returns cluster_by_class and red_activations_by_class, where cluster_by_class[i][j] is
        the cluster to which the j-th data point in the i-th class belongs, and red_activations_by_class[i][j] is the
        corresponding dimensionality-reduced activation.

        :param kwargs: A dictionary of cluster-specific parameters.
        :return: Clusters per class and activations by class.
        """
        self.set_params(**kwargs)

        if self.generator is not None:
            batch_size = self.generator.batch_size
            num_samples = self.generator.size
            num_classes = self.classifier.nb_classes
            for batch_idx in range(num_samples // batch_size):  # type: ignore
                x_batch, y_batch = self.generator.get_batch()

                batch_activations = self._get_activations(x_batch)
                activation_dim = batch_activations.shape[-1]

                # initialize values list of lists on first run
                if batch_idx == 0:
                    self.activations_by_class = [np.empty((0, activation_dim)) for _ in range(num_classes)]
                    self.clusters_by_class = [np.empty(0, dtype=int) for _ in range(num_classes)]
                    self.red_activations_by_class = [np.empty((0, self.nb_dims)) for _ in range(num_classes)]

                activations_by_class = self._segment_by_class(batch_activations, y_batch)
                clusters_by_class, red_activations_by_class = cluster_activations(
                    activations_by_class,
                    nb_clusters=self.nb_clusters,
                    nb_dims=self.nb_dims,
                    reduce=self.reduce,
                    clustering_method=self.clustering_method,
                    generator=self.generator,
                    clusterer_new=self.clusterer,
                )

                for class_idx in range(num_classes):
                    self.activations_by_class[class_idx] = np.vstack(
                        [self.activations_by_class[class_idx], activations_by_class[class_idx]]
                    )
                    self.clusters_by_class[class_idx] = np.append(
                        self.clusters_by_class[class_idx], clusters_by_class[class_idx]
                    )
                    self.red_activations_by_class[class_idx] = np.vstack(
                        [self.red_activations_by_class[class_idx], red_activations_by_class[class_idx]]
                    )
            return self.clusters_by_class, self.red_activations_by_class

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        [self.clusters_by_class, self.red_activations_by_class] = cluster_activations(
            self.activations_by_class,
            nb_clusters=self.nb_clusters,
            nb_dims=self.nb_dims,
            reduce=self.reduce,
            clustering_method=self.clustering_method,
        )

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs) -> Tuple[Dict[str, Any], np.ndarray]:
        """
        This function analyzes the clusters according to the provided method.

        :param kwargs: A dictionary of cluster-analysis-specific parameters.
        :return: (report, assigned_clean_by_class), where the report is a dict object and assigned_clean_by_class
                 is an array of arrays that contains which data points were classified as clean.
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        analyzer = ClusteringAnalyzer()
        if self.cluster_analysis == "smaller":
            (self.assigned_clean_by_class, self.poisonous_clusters, report,) = analyzer.analyze_by_size(
                self.clusters_by_class
            )
        elif self.cluster_analysis == "relative-size":
            (self.assigned_clean_by_class, self.poisonous_clusters, report,) = analyzer.analyze_by_relative_size(
                self.clusters_by_class
            )
        elif self.cluster_analysis == "distance":
            (self.assigned_clean_by_class, self.poisonous_clusters, report,) = analyzer.analyze_by_distance(
                self.clusters_by_class, separated_activations=self.red_activations_by_class,
            )
        elif self.cluster_analysis == "silhouette-scores":
            (self.assigned_clean_by_class, self.poisonous_clusters, report,) = analyzer.analyze_by_silhouette_score(
                self.clusters_by_class, reduced_activations_by_class=self.red_activations_by_class,
            )
        else:
            raise ValueError("Unsupported cluster analysis technique " + self.cluster_analysis)

        # Add to the report current parameters used to run the defence and the analysis summary
        report = dict(list(report.items()) + list(self.get_params().items()))

        return report, self.assigned_clean_by_class

    @staticmethod
    def relabel_poison_ground_truth(
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x: np.ndarray,
        y_fix: np.ndarray,
        test_set_split: float = 0.7,
        tolerable_backdoor: float = 0.01,
        max_epochs: int = 50,
        batch_epochs: int = 10,
    ) -> Tuple[float, "CLASSIFIER_NEURALNETWORK_TYPE"]:
        """
        Revert poison attack by continuing to train the current classifier with `x`, `y_fix`. `test_set_split` determines
        the percentage of `x` that will be used as the training set, while `1-test_set_split` determines how many data
        points to use for the test set.

        :param classifier: Classifier to be fixed.
        :param x: Samples.
        :param y_fix: True label of `x_poison`.
        :param test_set_split: This parameter determines how much data goes to the training set.
               Here `test_set_split*len(y_fix)` determines the number of data points in `x_train`
               and `(1-test_set_split) * len(y_fix)` the number of data points in `x_test`.
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :param max_epochs: Maximum number of epochs that the model will be trained.
        :param batch_epochs: Number of epochs to be trained before checking current state of model.
        :return: (improve_factor, classifier).
        """
        # Split data into testing and training:
        n_train = int(len(x) * test_set_split)
        x_train, x_test = x[:n_train], x[n_train:]
        y_train, y_test = y_fix[:n_train], y_fix[n_train:]

        filename = "original_classifier" + str(time.time()) + ".p"
        ActivationDefence._pickle_classifier(classifier, filename)

        # Now train using y_fix:
        improve_factor, _ = train_remove_backdoor(
            classifier,
            x_train,
            y_train,
            x_test,
            y_test,
            tolerable_backdoor=tolerable_backdoor,
            max_epochs=max_epochs,
            batch_epochs=batch_epochs,
        )

        # Only update classifier if there was an improvement:
        if improve_factor < 0:
            classifier = ActivationDefence._unpickle_classifier(filename)
            return 0, classifier

        ActivationDefence._remove_pickle(filename)
        return improve_factor, classifier

    @staticmethod
    def relabel_poison_cross_validation(
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x: np.ndarray,
        y_fix: np.ndarray,
        n_splits: int = 10,
        tolerable_backdoor: float = 0.01,
        max_epochs: int = 50,
        batch_epochs: int = 10,
    ) -> Tuple[float, "CLASSIFIER_NEURALNETWORK_TYPE"]:
        """
        Revert poison attack by continuing to train the current classifier with `x`, `y_fix`. `n_splits` determines the
        number of cross validation splits.

        :param classifier: Classifier to be fixed.
        :param x: Samples that were mislabeled.
        :param y_fix: True label of `x`.
        :param n_splits: Determines how many splits to use in cross validation (only used if `cross_validation=True`).
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :param max_epochs: Maximum number of epochs that the model will be trained.
        :param batch_epochs: Number of epochs to be trained before checking current state of model.
        :return: (improve_factor, classifier)
        """
        # pylint: disable=E0001
        from sklearn.model_selection import KFold

        # Train using cross validation
        k_fold = KFold(n_splits=n_splits, random_state=None, shuffle=True)

        filename = "original_classifier" + str(time.time()) + ".p"
        ActivationDefence._pickle_classifier(classifier, filename)
        curr_improvement = 0

        for train_index, test_index in k_fold.split(x):
            # Obtain partition:
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y_fix[train_index], y_fix[test_index]
            # Unpickle original model:
            curr_classifier = ActivationDefence._unpickle_classifier(filename)

            new_improvement, fixed_classifier = train_remove_backdoor(
                curr_classifier,
                x_train,
                y_train,
                x_test,
                y_test,
                tolerable_backdoor=tolerable_backdoor,
                max_epochs=max_epochs,
                batch_epochs=batch_epochs,
            )
            if curr_improvement < new_improvement and new_improvement > 0:
                curr_improvement = new_improvement
                classifier = fixed_classifier
                logger.info("Selected as best model so far: %s", curr_improvement)

        ActivationDefence._remove_pickle(filename)
        return curr_improvement, classifier

    @staticmethod
    def _pickle_classifier(classifier: "CLASSIFIER_NEURALNETWORK_TYPE", file_name: str) -> None:
        """
        Pickles the self.classifier and stores it using the provided file_name in folder `art.ART_DATA_PATH`.

        :param classifier: Classifier to be pickled.
        :param file_name: Name of the file where the classifier will be pickled.
        """
        full_path = os.path.join(ART_DATA_PATH, file_name)
        folder = os.path.split(full_path)[0]
        if not os.path.exists(folder):
            os.makedirs(folder)

        with open(full_path, "wb") as f_classifier:
            pickle.dump(classifier, f_classifier)

    @staticmethod
    def _unpickle_classifier(file_name: str) -> "CLASSIFIER_NEURALNETWORK_TYPE":
        """
        Unpickles classifier using the filename provided. Function assumes that the pickle is in `art.ART_DATA_PATH`.

        :param file_name: Path of the pickled classifier relative to `ART_DATA_PATH`.
        :return: The loaded classifier.
        """
        full_path = os.path.join(ART_DATA_PATH, file_name)
        logger.info("Loading classifier from %s", full_path)
        with open(full_path, "rb") as f_classifier:
            loaded_classifier = pickle.load(f_classifier)
            return loaded_classifier

    @staticmethod
    def _remove_pickle(file_name: str) -> None:
        """
        Erases the pickle with the provided file name.

        :param file_name: File name without directory.
        """
        full_path = os.path.join(ART_DATA_PATH, file_name)
        os.remove(full_path)

    def visualize_clusters(
        self, x_raw: np.ndarray, save: bool = True, folder: str = ".", **kwargs
    ) -> List[List[List[np.ndarray]]]:
        """
        This function creates the sprite/mosaic visualization for clusters. When save=True,
        it also stores a sprite (mosaic) per cluster in ART_DATA_PATH.

        :param x_raw: Images used to train the classifier (before pre-processing).
        :param save: Boolean specifying if image should be saved.
        :param folder: Directory where the sprites will be saved inside ART_DATA_PATH folder.
        :param kwargs: a dictionary of cluster-analysis-specific parameters.
        :return: Array with sprite images sprites_by_class, where sprites_by_class[i][j] contains the
                                  sprite of class i cluster j.
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        x_raw_by_class = self._segment_by_class(x_raw, self.y_train)
        x_raw_by_cluster: List[List[List[np.ndarray]]] = [
            [[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)
        ]

        # Get all data in x_raw in the right cluster
        for n_class, cluster in enumerate(self.clusters_by_class):
            for j, assigned_cluster in enumerate(cluster):
                x_raw_by_cluster[n_class][assigned_cluster].append(x_raw_by_class[n_class][j])

        # Now create sprites:
        sprites_by_class: List[List[List[np.ndarray]]] = [
            [[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)
        ]
        for i, class_i in enumerate(x_raw_by_cluster):
            for j, images_cluster in enumerate(class_i):
                title = "Class_" + str(i) + "_cluster_" + str(j) + "_clusterSize_" + str(len(images_cluster))
                f_name = title + ".png"
                f_name = os.path.join(folder, f_name)
                sprite = create_sprite(np.array(images_cluster))
                if save:
                    save_image(sprite, f_name)
                sprites_by_class[i][j] = sprite

        return sprites_by_class

    def plot_clusters(self, save: bool = True, folder: str = ".", **kwargs) -> None:
        """
        Creates a 3D plot to visualize each cluster; each cluster is assigned a different color in the plot. When
        save=True, it also stores the 3D plot per cluster in ART_DATA_PATH.

        :param save: Boolean specifying if image should be saved.
        :param folder: Directory where the sprites will be saved inside ART_DATA_PATH folder.
        :param kwargs: a dictionary of cluster-analysis-specific parameters.
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        # Get activations reduced to 3-components:
        separated_reduced_activations = []
        for activation in self.activations_by_class:
            reduced_activations = reduce_dimensionality(activation, nb_dims=3)
            separated_reduced_activations.append(reduced_activations)

        # For each class generate a plot:
        for class_id, (labels, coordinates) in enumerate(zip(self.clusters_by_class, separated_reduced_activations)):
            f_name = ""
            if save:
                f_name = os.path.join(folder, "plot_class_" + str(class_id) + ".png")
            plot_3d(coordinates, labels, save=save, f_name=f_name)

    def _check_params(self):
        if self.nb_clusters <= 1:
            raise ValueError(
                "Wrong number of clusters, should be greater or equal to 2. Provided: " + str(self.nb_clusters)
            )
        if self.nb_dims <= 0:
            raise ValueError("Wrong number of dimensions.")
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " + self.clustering_method)
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + self.reduce)
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError("Unsupported method for cluster analysis method: " + self.cluster_analysis)
        if self.generator and not isinstance(self.generator, DataGenerator):
            raise TypeError("Generator must a an instance of DataGenerator")

    def _get_activations(self, x_train: Optional[np.ndarray] = None) -> np.ndarray:
        """
        Find activations from :class:`.Classifier`.
        """
        logger.info("Getting activations")

        nb_layers = len(self.classifier.layer_names)
        protected_layer = nb_layers - 1

        if self.generator is not None:
            activations = self.classifier.get_activations(
                x_train, layer=protected_layer, batch_size=self.generator.batch_size
            )
        else:
            activations = self.classifier.get_activations(self.x_train, layer=protected_layer, batch_size=128)

        # Note: self.classifier.predict(self.x_train) would return predictions, not activations.
        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            logger.warning(
                "Number of activations in last hidden layer is too small. Method may not work properly. " "Size: %s",
                str(nodes_last_layer),
            )
        return activations

    def _segment_by_class(self, data: np.ndarray, features: np.ndarray) -> List[np.ndarray]:
        """
        Returns segmented data according to specified features.

        :param data: Data to be segmented.
        :param features: Features used to segment data, e.g., segment according to predicted label or to `y_train`.
        :return: Segmented data according to specified features.
        """
        n_classes = self.classifier.nb_classes
        return segment_by_class(data, features, n_classes)
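
# A hedged usage sketch for ActivationDefence above; `classifier`, `x_train`, and `y_train` are
# placeholders assumed to already exist. detect_poison() accepts the clustering parameters
# documented in its docstring, and analyze_clusters() can rerun the analysis with a different
# heuristic without repeating the clustering itself.
defence = ActivationDefence(classifier, x_train, y_train)
report, is_clean_lst = defence.detect_poison(
    nb_clusters=2,
    nb_dims=10,
    reduce="PCA",
    cluster_analysis="smaller",
)
# Optionally re-analyze the same clusters with the distance heuristic:
report_distance, _ = defence.analyze_clusters(cluster_analysis="distance")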
Example 10
class ProvenanceDefense(PoisonFilteringDefence):
    """
    Implements methods performing poisoning detection based on data provenance.

    | Paper link: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8473440
    """

    defence_params = [
        "classifier",
        "x_train",
        "y_train",
        "p_train",
        "x_val",
        "y_val",
        "eps",
        "perf_func",
        "pp_valid",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        p_train: np.ndarray,
        x_val: Optional[np.ndarray] = None,
        y_val: Optional[np.ndarray] = None,
        eps: float = 0.2,
        perf_func: str = "accuracy",
        pp_valid: float = 0.2,
    ) -> None:
        """
        Create an :class:`.ProvenanceDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: dataset used to train the classifier.
        :param y_train: labels used to train the classifier.
        :param p_train: provenance features for each training data point as one hot vectors.
        :param x_val: Validation data for defense.
        :param y_val: Validation labels for defense.
        :param eps: Threshold for performance shift in suspicious data.
        :param perf_func: performance function used to evaluate effectiveness of defense.
        :param pp_valid: The percent of training data to use as validation data (for defense without validation data).
        """
        super().__init__(classifier, x_train, y_train)
        self.p_train = p_train
        self.num_devices = self.p_train.shape[1]
        self.x_val = x_val
        self.y_val = y_val
        self.eps = eps
        self.perf_func = perf_func
        self.pp_valid = pp_valid
        self.assigned_clean_by_device: List[np.ndarray] = []
        self.is_clean_by_device: List[np.ndarray] = []
        self.errors_by_device: Optional[np.ndarray] = None
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst: Optional[np.ndarray] = None
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError(
                "is_clean was not provided while invoking evaluate_defence.")
        self.set_params(**kwargs)

        if not self.assigned_clean_by_device:
            self.detect_poison()

        self.is_clean_by_device = segment_by_class(is_clean, self.p_train,
                                                   self.num_devices)
        self.errors_by_device, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_device, self.is_clean_by_device)
        return conf_matrix_json

    def detect_poison(self, **kwargs) -> Tuple[Dict[int, float], List[int]]:
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :return: (report, is_clean_lst):
                where report is a dict object that contains information specified by the provenance detection
                method, and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if self.x_val is None:
            report = self.detect_poison_untrusted()
        else:
            report = self.detect_poison_partially_trusted()

        n_train = len(self.x_train)
        indices_by_provenance = segment_by_class(np.arange(n_train),
                                                 self.p_train,
                                                 self.num_devices)
        self.is_clean_lst = np.array([1] * n_train)

        for device in report:
            self.is_clean_lst[
                indices_by_provenance[device]] = 0  # type: ignore
        self.assigned_clean_by_device = segment_by_class(
            np.array(self.is_clean_lst), self.p_train, self.num_devices)

        return report, self.is_clean_lst  # type: ignore

    def detect_poison_partially_trusted(self, **kwargs) -> Dict[int, float]:
        """
        Detect poison given trusted validation data

        :return: dictionary where keys are suspected poisonous device indices and values are performance differences
        """
        self.set_params(**kwargs)

        if self.x_val is None or self.y_val is None:
            raise ValueError("Trusted data unavailable.")

        suspected = {}
        unfiltered_data = np.copy(self.x_train)
        unfiltered_labels = np.copy(self.y_train)

        segments = segment_by_class(self.x_train, self.p_train,
                                    self.num_devices)
        for device_idx, segment in enumerate(segments):
            filtered_data, filtered_labels = self.filter_input(
                unfiltered_data, unfiltered_labels, segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(unfiltered_data, unfiltered_labels)
            filtered_model.fit(filtered_data, filtered_labels)

            var_w = performance_diff(
                filtered_model,
                unfiltered_model,
                self.x_val,
                self.y_val,
                perf_function=self.perf_func,
            )
            if self.eps < var_w:
                suspected[device_idx] = var_w
                unfiltered_data = filtered_data
                unfiltered_labels = filtered_labels

        return suspected

    def detect_poison_untrusted(self, **kwargs) -> Dict[int, float]:
        """
        Detect poison given no trusted validation data

        :return: dictionary where keys are suspected poisonous device indices and values are performance differences
        """
        self.set_params(**kwargs)

        suspected = {}
        (
            train_data,
            valid_data,
            train_labels,
            valid_labels,
            train_prov,
            valid_prov,
        ) = train_test_split(self.x_train,
                             self.y_train,
                             self.p_train,
                             test_size=self.pp_valid)

        train_segments = segment_by_class(train_data, train_prov,
                                          self.num_devices)
        valid_segments = segment_by_class(valid_data, valid_prov,
                                          self.num_devices)

        for device_idx, (train_segment, valid_segment) in enumerate(
                zip(train_segments, valid_segments)):
            filtered_data, filtered_labels = self.filter_input(
                train_data, train_labels, train_segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(train_data, train_labels)
            filtered_model.fit(filtered_data, filtered_labels)

            valid_non_device_data, valid_non_device_labels = self.filter_input(
                valid_data, valid_labels, valid_segment)
            var_w = performance_diff(
                filtered_model,
                unfiltered_model,
                valid_non_device_data,
                valid_non_device_labels,
                perf_function=self.perf_func,
            )

            if self.eps < var_w:
                suspected[device_idx] = var_w
                train_data = filtered_data
                train_labels = filtered_labels
                valid_data = valid_non_device_data
                valid_labels = valid_non_device_labels

        return suspected

    @staticmethod
    def filter_input(data: np.ndarray, labels: np.ndarray,
                     segment: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return the data and labels that are not part of a specified segment

        :param data: The data to segment.
        :param labels: The corresponding labels to segment
        :param segment: The subset of data points (e.g. those supplied by a single device) to exclude.
        :return: Tuple of (filtered_data, filtered_labels).
        """
        filter_mask = np.array([
            np.isin(data[i, :], segment, invert=True).any()
            for i in range(data.shape[0])
        ])
        filtered_data = data[filter_mask]
        filtered_labels = labels[filter_mask]

        return filtered_data, filtered_labels

    def _check_params(self) -> None:
        if self.eps < 0:
            raise ValueError("Value of epsilon must be at least 0.")

        if self.pp_valid < 0:
            raise ValueError("Value of pp_valid must be at least 0.")

        if len(self.x_train) != len(self.y_train):
            raise ValueError("x_train and y_train do not match in shape.")

        if len(self.x_train) != len(self.p_train):
            raise ValueError("Provenance features do not match data.")