class SpectralSignatureDefense(PoisonFilteringDefence):
    """
    Method from Tran et al., 2018 performing poisoning detection based on Spectral Signatures.
    """

    defence_params = PoisonFilteringDefence.defence_params + [
        "x_train",
        "y_train",
        "batch_size",
        "eps_multiplier",
        "ub_pct_poison",
        "nb_classes",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        batch_size: int,
        eps_multiplier: float,
        ub_pct_poison: float,
        nb_classes: int,
    ) -> None:
        """
        Create a :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param batch_size: Size of batches.
        :param eps_multiplier: Multiplier applied to the expected poison percentage when computing the score cutoff.
        :param ub_pct_poison: Upper bound on the percentage of poison expected in the dataset, between 0 and 1.
        :param nb_classes: Number of classes.
        """
        super().__init__(classifier, x_train, y_train)
        self.batch_size = batch_size
        self.eps_multiplier = eps_multiplier
        self.ub_pct_poison = ub_pct_poison
        self.nb_classes = nb_classes
        self.y_train_sparse = np.argmax(y_train, axis=1)
        self.evaluator = GroundTruthEvaluator()
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        is_clean_by_class = SpectralSignatureDefense.split_by_class(is_clean, self.y_train_sparse, self.nb_classes)
        _, predicted_clean = self.detect_poison()
        predicted_clean_by_class = SpectralSignatureDefense.split_by_class(
            predicted_clean, self.y_train_sparse, self.nb_classes
        )

        _, conf_matrix_json = self.evaluator.analyze_correctness(predicted_clean_by_class, is_clean_by_class)

        return conf_matrix_json

    def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
        """
        Returns poison detected and a report.

        :return: (report, is_clean_lst):
                 where report is a dictionary containing the indices as keys and the outlier scores of suspected
                 poisons as values,
                 and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        self.set_params(**kwargs)

        nb_layers = len(self.classifier.layer_names)
        features_x_poisoned = self.classifier.get_activations(
            self.x_train, layer=nb_layers - 1, batch_size=self.batch_size
        )
        features_split = SpectralSignatureDefense.split_by_class(
            features_x_poisoned, self.y_train_sparse, self.nb_classes
        )

        score_by_class, keep_by_class = [], []
        for feature in features_split:
            score = SpectralSignatureDefense.spectral_signature_scores(feature)
            score_cutoff = np.quantile(score, max(1 - self.eps_multiplier * self.ub_pct_poison, 0.0))
            score_by_class.append(score)
            keep_by_class.append(score < score_cutoff)

        base_indices_by_class = SpectralSignatureDefense.split_by_class(
            np.arange(self.y_train_sparse.shape[0]),
            self.y_train_sparse,
            self.nb_classes,
        )
        is_clean_lst = np.zeros_like(self.y_train_sparse, dtype=int)
        report = {}

        for keep_booleans, all_scores, indices in zip(keep_by_class, score_by_class, base_indices_by_class):
            for keep_boolean, all_score, idx in zip(keep_booleans, all_scores, indices):
                if keep_boolean:
                    is_clean_lst[idx] = 1
                else:
                    report[idx] = all_score[0]

        return report, is_clean_lst

    @staticmethod
    def spectral_signature_scores(matrix_r: np.ndarray) -> np.ndarray:
        """
        :param matrix_r: Matrix of feature representations.
        :return: Outlier scores for each observation based on spectral signature.
        """
        matrix_m = matrix_r - np.mean(matrix_r, axis=0)
        # Following Algorithm #1 in the paper, use the SVD of the centered features, not of the covariance
        _, _, matrix_v = np.linalg.svd(matrix_m, full_matrices=False)
        eigs = matrix_v[:1]
        score = np.matmul(matrix_m, np.transpose(eigs)) ** 2
        return score

    @staticmethod
    def split_by_class(data: np.ndarray, labels: np.ndarray, num_classes: int) -> List[np.ndarray]:
        """
        :param data: Features.
        :param labels: Labels, not in one-hot representation.
        :param num_classes: Number of classes of labels.
        :return: List of numpy arrays of features split by labels.
        """
        split: List[List[int]] = [[] for _ in range(num_classes)]
        for idx, label in enumerate(labels):
            split[int(label)].append(data[idx])
        return [np.asarray(dat) for dat in split]

    def _check_params(self) -> None:
        if self.batch_size <= 0:
            raise ValueError("Batch size must be a positive integer. Unsupported batch size: " + str(self.batch_size))
        if self.eps_multiplier < 0:
            raise ValueError("eps_multiplier must be positive. Unsupported value: " + str(self.eps_multiplier))
        if self.ub_pct_poison < 0 or self.ub_pct_poison > 1:
            raise ValueError("ub_pct_poison must be between 0 and 1. Unsupported value: " + str(self.ub_pct_poison))
class RONIDefense(PoisonFilteringDefence):
    """
    Close implementation based on the description in Nelson, 'Behavior of Machine Learning Algorithms in
    Adversarial Environments', Ch. 4.4.

    | Textbook link: https://people.eecs.berkeley.edu/~adj/publications/paper-files/EECS-2010-140.pdf
    """

    defence_params = [
        "classifier",
        "x_train",
        "y_train",
        "x_val",
        "y_val",
        "perf_func",
        "calibrated",
        "eps",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        x_val: np.ndarray,
        y_val: np.ndarray,
        perf_func: Union[str, Callable] = "accuracy",
        pp_cal: float = 0.2,
        pp_quiz: float = 0.2,
        calibrated: bool = True,
        eps: float = 0.1,
    ):
        """
        Create a :class:`.RONIDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param x_val: Trusted data points.
        :param y_val: Trusted data labels.
        :param perf_func: Performance function to use.
        :param pp_cal: Percent of training data used for calibration.
        :param pp_quiz: Percent of training data used for the quiz set.
        :param calibrated: True if using the calibrated form of RONI.
        :param eps: Performance threshold if using uncalibrated RONI.
        """
        super().__init__(classifier, x_train, y_train)
        n_points = len(x_train)
        quiz_idx = np.random.randint(n_points, size=int(pp_quiz * n_points))
        self.calibrated = calibrated
        self.x_quiz = np.copy(self.x_train[quiz_idx])
        self.y_quiz = np.copy(self.y_train[quiz_idx])
        if self.calibrated:
            _, self.x_cal, _, self.y_cal = train_test_split(
                self.x_train, self.y_train, test_size=pp_cal, shuffle=True
            )
        self.eps = eps
        self.evaluator = GroundTruthEvaluator()
        self.x_val = x_val
        self.y_val = y_val
        self.perf_func = perf_func
        self.is_clean_lst: List[int] = []
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        self.set_params(**kwargs)
        if len(self.is_clean_lst) == 0:
            self.detect_poison()

        if is_clean is None or len(is_clean) != len(self.is_clean_lst):
            raise ValueError("Invalid value for is_clean.")

        _, conf_matrix = self.evaluator.analyze_correctness([self.is_clean_lst], [is_clean])  # type: ignore
        return conf_matrix

    def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :return: (report, is_clean_lst):
                 where report is a dict object that contains information specified by the detection method,
                 and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        self.set_params(**kwargs)

        x_suspect = self.x_train
        y_suspect = self.y_train
        x_trusted = self.x_val
        y_trusted = self.y_val

        self.is_clean_lst = [1 for _ in range(len(x_suspect))]
        report = {}

        before_classifier = deepcopy(self.classifier)
        before_classifier.fit(x_suspect, y_suspect)

        for idx in np.random.permutation(len(x_suspect)):
            x_i = x_suspect[idx]
            y_i = y_suspect[idx]

            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=np.vstack([x_trusted, x_i]), y=np.vstack([y_trusted, y_i]))
            acc_shift = performance_diff(
                before_classifier,
                after_classifier,
                self.x_quiz,
                self.y_quiz,
                perf_function=self.perf_func,
            )
            if self.is_suspicious(before_classifier, acc_shift):
                self.is_clean_lst[idx] = 0
                report[idx] = acc_shift
            else:
                before_classifier = after_classifier
                x_trusted = np.vstack([x_trusted, x_i])
                y_trusted = np.vstack([y_trusted, y_i])

        return report, self.is_clean_lst

    def is_suspicious(self, before_classifier: "CLASSIFIER_TYPE", perf_shift: float) -> bool:
        """
        Returns True if a given performance shift is suspicious.

        :param before_classifier: The classifier trained without the untrusted data.
        :param perf_shift: A shift in performance.
        :return: True if the performance shift is suspicious, false otherwise.
        """
        if self.calibrated:
            median, std_dev = self.get_calibration_info(before_classifier)
            return perf_shift < median - 3 * std_dev

        return perf_shift < -self.eps

    def get_calibration_info(self, before_classifier: "CLASSIFIER_TYPE") -> Tuple[np.ndarray, np.ndarray]:
        """
        Calculate the median and standard deviation of the accuracy shifts caused by the calibration set.

        :param before_classifier: The classifier trained without the suspicious point.
        :return: A tuple consisting of `(median, std_dev)`.
        """
        accs = []

        for x_c, y_c in zip(self.x_cal, self.y_cal):
            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=np.vstack([self.x_val, x_c]), y=np.vstack([self.y_val, y_c]))
            accs.append(
                performance_diff(
                    before_classifier,
                    after_classifier,
                    self.x_quiz,
                    self.y_quiz,
                    perf_function=self.perf_func,
                )
            )

        return np.median(accs), np.std(accs)

    def _check_params(self) -> None:
        if len(self.x_train) != len(self.y_train):
            raise ValueError("`x_train` and `y_train` do not match in shape.")

        if self.eps < 0:
            raise ValueError("Value of `eps` must be at least 0.")
class ActivationDefence(PoisonFilteringDefence):
    """
    Method from Chen et al., 2018 performing poisoning detection based on activations clustering.

    | Paper link: https://arxiv.org/abs/1811.03728

    | Please keep in mind the limitations of defences. For more information on the limitations of this defence,
        see https://arxiv.org/abs/1905.13409 . For details on how to evaluate classifier security in general,
        see https://arxiv.org/abs/1902.06705
    """

    defence_params = ["nb_clusters", "clustering_method", "nb_dims", "reduce", "cluster_analysis", "generator"]
    valid_clustering = ["KMeans"]
    valid_reduce = ["PCA", "FastICA", "TSNE"]
    valid_analysis = ["smaller", "distance", "relative-size", "silhouette-scores"]

    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to print a warning when the number of activations is too small

    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: Optional[np.ndarray],
        y_train: Optional[np.ndarray],
        generator: Optional[DataGenerator] = None,
    ) -> None:
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: A dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param generator: A data generator to be used instead of `x_train` and `y_train`.
        """
        super().__init__(classifier, x_train, y_train)
        self.nb_clusters = 2
        self.clustering_method = "KMeans"
        self.nb_dims = 10
        self.reduce = "PCA"
        self.cluster_analysis = "smaller"
        self.generator = generator
        self.activations_by_class: List[np.ndarray] = []
        self.clusters_by_class: List[np.ndarray] = []
        self.assigned_clean_by_class: List[np.ndarray] = []
        self.is_clean_by_class: List[np.ndarray] = []
        self.errors_by_class: List[np.ndarray] = []
        self.red_activations_by_class: List[np.ndarray] = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst: List[int] = []
        self.confidence_level: List[float] = []
        self.poisonous_clusters: List[List[np.ndarray]] = []
        self.clusterer = MiniBatchKMeans(n_clusters=self.nb_clusters)
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")
        self.set_params(**kwargs)

        if not self.activations_by_class and self.generator is None:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        _, self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        if self.generator is not None:
            batch_size = self.generator.batch_size
            num_samples = self.generator.size
            num_classes = self.classifier.nb_classes
            self.is_clean_by_class = [np.empty(0, dtype=int) for _ in range(num_classes)]

            # calculate is_clean_by_class for each batch
            for batch_idx in range(num_samples // batch_size):  # type: ignore
                _, y_batch = self.generator.get_batch()
                is_clean_batch = is_clean[batch_idx * batch_size : batch_idx * batch_size + batch_size]
                clean_by_class_batch = self._segment_by_class(is_clean_batch, y_batch)
                self.is_clean_by_class = [
                    np.append(self.is_clean_by_class[class_idx], clean_by_class_batch[class_idx])
                    for class_idx in range(num_classes)
                ]
        else:
            self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)

        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_class, self.is_clean_by_class
        )
        return conf_matrix_json

    # pylint: disable=W0221
    def detect_poison(self, **kwargs) -> Tuple[Dict[str, Any], List[int]]:
        """
        Returns poison detected and a report.

        :param clustering_method: clustering algorithm to be used. Currently `KMeans` is the only method supported.
        :type clustering_method: `str`
        :param nb_clusters: number of clusters to find. This value needs to be greater than or equal to two.
        :type nb_clusters: `int`
        :param reduce: method used to reduce dimensionality of the activations. Supported methods include `PCA`,
                       `FastICA` and `TSNE`.
        :type reduce: `str`
        :param nb_dims: number of dimensions to be reduced.
        :type nb_dims: `int`
        :param cluster_analysis: heuristic to automatically determine if a cluster contains poisonous data. Supported
                                 methods include `smaller` and `distance`. The `smaller` method defines as poisonous
                                 the cluster with fewer data points, while the `distance` heuristic uses the distance
                                 between the clusters.
        :type cluster_analysis: `str`
        :return: (report, is_clean_lst):
                 where report is a dict object that contains information specified by the clustering analysis
                 technique,
                 and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        old_nb_clusters = self.nb_clusters
        self.set_params(**kwargs)
        if self.nb_clusters != old_nb_clusters:
            self.clusterer = MiniBatchKMeans(n_clusters=self.nb_clusters)

        if self.generator is not None:
            self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
            report, self.assigned_clean_by_class = self.analyze_clusters()

            batch_size = self.generator.batch_size
            num_samples = self.generator.size
            self.is_clean_lst = []

            # loop through the generator to generate a report
            for _ in range(num_samples // batch_size):  # type: ignore
                _, y_batch = self.generator.get_batch()
                indices_by_class = self._segment_by_class(np.arange(batch_size), y_batch)
                is_clean_lst = [0] * batch_size
                for class_idx, idxs in enumerate(indices_by_class):
                    for idx_in_class, idx in enumerate(idxs):
                        is_clean_lst[idx] = self.assigned_clean_by_class[class_idx][idx_in_class]
                self.is_clean_lst += is_clean_lst
            return report, self.is_clean_lst

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)
        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        report, self.assigned_clean_by_class = self.analyze_clusters()
        # Here, assigned_clean_by_class[i][j] is 1 if the jth data point in the ith class was
        # determined to be clean by activation clustering

        # Build an array that matches the original indexes of x_train
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train), self.y_train)
        self.is_clean_lst = [0] * n_train

        for assigned_clean, indices_dp in zip(self.assigned_clean_by_class, indices_by_class):
            for assignment, index_dp in zip(assigned_clean, indices_dp):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return report, self.is_clean_lst

    def cluster_activations(self, **kwargs) -> Tuple[List[List[int]], List[List[int]]]:
        """
        Clusters activations and returns cluster_by_class and red_activations_by_class, where cluster_by_class[i][j]
        is the cluster to which the j-th data point in the i-th class belongs, and red_activations_by_class[i][j]
        contains the corresponding activations reduced by class.

        :param kwargs: A dictionary of cluster-specific parameters.
        :return: Clusters per class and activations by class.
        """
        self.set_params(**kwargs)

        if self.generator is not None:
            batch_size = self.generator.batch_size
            num_samples = self.generator.size
            num_classes = self.classifier.nb_classes

            for batch_idx in range(num_samples // batch_size):  # type: ignore
                x_batch, y_batch = self.generator.get_batch()

                batch_activations = self._get_activations(x_batch)
                activation_dim = batch_activations.shape[-1]

                # initialize values list of lists on first run
                if batch_idx == 0:
                    self.activations_by_class = [np.empty((0, activation_dim)) for _ in range(num_classes)]
                    self.clusters_by_class = [np.empty(0, dtype=int) for _ in range(num_classes)]
                    self.red_activations_by_class = [np.empty((0, self.nb_dims)) for _ in range(num_classes)]

                activations_by_class = self._segment_by_class(batch_activations, y_batch)
                clusters_by_class, red_activations_by_class = cluster_activations(
                    activations_by_class,
                    nb_clusters=self.nb_clusters,
                    nb_dims=self.nb_dims,
                    reduce=self.reduce,
                    clustering_method=self.clustering_method,
                    generator=self.generator,
                    clusterer_new=self.clusterer,
                )

                for class_idx in range(num_classes):
                    self.activations_by_class[class_idx] = np.vstack(
                        [self.activations_by_class[class_idx], activations_by_class[class_idx]]
                    )
                    self.clusters_by_class[class_idx] = np.append(
                        self.clusters_by_class[class_idx], clusters_by_class[class_idx]
                    )
                    self.red_activations_by_class[class_idx] = np.vstack(
                        [self.red_activations_by_class[class_idx], red_activations_by_class[class_idx]]
                    )
            return self.clusters_by_class, self.red_activations_by_class

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = cluster_activations(
            self.activations_by_class,
            nb_clusters=self.nb_clusters,
            nb_dims=self.nb_dims,
            reduce=self.reduce,
            clustering_method=self.clustering_method,
        )

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs) -> Tuple[Dict[str, Any], np.ndarray]:
        """
        This function analyzes the clusters according to the provided method.

        :param kwargs: A dictionary of cluster-analysis-specific parameters.
        :return: (report, assigned_clean_by_class), where the report is a dict object and assigned_clean_by_class
                 is an array of arrays that contains which data points were classified as clean.
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        analyzer = ClusteringAnalyzer()

        if self.cluster_analysis == "smaller":
            (
                self.assigned_clean_by_class,
                self.poisonous_clusters,
                report,
            ) = analyzer.analyze_by_size(self.clusters_by_class)
        elif self.cluster_analysis == "relative-size":
            (
                self.assigned_clean_by_class,
                self.poisonous_clusters,
                report,
            ) = analyzer.analyze_by_relative_size(self.clusters_by_class)
        elif self.cluster_analysis == "distance":
            (
                self.assigned_clean_by_class,
                self.poisonous_clusters,
                report,
            ) = analyzer.analyze_by_distance(
                self.clusters_by_class,
                separated_activations=self.red_activations_by_class,
            )
        elif self.cluster_analysis == "silhouette-scores":
            (
                self.assigned_clean_by_class,
                self.poisonous_clusters,
                report,
            ) = analyzer.analyze_by_silhouette_score(
                self.clusters_by_class,
                reduced_activations_by_class=self.red_activations_by_class,
            )
        else:
            raise ValueError("Unsupported cluster analysis technique " + self.cluster_analysis)

        # Add to the report the current parameters used to run the defence and the analysis summary
        report = dict(list(report.items()) + list(self.get_params().items()))

        return report, self.assigned_clean_by_class

    @staticmethod
    def relabel_poison_ground_truth(
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x: np.ndarray,
        y_fix: np.ndarray,
        test_set_split: float = 0.7,
        tolerable_backdoor: float = 0.01,
        max_epochs: int = 50,
        batch_epochs: int = 10,
    ) -> Tuple[float, "CLASSIFIER_NEURALNETWORK_TYPE"]:
        """
        Revert poison attack by continuing to train the current classifier with `x`, `y_fix`. `test_set_split`
        determines the percentage of x that will be used as the training set, while `1-test_set_split` determines
        how many data points to use for the test set.

        :param classifier: Classifier to be fixed.
        :param x: Samples.
        :param y_fix: True label of `x_poison`.
        :param test_set_split: This parameter determines how much data goes to the training set. Here,
               `test_set_split*len(y_fix)` determines the number of data points in `x_train` and
               `(1-test_set_split) * len(y_fix)` the number of data points in `x_test`.
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :param max_epochs: Maximum number of epochs that the model will be trained.
        :param batch_epochs: Number of epochs to be trained before checking current state of model.
        :return: (improve_factor, classifier).
        """
        # Split data into testing and training:
        n_train = int(len(x) * test_set_split)
        x_train, x_test = x[:n_train], x[n_train:]
        y_train, y_test = y_fix[:n_train], y_fix[n_train:]

        filename = "original_classifier" + str(time.time()) + ".p"
        ActivationDefence._pickle_classifier(classifier, filename)

        # Now train using y_fix:
        improve_factor, _ = train_remove_backdoor(
            classifier,
            x_train,
            y_train,
            x_test,
            y_test,
            tolerable_backdoor=tolerable_backdoor,
            max_epochs=max_epochs,
            batch_epochs=batch_epochs,
        )

        # Only update the classifier if there was an improvement:
        if improve_factor < 0:
            classifier = ActivationDefence._unpickle_classifier(filename)
            return 0, classifier

        ActivationDefence._remove_pickle(filename)
        return improve_factor, classifier

    @staticmethod
    def relabel_poison_cross_validation(
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x: np.ndarray,
        y_fix: np.ndarray,
        n_splits: int = 10,
        tolerable_backdoor: float = 0.01,
        max_epochs: int = 50,
        batch_epochs: int = 10,
    ) -> Tuple[float, "CLASSIFIER_NEURALNETWORK_TYPE"]:
        """
        Revert poison attack by continuing to train the current classifier with `x`, `y_fix`. `n_splits` determines
        the number of cross validation splits.

        :param classifier: Classifier to be fixed.
        :param x: Samples that were mislabeled.
        :param y_fix: True label of `x`.
        :param n_splits: Determines how many splits to use in cross validation (only used if `cross_validation=True`).
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :param max_epochs: Maximum number of epochs that the model will be trained.
        :param batch_epochs: Number of epochs to be trained before checking current state of model.
        :return: (improve_factor, classifier)
        """
        # pylint: disable=E0001
        from sklearn.model_selection import KFold

        # Train using cross validation
        k_fold = KFold(n_splits=n_splits, shuffle=True)

        filename = "original_classifier" + str(time.time()) + ".p"
        ActivationDefence._pickle_classifier(classifier, filename)

        curr_improvement = 0

        for train_index, test_index in k_fold.split(x):
            # Obtain partition:
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y_fix[train_index], y_fix[test_index]
            # Unpickle original model:
            curr_classifier = ActivationDefence._unpickle_classifier(filename)

            new_improvement, fixed_classifier = train_remove_backdoor(
                curr_classifier,
                x_train,
                y_train,
                x_test,
                y_test,
                tolerable_backdoor=tolerable_backdoor,
                max_epochs=max_epochs,
                batch_epochs=batch_epochs,
            )
            if curr_improvement < new_improvement and new_improvement > 0:
                curr_improvement = new_improvement
                classifier = fixed_classifier
                logger.info("Selected as best model so far: %s", curr_improvement)

        ActivationDefence._remove_pickle(filename)
        return curr_improvement, classifier

    @staticmethod
    def _pickle_classifier(classifier: "CLASSIFIER_NEURALNETWORK_TYPE", file_name: str) -> None:
        """
        Pickles the self.classifier and stores it using the provided file_name in folder `art.ART_DATA_PATH`.

        :param classifier: Classifier to be pickled.
        :param file_name: Name of the file where the classifier will be pickled.
        """
        full_path = os.path.join(ART_DATA_PATH, file_name)
        folder = os.path.split(full_path)[0]
        if not os.path.exists(folder):
            os.makedirs(folder)

        with open(full_path, "wb") as f_classifier:
            pickle.dump(classifier, f_classifier)

    @staticmethod
    def _unpickle_classifier(file_name: str) -> "CLASSIFIER_NEURALNETWORK_TYPE":
        """
        Unpickles classifier using the filename provided. Function assumes that the pickle is in
        `art.ART_DATA_PATH`.

        :param file_name: Path of the pickled classifier relative to `ART_DATA_PATH`.
        :return: The loaded classifier.
        """
        full_path = os.path.join(ART_DATA_PATH, file_name)
        logger.info("Loading classifier from %s", full_path)
        with open(full_path, "rb") as f_classifier:
            loaded_classifier = pickle.load(f_classifier)
            return loaded_classifier

    @staticmethod
    def _remove_pickle(file_name: str) -> None:
        """
        Erases the pickle with the provided file name.

        :param file_name: File name without directory.
        """
        full_path = os.path.join(ART_DATA_PATH, file_name)
        os.remove(full_path)

    def visualize_clusters(
        self, x_raw: np.ndarray, save: bool = True, folder: str = ".", **kwargs
    ) -> List[List[List[np.ndarray]]]:
        """
        This function creates the sprite/mosaic visualization for clusters. When save=True, it also stores a
        sprite (mosaic) per cluster in ART_DATA_PATH.

        :param x_raw: Images used to train the classifier (before pre-processing).
        :param save: Boolean specifying if image should be saved.
        :param folder: Directory where the sprites will be saved inside ART_DATA_PATH folder.
        :param kwargs: A dictionary of cluster-analysis-specific parameters.
        :return: Array with sprite images sprites_by_class, where sprites_by_class[i][j] contains the
                 sprite of class i cluster j.
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        x_raw_by_class = self._segment_by_class(x_raw, self.y_train)
        x_raw_by_cluster: List[List[List[np.ndarray]]] = [
            [[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)
        ]

        # Get all data in x_raw in the right cluster
        for n_class, cluster in enumerate(self.clusters_by_class):
            for j, assigned_cluster in enumerate(cluster):
                x_raw_by_cluster[n_class][assigned_cluster].append(x_raw_by_class[n_class][j])

        # Now create sprites:
        sprites_by_class: List[List[List[np.ndarray]]] = [
            [[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)
        ]
        for i, class_i in enumerate(x_raw_by_cluster):
            for j, images_cluster in enumerate(class_i):
                title = "Class_" + str(i) + "_cluster_" + str(j) + "_clusterSize_" + str(len(images_cluster))
                f_name = title + ".png"
                f_name = os.path.join(folder, f_name)
                sprite = create_sprite(np.array(images_cluster))
                if save:
                    save_image(sprite, f_name)
                sprites_by_class[i][j] = sprite

        return sprites_by_class

    def plot_clusters(self, save: bool = True, folder: str = ".", **kwargs) -> None:
        """
        Creates a 3D-plot to visualize each cluster; each cluster is assigned a different color in the plot. When
        save=True, it also stores the 3D-plot per cluster in ART_DATA_PATH.

        :param save: Boolean specifying if image should be saved.
        :param folder: Directory where the sprites will be saved inside ART_DATA_PATH folder.
        :param kwargs: A dictionary of cluster-analysis-specific parameters.
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        # Get activations reduced to 3 components:
        separated_reduced_activations = []
        for activation in self.activations_by_class:
            reduced_activations = reduce_dimensionality(activation, nb_dims=3)
            separated_reduced_activations.append(reduced_activations)

        # For each class generate a plot:
        for class_id, (labels, coordinates) in enumerate(zip(self.clusters_by_class, separated_reduced_activations)):
            f_name = ""
            if save:
                f_name = os.path.join(folder, "plot_class_" + str(class_id) + ".png")
            plot_3d(coordinates, labels, save=save, f_name=f_name)

    def _check_params(self):
        if self.nb_clusters <= 1:
            raise ValueError(
                "Wrong number of clusters, should be greater or equal to 2. Provided: " + str(self.nb_clusters)
            )
        if self.nb_dims <= 0:
            raise ValueError("Wrong number of dimensions.")
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " + self.clustering_method)
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + self.reduce)
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError("Unsupported method for cluster analysis: " + self.cluster_analysis)
        if self.generator and not isinstance(self.generator, DataGenerator):
            raise TypeError("Generator must be an instance of DataGenerator.")

    def _get_activations(self, x_train: Optional[np.ndarray] = None) -> np.ndarray:
        """
        Find activations from :class:`.Classifier`.
        """
        logger.info("Getting activations")

        nb_layers = len(self.classifier.layer_names)
        protected_layer = nb_layers - 1

        if self.generator is not None:
            activations = self.classifier.get_activations(
                x_train, layer=protected_layer, batch_size=self.generator.batch_size
            )
        else:
            activations = self.classifier.get_activations(self.x_train, layer=protected_layer, batch_size=128)

        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            logger.warning(
                "Number of activations in last hidden layer is too small. Method may not work properly. Size: %s",
                str(nodes_last_layer),
            )
        return activations

    def _segment_by_class(self, data: np.ndarray, features: np.ndarray) -> List[np.ndarray]:
        """
        Returns segmented data according to specified features.

        :param data: Data to be segmented.
        :param features: Features used to segment data, e.g., segment according to predicted label or to `y_train`.
        :return: Segmented data according to specified features.
        """
        n_classes = self.classifier.nb_classes
        return segment_by_class(data, features, n_classes)
class SpectralSignatureDefense(PoisonFilteringDefence):
    """
    Method from Tran et al., 2018 performing poisoning detection based on Spectral Signatures.
    """

    defence_params = PoisonFilteringDefence.defence_params + [
        "x_train",
        "y_train",
        "batch_size",
        "eps_multiplier",
        "expected_pp_poison",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_NEURALNETWORK_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        expected_pp_poison: float = 0.33,
        batch_size: int = 128,
        eps_multiplier: float = 1.5,
    ) -> None:
        """
        Create a :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param expected_pp_poison: The expected percentage of poison in the dataset.
        :param batch_size: The batch size for predictions.
        :param eps_multiplier: The multiplier to add to the previous expectation. Numbers higher than one represent
                               a potentially higher false positive rate, but may detect more poison samples.
        """
        super().__init__(classifier, x_train, y_train)
        self.classifier: "CLASSIFIER_NEURALNETWORK_TYPE" = classifier
        self.batch_size = batch_size
        self.eps_multiplier = eps_multiplier
        self.expected_pp_poison = expected_pp_poison
        self.y_train = y_train
        self.evaluator = GroundTruthEvaluator()
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        is_clean_by_class = segment_by_class(is_clean, self.y_train, self.classifier.nb_classes)
        _, predicted_clean = self.detect_poison()
        predicted_clean_by_class = segment_by_class(predicted_clean, self.y_train, self.classifier.nb_classes)

        _, conf_matrix_json = self.evaluator.analyze_correctness(predicted_clean_by_class, is_clean_by_class)

        return conf_matrix_json

    def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
        """
        Returns poison detected and a report.

        :return: (report, is_clean_lst):
                 where report is a dictionary containing the indices as keys and the outlier scores of suspected
                 poisons as values,
                 and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """
        self.set_params(**kwargs)

        if self.classifier.layer_names is not None:
            nb_layers = len(self.classifier.layer_names)
        else:
            raise ValueError("No layer names identified.")

        features_x_poisoned = self.classifier.get_activations(
            self.x_train, layer=nb_layers - 1, batch_size=self.batch_size
        )
        if not isinstance(features_x_poisoned, np.ndarray):
            raise ValueError("Wrong type detected.")

        features_split = segment_by_class(features_x_poisoned, self.y_train, self.classifier.nb_classes)

        score_by_class = []
        keep_by_class = []
        for feature in features_split:
            # Check for empty list
            if len(feature):  # pylint: disable=C1801
                score = SpectralSignatureDefense.spectral_signature_scores(np.vstack(feature))  # type: ignore
                score_cutoff = np.quantile(score, max(1 - self.eps_multiplier * self.expected_pp_poison, 0.0))
                score_by_class.append(score)
                keep_by_class.append(score < score_cutoff)
            else:
                score_by_class.append([0])  # type: ignore
                keep_by_class.append([True])

        base_indices_by_class = segment_by_class(
            np.arange(self.y_train.shape[0]),
            self.y_train,
            self.classifier.nb_classes,
        )
        is_clean_lst = [0] * self.y_train.shape[0]
        report = {}

        for keep_booleans, all_scores, indices in zip(keep_by_class, score_by_class, base_indices_by_class):
            for keep_boolean, all_score, idx in zip(keep_booleans, all_scores, indices):
                if keep_boolean:
                    is_clean_lst[idx] = 1
                else:
                    report[idx] = all_score[0]

        return report, is_clean_lst

    def _check_params(self) -> None:
        if self.batch_size <= 0:
            raise ValueError("Batch size must be a positive integer. Unsupported batch size: " + str(self.batch_size))
        if self.eps_multiplier < 0:
            raise ValueError("eps_multiplier must be positive. Unsupported value: " + str(self.eps_multiplier))
        if self.expected_pp_poison < 0 or self.expected_pp_poison > 1:
            raise ValueError(
                "expected_pp_poison must be between 0 and 1. Unsupported value: " + str(self.expected_pp_poison)
            )

    @staticmethod
    def spectral_signature_scores(matrix_r: np.ndarray) -> np.ndarray:
        """
        :param matrix_r: Matrix of feature representations.
        :return: Outlier scores for each observation based on spectral signature.
        """
        matrix_m = matrix_r - np.mean(matrix_r, axis=0)
        # Following Algorithm #1 in the paper, use the SVD of the centered features, not of the covariance
        _, _, matrix_v = np.linalg.svd(matrix_m, full_matrices=False)
        eigs = matrix_v[:1]
        corrs = np.matmul(eigs, np.transpose(matrix_r))
        score = np.expand_dims(np.linalg.norm(corrs, axis=0), axis=1)
        return score
class ProvenanceDefense(PoisonFilteringDefence):
    """
    Implements methods performing poisoning detection based on data provenance.

    | Paper link: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8473440
    """

    defence_params = [
        "classifier",
        "x_train",
        "y_train",
        "p_train",
        "x_val",
        "y_val",
        "eps",
        "perf_func",
        "pp_valid",
    ]

    def __init__(
        self,
        classifier: "CLASSIFIER_TYPE",
        x_train: np.ndarray,
        y_train: np.ndarray,
        p_train: np.ndarray,
        x_val: Optional[np.ndarray] = None,
        y_val: Optional[np.ndarray] = None,
        eps: float = 0.2,
        perf_func: str = "accuracy",
        pp_valid: float = 0.2,
    ) -> None:
        """
        Create a :class:`.ProvenanceDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: Dataset used to train the classifier.
        :param y_train: Labels used to train the classifier.
        :param p_train: Provenance features for each training data point as one-hot vectors.
        :param x_val: Validation data for the defense.
        :param y_val: Validation labels for the defense.
        :param eps: Threshold for performance shift in suspicious data.
        :param perf_func: Performance function used to evaluate the effectiveness of the defense.
        :param pp_valid: The percent of training data to use as validation data (for defense without validation
               data).
        """
        super().__init__(classifier, x_train, y_train)
        self.p_train = p_train
        self.num_devices = self.p_train.shape[1]
        self.x_val = x_val
        self.y_val = y_val
        self.eps = eps
        self.perf_func = perf_func
        self.pp_valid = pp_valid
        self.assigned_clean_by_device: List[np.ndarray] = []
        self.is_clean_by_device: List[np.ndarray] = []
        self.errors_by_device: Optional[np.ndarray] = None
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst: Optional[np.ndarray] = None
        self._check_params()

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")
        self.set_params(**kwargs)

        if not self.assigned_clean_by_device:
            self.detect_poison()

        self.is_clean_by_device = segment_by_class(is_clean, self.p_train, self.num_devices)
        self.errors_by_device, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_device, self.is_clean_by_device
        )
        return conf_matrix_json

    def detect_poison(self, **kwargs) -> Tuple[Dict[int, float], List[int]]:
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :return: (report, is_clean_lst):
                 where report is a dict object that contains information specified by the provenance detection
                 method,
                 and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if self.x_val is None:
            report = self.detect_poison_untrusted()
        else:
            report = self.detect_poison_partially_trusted()

        n_train = len(self.x_train)
        indices_by_provenance = segment_by_class(np.arange(n_train), self.p_train, self.num_devices)
        self.is_clean_lst = np.array([1] * n_train)

        for device in report:
            self.is_clean_lst[indices_by_provenance[device]] = 0  # type: ignore
        self.assigned_clean_by_device = segment_by_class(np.array(self.is_clean_lst), self.p_train, self.num_devices)

        return report, self.is_clean_lst  # type: ignore

    def detect_poison_partially_trusted(self, **kwargs) -> Dict[int, float]:
        """
        Detect poison given trusted validation data.

        :return: Dictionary where keys are suspected poisonous device indices and values are performance differences.
        """
        self.set_params(**kwargs)

        if self.x_val is None or self.y_val is None:
            raise ValueError("Trusted data unavailable.")

        suspected = {}
        unfiltered_data = np.copy(self.x_train)
        unfiltered_labels = np.copy(self.y_train)

        segments = segment_by_class(self.x_train, self.p_train, self.num_devices)
        for device_idx, segment in enumerate(segments):
            filtered_data, filtered_labels = self.filter_input(unfiltered_data, unfiltered_labels, segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(unfiltered_data, unfiltered_labels)
            filtered_model.fit(filtered_data, filtered_labels)

            var_w = performance_diff(
                filtered_model,
                unfiltered_model,
                self.x_val,
                self.y_val,
                perf_function=self.perf_func,
            )
            if self.eps < var_w:
                suspected[device_idx] = var_w
                unfiltered_data = filtered_data
                unfiltered_labels = filtered_labels

        return suspected

    def detect_poison_untrusted(self, **kwargs) -> Dict[int, float]:
        """
        Detect poison given no trusted validation data.

        :return: Dictionary where keys are suspected poisonous device indices and values are performance differences.
        """
        self.set_params(**kwargs)

        suspected = {}

        (
            train_data,
            valid_data,
            train_labels,
            valid_labels,
            train_prov,
            valid_prov,
        ) = train_test_split(self.x_train, self.y_train, self.p_train, test_size=self.pp_valid)

        train_segments = segment_by_class(train_data, train_prov, self.num_devices)
        valid_segments = segment_by_class(valid_data, valid_prov, self.num_devices)

        for device_idx, (train_segment, valid_segment) in enumerate(zip(train_segments, valid_segments)):
            filtered_data, filtered_labels = self.filter_input(train_data, train_labels, train_segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(train_data, train_labels)
            filtered_model.fit(filtered_data, filtered_labels)

            valid_non_device_data, valid_non_device_labels = self.filter_input(valid_data, valid_labels, valid_segment)

            var_w = performance_diff(
                filtered_model,
                unfiltered_model,
                valid_non_device_data,
                valid_non_device_labels,
                perf_function=self.perf_func,
            )

            if self.eps < var_w:
                suspected[device_idx] = var_w
                train_data = filtered_data
                train_labels = filtered_labels
                valid_data = valid_non_device_data
                valid_labels = valid_non_device_labels

        return suspected

    @staticmethod
    def filter_input(data: np.ndarray, labels: np.ndarray, segment: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return the data and labels that are not part of a specified segment.

        :param data: The data to segment.
        :param labels: The corresponding labels to segment.
        :param segment: The segment of data points to exclude (e.g., the data points from a single device).
        :return: Tuple of (filtered_data, filtered_labels).
        """
        filter_mask = np.array(
            [np.isin(data[i, :], segment, invert=True).any() for i in range(data.shape[0])]
        )
        filtered_data = data[filter_mask]
        filtered_labels = labels[filter_mask]

        return filtered_data, filtered_labels

    def _check_params(self) -> None:
        if self.eps < 0:
            raise ValueError("Value of epsilon must be at least 0.")

        if self.pp_valid < 0:
            raise ValueError("Value of pp_valid must be at least 0.")

        if len(self.x_train) != len(self.y_train):
            raise ValueError("x_train and y_train do not match in shape.")

        if len(self.x_train) != len(self.p_train):
            raise ValueError("Provenance features do not match data.")