def __init__(self, classifier, x_train, y_train):
    """
    Build an :class:`.ActivationDefence` around the given classifier and its training data.

    :param classifier: model evaluated for poison
    :type classifier: :class:`.Classifier`
    :param x_train: dataset used to train the classifier.
    :type x_train: `np.ndarray`
    :param y_train: labels used to train the classifier.
    :type y_train: `np.ndarray`
    """
    super(ActivationDefence, self).__init__(classifier, x_train, y_train)

    # Default defence parameters, applied through `set_params`.
    defaults = dict(
        nb_clusters=2,
        clustering_method="KMeans",
        nb_dims=10,
        reduce='PCA',
        cluster_analysis="smaller",
    )
    self.set_params(**defaults)

    self.evaluator = GroundTruthEvaluator()

    # Per-class intermediate results, filled in lazily by the defence methods.
    self.activations_by_class = []
    self.red_activations_by_class = []  # Activations reduced by class
    self.clusters_by_class = []
    self.assigned_clean_by_class = []
    self.is_clean_by_class = []
    self.errors_by_class = []

    # Overall results.
    self.is_clean_lst = []
    self.confidence_level = []
    self.poisonous_clusters = []
def __init__(self, classifier, x_train, y_train, verbose=True):
    """
    Build an ActivationDefence object around the given classifier and its training data.

    :param classifier: model evaluated for poison
    :type classifier: :class:`Classifier`
    :param x_train: dataset used to train `classifier`
    :type x_train: :class:`numpy.ndarray`
    :param y_train: labels used to train `classifier`
    :type y_train: :class:`numpy.ndarray`
    :param verbose: When True prints more information
    :type verbose: `bool`
    """
    super(ActivationDefence, self).__init__(classifier, x_train, y_train, verbose)

    # Default defence parameters, applied through `set_params`.
    defaults = dict(
        n_clusters=2,
        clustering_method="KMeans",
        ndims=10,
        reduce='PCA',
        cluster_analysis="smaller",
    )
    self.set_params(**defaults)

    self.evaluator = GroundTruthEvaluator()

    # Per-class intermediate results, filled in lazily by the defence methods.
    self.activations_by_class = []
    self.red_activations_by_class = []  # Activations reduced by class
    self.clusters_by_class = []
    self.assigned_clean_by_class = []
    self.is_clean_by_class = []
    self.errors_by_class = []

    # Overall results.
    self.is_clean_lst = []
    self.confidence_level = []
def __init__(self, classifier, x_train, y_train, **kwargs):
    """
    Build a :class:`.SpectralSignatureDefense` around the given classifier and its training data.

    :param classifier: Model evaluated for poison.
    :param x_train: dataset used to train the classifier.
    :param y_train: labels used to train the classifier.
    :param kwargs: defence parameters forwarded to `set_params`.
    """
    base = super(SpectralSignatureDefense, self)
    base.__init__(classifier, x_train, y_train)

    # Parameters are applied before the evaluator is attached, as in the original order.
    self.set_params(**kwargs)
    self.evaluator = GroundTruthEvaluator()
def __init__(self, classifier, x_train, y_train, x_val, y_val, perf_func='accuracy', pp_cal=0.2, pp_quiz=0.2,
             calibrated=True, eps=0.1, **kwargs):
    """
    Build a :class:`.RONIDefense` around the given classifier, its training data and a trusted set.

    :param classifier: Model evaluated for poison.
    :type classifier: :class:`art.classifiers.Classifier`
    :param x_train: dataset used to train the classifier.
    :type x_train: `np.ndarray`
    :param y_train: labels used to train the classifier.
    :type y_train: `np.ndarray`
    :param x_val: trusted data points
    :type x_val: `np.ndarray`
    :param y_val: trusted data labels
    :type y_val: `np.ndarray`
    :param perf_func: performance function to use
    :type perf_func: `str` or `callable`
    :param pp_cal: percent of training data used for calibration
    :type pp_cal: `float`
    :param pp_quiz: percent of training data used for quiz set
    :type pp_quiz: `float`
    :param calibrated: True if using the calibrated form of RONI
    :type calibrated: `bool`
    :param eps: performance threshold if using uncalibrated RONI
    :type eps: `float`
    :param kwargs: extra defence parameters forwarded to `set_params`.
    """
    super(RONIDefense, self).__init__(classifier, x_train, y_train)

    # Quiz set: indices are drawn with `np.random.randint`, so a sample may appear more than once.
    nb_train = len(x_train)
    quiz_size = int(pp_quiz * nb_train)
    quiz_selection = np.random.randint(nb_train, size=quiz_size)

    self.calibrated = calibrated
    self.x_quiz = np.copy(self.x_train[quiz_selection])
    self.y_quiz = np.copy(self.y_train[quiz_selection])

    if self.calibrated:
        # Only the held-out ("test") halves of the split are kept as the calibration set.
        split = train_test_split(self.x_train, self.y_train, test_size=pp_cal, shuffle=True)
        self.x_cal = split[1]
        self.y_cal = split[3]

    self.eps = eps
    self.evaluator = GroundTruthEvaluator()
    self.x_val = x_val
    self.y_val = y_val
    self.perf_func = perf_func
    self.is_clean_lst = []

    # Applied last so that explicitly supplied parameters are handled after the attributes above exist.
    self.set_params(**kwargs)
def setUp(self):
    """Create the evaluator and per-class ground-truth fixtures shared by the tests."""
    self.evaluator = GroundTruthEvaluator()
    self.n_classes = 3
    self.n_dp = 10
    self.n_dp_mix = 5

    # One independent list per class, so a test mutating one class cannot affect another.
    class_ids = range(self.n_classes)
    self.is_clean_all_clean = [[1] * self.n_dp for _ in class_ids]
    self.is_clean_all_poison = [[0] * self.n_dp for _ in class_ids]
    self.is_clean_mixed = [[1, 0, 0, 1, 0, 1, 1, 1, 0, 0] for _ in class_ids]
    # Element-wise complement of `is_clean_mixed`.
    self.is_clean_comp_mix = [[0, 1, 1, 0, 1, 0, 0, 0, 1, 1] for _ in class_ids]
def __init__(self, classifier, x_train, y_train, p_train, x_val=None, y_val=None, eps=0.2, perf_func='accuracy',
             pp_valid=0.2, **kwargs):
    """
    Build a :class:`.ProvenanceDefense` around the given classifier, training data and provenance features.

    :param classifier: Model evaluated for poison.
    :type classifier: :class:`art.classifiers.Classifier`
    :param x_train: dataset used to train the classifier.
    :type x_train: `np.ndarray`
    :param y_train: labels used to train the classifier.
    :type y_train: `np.ndarray`
    :param p_train: provenance features for each training data point as one hot vectors
    :type p_train: `np.ndarray`
    :param x_val: validation data for defense (optional)
    :type x_val: `np.ndarray`
    :param y_val: validation labels for defense (optional)
    :type y_val: `np.ndarray`
    :param eps: threshold for performance shift in suspicious data
    :type eps: `float`
    :param perf_func: performance function used to evaluate effectiveness of defense
    :type perf_func: `str` or `callable`
    :param pp_valid: The percent of training data to use as validation data (for defense without validation data)
    :type pp_valid: `float`
    :param kwargs: extra defence parameters forwarded to `set_params`.
    """
    super(ProvenanceDefense, self).__init__(classifier, x_train, y_train)

    # Provenance information; the number of devices is the width of `p_train`.
    self.p_train = p_train
    self.num_devices = self.p_train.shape[1]

    # Configuration.
    self.x_val, self.y_val = x_val, y_val
    self.eps = eps
    self.perf_func = perf_func
    self.pp_valid = pp_valid

    # Result containers.
    self.assigned_clean_by_device = []
    self.is_clean_by_device = []
    self.errors_by_device = []
    self.evaluator = GroundTruthEvaluator()
    self.is_clean_lst = []

    self.set_params(**kwargs)
def setUpClass(cls):
    """Create the evaluator and per-class ground-truth fixtures once, as class attributes."""
    cls.evaluator = GroundTruthEvaluator()
    cls.n_classes = 3
    cls.n_dp = 10
    cls.n_dp_mix = 5

    # One independent list per class.
    class_ids = range(cls.n_classes)
    cls.is_clean_all_clean = [[1] * cls.n_dp for _ in class_ids]
    cls.is_clean_all_poison = [[0] * cls.n_dp for _ in class_ids]
    cls.is_clean_mixed = [[1, 0, 0, 1, 0, 1, 1, 1, 0, 0] for _ in class_ids]
    # Element-wise complement of `is_clean_mixed`.
    cls.is_clean_comp_mix = [[0, 1, 1, 0, 1, 0, 0, 0, 1, 1] for _ in class_ids]
class ActivationDefence(PoisonFilteringDefence):
    """
    Method from [Chen et al., 2018] performing poisoning detection based on activations clustering.
    Paper link: https://arxiv.org/abs/1811.03728
    """
    defence_params = ['nb_clusters', 'clustering_method', 'nb_dims', 'reduce', 'cluster_analysis']
    valid_clustering = ['KMeans']
    valid_reduce = ['PCA', 'FastICA', 'TSNE']
    valid_analysis = ['smaller', 'distance', 'relative-size', 'silhouette-scores']
    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to print a warning when activations are not enough

    def __init__(self, classifier, x_train, y_train):
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: model evaluated for poison
        :type classifier: :class:`.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train)
        kwargs = {'nb_clusters': 2, 'clustering_method': "KMeans", 'nb_dims': 10, 'reduce': 'PCA',
                  'cluster_analysis': "smaller"}
        self.set_params(**kwargs)
        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []
        self.poisonous_clusters = []

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous
        :type is_clean: `list` or `np.ndarray`
        :param kwargs: a dictionary of defence-specific parameters
        :type kwargs: `dict`
        :return: JSON object with confusion matrix
        :rtype: `jsonObject`
        :raises ValueError: if `is_clean` is `None` or empty.
        """
        # `not is_clean` would raise for a multi-element NumPy array (ambiguous truth value),
        # so test for "missing or empty" explicitly.
        if is_clean is None or len(is_clean) == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        self.set_params(**kwargs)
        self._ensure_activations()

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        _, self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(self.assigned_clean_by_class,
                                                                                    self.is_clean_by_class)
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: a dictionary of detection-specific parameters
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                 where a report is a json object that contains information specified by the clustering analysis
                 technique.
                 where is_clean is a list, where is_clean_lst[i]=1 means that x_train[i]
                 there is clean and is_clean_lst[i]=0, means that x_train[i] was classified as poison
        :rtype: `tuple`
        """
        self.set_params(**kwargs)
        self._ensure_activations()

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        report, self.assigned_clean_by_class = self.analyze_clusters()

        # Here, assigned_clean_by_class[i][j] is 1 if the jth datapoint in the ith class was
        # determined to be clean by activation cluster

        # Build an array that matches the original indexes of x_train
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train), self.y_train)
        self.is_clean_lst = [0] * n_train

        for assigned_clean, dp in zip(self.assigned_clean_by_class, indices_by_class):
            for assignment, index_dp in zip(assigned_clean, dp):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return report, self.is_clean_lst

    def cluster_activations(self, **kwargs):
        """
        Clusters activations and returns cluster_by_class and red_activations_by_class,
        where cluster_by_class[i][j] is the cluster to which the j-th datapoint in the
        ith class belongs and the correspondent activations reduced by class
        red_activations_by_class[i][j]

        :param kwargs: a dictionary of cluster-specific parameters
        :type kwargs: `dict`
        :return: clusters per class and activations by class
        :rtype: `tuple`
        """
        self.set_params(**kwargs)
        self._ensure_activations()

        # Inside a method body this name resolves to the module-level `cluster_activations` function.
        [self.clusters_by_class, self.red_activations_by_class] = cluster_activations(
            self.activations_by_class, nb_clusters=self.nb_clusters, nb_dims=self.nb_dims, reduce=self.reduce,
            clustering_method=self.clustering_method)

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs):
        """
        This function analyzes the clusters according to the provided method

        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: (report, assigned_clean_by_class), where the report is a json object and assigned_clean_by_class
                 is an array of arrays that contains what data points where classified as clean.
        :rtype: `tuple(json, np.ndarray)`
        :raises ValueError: if the configured cluster analysis technique is not supported.
        """
        import json

        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        analyzer = ClusteringAnalyzer()

        if self.cluster_analysis == 'smaller':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_size(self.clusters_by_class)
        elif self.cluster_analysis == 'relative-size':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_relative_size(self.clusters_by_class)
        elif self.cluster_analysis == 'distance':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_distance(self.clusters_by_class,
                                               separated_activations=self.red_activations_by_class)
        elif self.cluster_analysis == 'silhouette-scores':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_silhouette_score(self.clusters_by_class,
                                                       reduced_activations_by_class=self.red_activations_by_class)
        else:
            raise ValueError(
                "Unsupported cluster analysis technique " + self.cluster_analysis)

        # Add to the report current parameters used to run the defence and the analysis summary.
        # On a key clash the defence parameter wins, as it is merged last.
        report = dict(report)
        report.update(self.get_params())
        jreport = json.dumps(report)

        return jreport, self.assigned_clean_by_class

    def visualize_clusters(self, x_raw, save=True, folder='.', **kwargs):
        """
        This function creates the sprite/mosaic visualization for clusters. When save=True,
        it also stores a sprite (mosaic) per cluster in DATA_PATH.

        :param x_raw: Images used to train the classifier (before pre-processing)
        :type x_raw: `np.ndarray`
        :param save: Boolean specifying if image should be saved
        :type save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: sprites_by_class: Array with sprite images sprites_by_class, where sprites_by_class[i][j] contains
                 the sprite of class i cluster j.
        :rtype: sprites_by_class: `np.ndarray`
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        x_raw_by_class = self._segment_by_class(x_raw, self.y_train)
        x_raw_by_cluster = [[[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)]

        # Get all data in x_raw in the right cluster
        for n_class, cluster in enumerate(self.clusters_by_class):
            for j, assigned_cluster in enumerate(cluster):
                x_raw_by_cluster[n_class][assigned_cluster].append(x_raw_by_class[n_class][j])

        # Now create sprites:
        sprites_by_class = [[[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)]
        for i, class_i in enumerate(x_raw_by_cluster):
            for j, images_cluster in enumerate(class_i):
                title = 'Class_' + str(i) + '_cluster_' + str(j) + '_clusterSize_' + str(len(images_cluster))
                f_name = title + '.png'
                f_name = os.path.join(folder, f_name)
                sprite = create_sprite(images_cluster)
                if save:
                    save_image(sprite, f_name)
                sprites_by_class[i][j] = sprite

        return sprites_by_class

    def plot_clusters(self, save=True, folder='.', **kwargs):
        """
        Creates a 3D-plot to visualize each cluster each cluster is assigned a different color in the plot.
        When save=True, it also stores the 3D-plot per cluster in DATA_PATH.

        :param save: Boolean specifying if image should be saved
        :type save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: None
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        # Get activations reduced to 3-components:
        separated_reduced_activations = [reduce_dimensionality(ac, nb_dims=3) for ac in self.activations_by_class]

        # For each class generate a plot:
        for class_id, (labels, coordinates) in enumerate(zip(self.clusters_by_class, separated_reduced_activations)):
            f_name = ''
            if save:
                f_name = os.path.join(folder, 'plot_class_' + str(class_id) + '.png')
            plot_3d(coordinates, labels, save=save, f_name=f_name)

    def set_params(self, **kwargs):
        """
        Take in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :param nb_clusters: Number of clusters to be produced. Must be at least 2.
        :type nb_clusters: `int`
        :param clustering_method: Clustering method to use
        :type clustering_method: `str`
        :param nb_dims: Number of dimensions to project on
        :type nb_dims: `int`
        :param reduce: Reduction technique
        :type reduce: `str`
        :param cluster_analysis: Method to analyze the clusters
        :type cluster_analysis: `str`
        :return: `True` when all checks pass.
        :rtype: `bool`
        :raises ValueError: if any parameter fails its check.
        """
        # Save defence-specific parameters
        super(ActivationDefence, self).set_params(**kwargs)

        if self.nb_clusters <= 1:
            raise ValueError(
                "Wrong number of clusters, should be greater or equal to 2. Provided: " + str(self.nb_clusters))
        if self.nb_dims <= 0:
            raise ValueError("Wrong number of dimensions ")
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " + self.clustering_method)
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + self.reduce)
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError("Unsupported method for cluster analysis method: " + self.cluster_analysis)

        return True

    def _ensure_activations(self):
        """
        Compute and cache `activations_by_class` if it has not been computed yet.
        """
        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

    def _get_activations(self):
        """
        Find activations from :class:`.Classifier`.

        :return: activations of the last layer listed in `classifier.layer_names` for `x_train`.
        """
        logger.info('Getting activations')

        nb_layers = len(self.classifier.layer_names)
        activations = self.classifier.get_activations(self.x_train, layer=nb_layers - 1)

        # Wrong way to get activations: self.classifier.predict(self.x_train, logits=True)
        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            logger.warning("Number of activations in last hidden layer is too small. Method may not work properly. "
                           "Size: %s", str(nodes_last_layer))
        return activations

    def _segment_by_class(self, data, features):
        """
        Returns segmented data according to specified features.

        :param data: to be segmented
        :type data: `np.ndarray`
        :param features: features used to segment data, e.g., segment according to predicted label or to `y_train`
        :type features: `np.ndarray`
        :return: segmented data according to specified features.
        :rtype: `list`
        """
        n_classes = self.classifier.nb_classes
        by_class = [[] for _ in range(n_classes)]
        for indx, feature in enumerate(features):
            # With more than two classes the feature is reduced with argmax;
            # otherwise it is converted to an int and used as the class index directly.
            if n_classes > 2:
                assigned = np.argmax(feature)
            else:
                assigned = int(feature)
            by_class[assigned].append(data[indx])

        return [np.asarray(i) for i in by_class]
class TestGroundTruth(unittest.TestCase):
    """
    Tests for `GroundTruthEvaluator.analyze_correctness`.

    Error codes checked in `errors_by_class`, as asserted by the tests below:
    0 = marked poison, is poison; 1 = marked clean, is clean;
    2 = marked poison, is clean; 3 = marked clean, is poison.
    """

    def setUp(self):
        self.evaluator = GroundTruthEvaluator()
        self.n_classes = 3
        self.n_dp = 10
        self.n_dp_mix = 5
        self.is_clean_all_clean = [[] for _ in range(self.n_classes)]
        self.is_clean_all_poison = [[] for _ in range(self.n_classes)]
        self.is_clean_mixed = [[] for _ in range(self.n_classes)]
        self.is_clean_comp_mix = [[] for _ in range(self.n_classes)]

        for i in range(self.n_classes):
            self.is_clean_all_clean[i] = [1] * self.n_dp
            self.is_clean_all_poison[i] = [0] * self.n_dp
            self.is_clean_mixed[i] = [1, 0, 0, 1, 0, 1, 1, 1, 0, 0]
            self.is_clean_comp_mix[i] = [0, 1, 1, 0, 1, 0, 0, 0, 1, 1]

    def _analyze(self, assigned_clean_by_class, is_clean_by_class):
        """Run `analyze_correctness`, check the per-class result counts and return (errors, parsed JSON)."""
        errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(assigned_clean_by_class,
                                                                               is_clean_by_class)
        json_object = json.loads(conf_matrix_json)
        self.assertEqual(len(json_object.keys()), self.n_classes)
        self.assertEqual(len(errors_by_class), self.n_classes)
        return errors_by_class, json_object

    def _assert_confusion(self, res_class_i, expected):
        """Check one class entry against `expected`: metric name -> (rate, numerator, denominator)."""
        for metric, (rate, numerator, denominator) in expected.items():
            self.assertEqual(res_class_i[metric]['rate'], rate)
            self.assertEqual(res_class_i[metric]['numerator'], numerator)
            self.assertEqual(res_class_i[metric]['denominator'], denominator)

    def test_analyze_correct_all_clean(self):
        # perfect detection all data is actually clean:
        errors_by_class, json_object = self._analyze(self.is_clean_all_clean, self.is_clean_all_clean)
        expected = {
            'TruePositive': ('N/A', 0, 0),
            'TrueNegative': (100, self.n_dp, self.n_dp),
            'FalseNegative': ('N/A', 0, 0),
            'FalsePositive': (0, 0, self.n_dp),
        }

        for i in range(self.n_classes):
            self._assert_confusion(json_object['class_' + str(i)], expected)

            # all errors_by_class should be 1 (errors_by_class[i] = 1 if marked clean, is clean)
            for item in errors_by_class[i]:
                self.assertEqual(item, 1)

    def test_analyze_correct_all_poison(self):
        # perfect detection all data is actually poison
        errors_by_class, json_object = self._analyze(self.is_clean_all_poison, self.is_clean_all_poison)
        expected = {
            'TruePositive': (100, self.n_dp, self.n_dp),
            'TrueNegative': ('N/A', 0, 0),
            'FalseNegative': (0, 0, self.n_dp),
            'FalsePositive': ('N/A', 0, 0),
        }

        for i in range(self.n_classes):
            self._assert_confusion(json_object['class_' + str(i)], expected)

            # all errors_by_class should be 0 (errors_by_class[i] = 0 if marked poison, is poison)
            for item in errors_by_class[i]:
                self.assertEqual(item, 0)

    def test_analyze_correct_mixed(self):
        # perfect detection mixed
        errors_by_class, json_object = self._analyze(self.is_clean_mixed, self.is_clean_mixed)
        expected = {
            'TruePositive': (100, self.n_dp_mix, self.n_dp_mix),
            'TrueNegative': (100, self.n_dp_mix, self.n_dp_mix),
            'FalseNegative': (0, 0, self.n_dp_mix),
            'FalsePositive': (0, 0, self.n_dp_mix),
        }

        for i in range(self.n_classes):
            self._assert_confusion(json_object['class_' + str(i)], expected)

            # with perfect detection each error code equals the ground truth
            # (1 = marked clean, is clean; 0 = marked poison, is poison)
            for j, item in enumerate(errors_by_class[i]):
                self.assertEqual(item, self.is_clean_mixed[i][j])

    def test_analyze_fully_misclassified(self):
        # Completely wrong
        # order parameters: analyze_correctness(assigned_clean_by_class, is_clean_by_class)
        errors_by_class, json_object = self._analyze(self.is_clean_all_clean, self.is_clean_all_poison)
        expected = {
            'TruePositive': (0, 0, self.n_dp),
            'TrueNegative': ('N/A', 0, 0),
            'FalseNegative': (100, self.n_dp, self.n_dp),
            'FalsePositive': ('N/A', 0, 0),
        }

        for i in range(self.n_classes):
            self._assert_confusion(json_object['class_' + str(i)], expected)

            # all errors_by_class should be 3 (errors_by_class[i] = 3 if marked clean, is poison)
            for item in errors_by_class[i]:
                self.assertEqual(item, 3)

    def test_analyze_fully_misclassified_rev(self):
        # Completely wrong
        # order parameters: analyze_correctness(assigned_clean_by_class, is_clean_by_class)
        errors_by_class, json_object = self._analyze(self.is_clean_all_poison, self.is_clean_all_clean)
        expected = {
            'TruePositive': ('N/A', 0, 0),
            'TrueNegative': (0, 0, self.n_dp),
            'FalseNegative': ('N/A', 0, 0),
            'FalsePositive': (100, self.n_dp, self.n_dp),
        }

        for i in range(self.n_classes):
            self._assert_confusion(json_object['class_' + str(i)], expected)

            # all errors_by_class should be 2 (errors_by_class[i] = 2 if marked poison, is clean)
            for item in errors_by_class[i]:
                self.assertEqual(item, 2)
class SpectralSignatureDefense(PoisonFilteringDefence):
    """
    Method from Tran et al., 2018 performing poisoning detection based on Spectral Signatures
    """
    defence_params = PoisonFilteringDefence.defence_params + [
        "classifier",
        "x_train",
        "y_train",
        "batch_size",
        "eps_multiplier",
        "ub_pct_poison",
    ]

    def __init__(self, classifier, x_train, y_train, **kwargs):
        """
        Create an :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: dataset used to train the classifier.
        :param y_train: labels used to train the classifier.
        :param kwargs: defence parameters forwarded to `set_params`.
        """
        super(SpectralSignatureDefense, self).__init__(classifier, x_train, y_train)
        self.set_params(**kwargs)
        self.evaluator = GroundTruthEvaluator()

    def evaluate_defence(self, is_clean, **kwargs):
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        :raises ValueError: if `is_clean` is `None` or empty.
        """
        # `np.size` accepts plain sequences as well as arrays, unlike the `.size` attribute.
        if is_clean is None or np.size(is_clean) == 0:
            raise ValueError(
                "is_clean was not provided while invoking evaluate_defence.")

        n_classes = self.classifier.nb_classes()
        is_clean_by_class = SpectralSignatureDefense.split_by_class(
            is_clean, self.y_train, n_classes)
        _, predicted_clean = self.detect_poison()
        predicted_clean_by_class = SpectralSignatureDefense.split_by_class(
            predicted_clean, self.y_train, n_classes)

        _, conf_matrix_json = self.evaluator.analyze_correctness(
            predicted_clean_by_class, is_clean_by_class)

        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters, forwarded to `set_params`.
        :return: (report, is_clean_lst):
                 where a report is None (for future ART compatibility)
                 where is_clean is a list, where is_clean_lst[i]=1 means that x_train[i]
                 there is clean and is_clean_lst[i]=0, means that x_train[i] was classified as poison.
        """
        self.set_params(**kwargs)

        n_classes = self.classifier.nb_classes()
        nb_layers = len(self.classifier.layer_names)
        features_x_poisoned = self.classifier.get_activations(
            self.x_train, layer=nb_layers - 1, batch_size=self.batch_size)

        features_split = SpectralSignatureDefense.split_by_class(
            features_x_poisoned, self.y_train, n_classes)

        # Quantile above which a score is treated as an outlier; identical for every class.
        quantile_level = max(1 - self.eps_multiplier * self.ub_pct_poison, 0.0)

        keep_by_class = []
        for feature in features_split:
            if len(feature) == 0:
                # No sample carries this label: nothing to score (SVD/quantile would fail on empty input).
                keep_by_class.append(np.zeros((0, 1), dtype=bool))
                continue
            score = SpectralSignatureDefense.spectral_signature_scores(feature)
            score_cutoff = np.quantile(score, quantile_level)
            keep_by_class.append(score < score_cutoff)

        # Use the classifier's class count here as well (previously hard-coded to 10).
        base_indices_by_class = SpectralSignatureDefense.split_by_class(
            np.arange(self.y_train.shape[0]), self.y_train, n_classes)

        # `np.int` was only an alias of the builtin `int` and no longer exists in recent NumPy.
        is_clean_lst = np.zeros_like(self.y_train, dtype=int)

        for keep_booleans, indices in zip(keep_by_class, base_indices_by_class):
            for keep_boolean, idx in zip(keep_booleans, indices):
                if keep_boolean:
                    is_clean_lst[idx] = 1

        return None, is_clean_lst

    @staticmethod
    def spectral_signature_scores(R):
        """
        :param R: Matrix of feature representations
        :return: Outlier scores for each observation based on spectral signature
        """
        M = R - np.mean(R, axis=0)
        # Following Algorithm #1, use SVD of centered features, not of covariance
        _, _, v = np.linalg.svd(M, full_matrices=False)
        eigs = v[:1]  # first right singular vector, kept 2-D so the scores come out as a column
        score = np.matmul(M, np.transpose(eigs))**2
        return score

    @staticmethod
    def split_by_class(data, labels, num_classes):
        """
        :param data: Iterable of features
        :param labels: Labels, not in one-hot representations
        :param num_classes: Number of classes of labels
        :return: List of numpy arrays of features split by labels
        """
        split = [[] for _ in range(num_classes)]
        for idx, label in enumerate(labels):
            split[int(label)].append(data[idx])
        return [np.asarray(dat) for dat in split]

    def set_params(self, **kwargs):
        """
        Take in a dictionary of parameters and applies defense-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :return: `True` once the parameters have been saved.
        """
        # Save defence-specific parameters
        super(SpectralSignatureDefense, self).set_params(**kwargs)

        return True
class ActivationDefence(PoisonFilteringDefence):
    """
    Method from [Chen et al., 2018] performing poisoning detection based on activations clustering.

    Paper link: https://arxiv.org/abs/1811.03728
    """
    defence_params = ['nb_clusters', 'clustering_method', 'nb_dims', 'reduce', 'cluster_analysis']
    valid_clustering = ['KMeans']
    valid_reduce = ['PCA', 'FastICA', 'TSNE']
    valid_analysis = ['smaller', 'distance', 'relative-size', 'silhouette-scores']

    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to print a warning when activations are not enough

    def __init__(self, classifier, x_train, y_train):
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train)

        # Default defence parameters; they are validated by `set_params`.
        kwargs = {
            'nb_clusters': 2,
            'clustering_method': "KMeans",
            'nb_dims': 10,
            'reduce': 'PCA',
            'cluster_analysis': "smaller"
        }
        self.set_params(**kwargs)

        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []
        self.poisonous_clusters = []

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :type is_clean: :class `np.ndarray`
        :param kwargs: A dictionary of defence-specific parameters.
        :type kwargs: `dict`
        :return: JSON object with confusion matrix.
        :rtype: `jsonObject`
        :raises ValueError: If `is_clean` is `None` or empty.
        """
        if is_clean is None or len(is_clean) == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        self.set_params(**kwargs)

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        _, self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_class, self.is_clean_by_class)
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                 where a report is a dict object that contains information specified by the clustering analysis
                 technique, and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        report, self.assigned_clean_by_class = self.analyze_clusters()

        # Here, assigned_clean_by_class[i][j] is 1 if the jth datapoint in the ith class was
        # determined to be clean by activation cluster

        # Build an array that matches the original indexes of x_train
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train), self.y_train)
        self.is_clean_lst = [0] * n_train

        for assigned_clean, indices in zip(self.assigned_clean_by_class, indices_by_class):
            for assignment, index_dp in zip(assigned_clean, indices):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return report, self.is_clean_lst

    def cluster_activations(self, **kwargs):
        """
        Clusters activations and returns cluster_by_class and red_activations_by_class, where cluster_by_class[i][j]
        is the cluster to which the j-th datapoint in the ith class belongs and the correspondent activations
        reduced by class red_activations_by_class[i][j].

        :param kwargs: A dictionary of cluster-specific parameters.
        :type kwargs: `dict`
        :return: Clusters per class and activations by class.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        # The bare name below is the module-level helper, not this method.
        self.clusters_by_class, self.red_activations_by_class = cluster_activations(
            self.activations_by_class,
            nb_clusters=self.nb_clusters,
            nb_dims=self.nb_dims,
            reduce=self.reduce,
            clustering_method=self.clustering_method)

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs):
        """
        This function analyzes the clusters according to the provided method.

        :param kwargs: A dictionary of cluster-analysis-specific parameters.
        :type kwargs: `dict`
        :return: (report, assigned_clean_by_class), where the report is a dict object and assigned_clean_by_class
                 is an array of arrays that contains what data points were classified as clean.
        :rtype: `tuple(dict, np.ndarray)`
        :raises ValueError: If `cluster_analysis` names an unsupported technique.
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        analyzer = ClusteringAnalyzer()

        if self.cluster_analysis == 'smaller':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_size(self.clusters_by_class)
        elif self.cluster_analysis == 'relative-size':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_relative_size(self.clusters_by_class)
        elif self.cluster_analysis == 'distance':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_distance(self.clusters_by_class,
                                               separated_activations=self.red_activations_by_class)
        elif self.cluster_analysis == 'silhouette-scores':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_silhouette_score(self.clusters_by_class,
                                                       reduced_activations_by_class=self.red_activations_by_class)
        else:
            raise ValueError("Unsupported cluster analysis technique " + self.cluster_analysis)

        # Add to the report current parameters used to run the defence and the analysis summary
        report = dict(list(report.items()) + list(self.get_params().items()))

        return report, self.assigned_clean_by_class

    @staticmethod
    def relabel_poison_ground_truth(classifier, x, y_fix, test_set_split=0.7, tolerable_backdoor=0.01,
                                    max_epochs=50, batch_epochs=10):
        """
        Revert poison attack by continue training the current classifier with `x`, `y_fix`. `test_set_split`
        determines the percentage in x that will be used as training set, while `1-test_set_split` determines how
        many data points to use for test set.

        :param classifier: Classifier to be fixed
        :type classifier: :class:`.Classifier`
        :param x: samples
        :type x: `np.ndarray`
        :param y_fix: true label of x_poison
        :type y_fix: `np.ndarray`
        :param test_set_split: this parameter determine how much data goes to the training set.
               Here `test_set_split*len(y_fix)` determines the number of data points in `x_train`
               and `(1-test_set_split) * len(y_fix)` the number of data points in `x_test`.
        :type test_set_split: `float`
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :type tolerable_backdoor: `float`
        :param max_epochs: Maximum number of epochs that the model will be trained
        :type max_epochs: `int`
        :param batch_epochs: Number of epochs to be trained before checking current state of model
        :type batch_epochs: `int`
        :return: (improve_factor, classifier); `improve_factor` is 0 and `classifier` is the restored original
                 when the training did not improve the model.
        :rtype: `float`, `.Classifier`
        """
        # Split data into testing and training:
        n_train = int(len(x) * test_set_split)
        x_train, x_test = x[:n_train], x[n_train:]
        y_train, y_test = y_fix[:n_train], y_fix[n_train:]

        import time

        # Keep a copy of the classifier on disk so it can be restored if training makes things worse.
        filename = 'original_classifier' + str(time.time()) + '.p'
        ActivationDefence._pickle_classifier(classifier, filename)

        # Now train using y_fix:
        improve_factor, _ = train_remove_backdoor(classifier, x_train, y_train, x_test, y_test,
                                                  tolerable_backdoor=tolerable_backdoor,
                                                  max_epochs=max_epochs,
                                                  batch_epochs=batch_epochs)

        # Only update classifier if there was an improvement:
        if improve_factor < 0:
            classifier = ActivationDefence._unpickle_classifier(filename)
            improve_factor = 0

        # The backup is removed on both outcomes so that no temporary pickle is left behind.
        ActivationDefence._remove_pickle(filename)
        return improve_factor, classifier

    @staticmethod
    def relabel_poison_cross_validation(classifier, x, y_fix, n_splits=10, tolerable_backdoor=0.01,
                                        max_epochs=50, batch_epochs=10):
        """
        Revert poison attack by continue training the current classifier with `x`, `y_fix`. `n_splits` determine
        the number of cross validation splits.

        :param classifier: Classifier to be fixed
        :type classifier: :class:`.Classifier`
        :param x: Samples that were miss-labeled.
        :type x: `np.ndarray`
        :param y_fix: True label of `x`.
        :type y_fix: `np.ndarray`
        :param n_splits: Determines how many splits to use in cross validation.
        :type n_splits: `int`
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :type tolerable_backdoor: `float`
        :param max_epochs: Maximum number of epochs that the model will be trained.
        :type max_epochs: `int`
        :param batch_epochs: Number of epochs to be trained before checking current state of model.
        :type batch_epochs: `int`
        :return: (improve_factor, classifier)
        :rtype: `float`, `.Classifier`
        """
        # Train using cross validation
        from sklearn.model_selection import KFold
        import time

        # Folds are not shuffled, matching the splitter that was actually used before.
        k_fold = KFold(n_splits=n_splits)

        filename = 'original_classifier' + str(time.time()) + '.p'
        ActivationDefence._pickle_classifier(classifier, filename)
        curr_improvement = 0

        for train_index, test_index in k_fold.split(x):
            # Obtain partition:
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y_fix[train_index], y_fix[test_index]

            # Unpickle original model so that every fold starts from the same state:
            curr_classifier = ActivationDefence._unpickle_classifier(filename)

            new_improvement, fixed_classifier = train_remove_backdoor(curr_classifier, x_train, y_train,
                                                                      x_test, y_test,
                                                                      tolerable_backdoor=tolerable_backdoor,
                                                                      max_epochs=max_epochs,
                                                                      batch_epochs=batch_epochs)
            if curr_improvement < new_improvement and new_improvement > 0:
                curr_improvement = new_improvement
                classifier = fixed_classifier
                logger.info('Selected as best model so far: ' + str(curr_improvement))

        ActivationDefence._remove_pickle(filename)
        return curr_improvement, classifier

    @staticmethod
    def _pickle_classifier(classifier, file_name):
        """
        Pickles the provided classifier and stores it using the provided file_name in folder `art.DATA_PATH`.

        :param classifier: Classifier to be pickled.
        :type classifier: :class:`.Classifier`
        :param file_name: Name of the file where the classifier will be pickled
        :type file_name: `str`
        :return: None
        """
        import pickle
        import os
        from art import DATA_PATH

        full_path = os.path.join(DATA_PATH, file_name)
        folder = os.path.split(full_path)[0]
        if not os.path.exists(folder):
            os.makedirs(folder)

        with open(full_path, 'wb') as f:
            pickle.dump(classifier, f)

    @staticmethod
    def _unpickle_classifier(file_name):
        """
        Unpickles classifier using the filename provided. Function assumes that the pickle is in `art.DATA_PATH`.

        :param file_name: File name without directory
        :type file_name: `str`
        :return: The classifier loaded from the pickle file.
        """
        import os
        from art import DATA_PATH
        import pickle

        full_path = os.path.join(DATA_PATH, file_name)
        logger.info('Loading classifier from ' + str(full_path))
        # NOTE(review): pickle.load executes arbitrary code; the file read here is expected to be one written
        # by `_pickle_classifier`.
        with open(full_path, 'rb') as f:
            loaded_classifier = pickle.load(f)
        return loaded_classifier

    @staticmethod
    def _remove_pickle(file_name):
        """
        Erases the pickle with the provided file name.

        :param file_name: File name without directory
        :type file_name: `str`
        :return: None
        """
        import os
        from art import DATA_PATH

        full_path = os.path.join(DATA_PATH, file_name)
        os.remove(full_path)

    def visualize_clusters(self, x_raw, save=True, folder='.', **kwargs):
        """
        This function creates the sprite/mosaic visualization for clusters. When save=True,
        it also stores a sprite (mosaic) per cluster in DATA_PATH.

        :param x_raw: Images used to train the classifier (before pre-processing)
        :type x_raw: `np.darray`
        :param save: Boolean specifying if image should be saved
        :type save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: sprites_by_class: Array with sprite images sprites_by_class, where sprites_by_class[i][j] contains
                 the sprite of class i cluster j.
        :rtype: sprites_by_class: `np.ndarray`
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        x_raw_by_class = self._segment_by_class(x_raw, self.y_train)
        x_raw_by_cluster = [[[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)]

        # Get all data in x_raw in the right cluster
        for n_class, cluster in enumerate(self.clusters_by_class):
            for j, assigned_cluster in enumerate(cluster):
                x_raw_by_cluster[n_class][assigned_cluster].append(x_raw_by_class[n_class][j])

        # Now create sprites:
        sprites_by_class = [[[] for _ in range(self.nb_clusters)] for _ in range(self.classifier.nb_classes)]
        for i, class_i in enumerate(x_raw_by_cluster):
            for j, images_cluster in enumerate(class_i):
                title = 'Class_' + str(i) + '_cluster_' + str(j) + '_clusterSize_' + str(len(images_cluster))
                f_name = title + '.png'
                f_name = os.path.join(folder, f_name)
                sprite = create_sprite(images_cluster)
                if save:
                    save_image(sprite, f_name)
                sprites_by_class[i][j] = sprite

        return sprites_by_class

    def plot_clusters(self, save=True, folder='.', **kwargs):
        """
        Creates a 3D-plot to visualize each cluster; each cluster is assigned a different color in the plot.
        When save=True, it also stores the 3D-plot per cluster in DATA_PATH.

        :param save: Boolean specifying if image should be saved
        :type save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: None
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        # Get activations reduced to 3-components:
        separated_reduced_activations = []
        for activation in self.activations_by_class:
            reduced_activations = reduce_dimensionality(activation, nb_dims=3)
            separated_reduced_activations.append(reduced_activations)

        # For each class generate a plot:
        for class_id, (labels, coordinates) in enumerate(zip(self.clusters_by_class,
                                                             separated_reduced_activations)):
            f_name = ''
            if save:
                f_name = os.path.join(folder, 'plot_class_' + str(class_id) + '.png')
            plot_3d(coordinates, labels, save=save, f_name=f_name)

    def set_params(self, **kwargs):
        """
        Take in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :param nb_clusters: Number of clusters to be produced. Should be greater than or equal to 2.
        :type nb_clusters: `int`
        :param clustering_method: Clustering method to use
        :type clustering_method: `str`
        :param nb_dims: Number of dimensions to project on
        :type nb_dims: `int`
        :param reduce: Reduction technique
        :type reduce: `str`
        :param cluster_analysis: Method to analyze the clusters
        :type cluster_analysis: `str`
        :return: `True` when all parameters pass validation.
        :rtype: `bool`
        :raises ValueError: If any parameter is outside its supported range or set of values.
        """
        # Save defence-specific parameters
        super(ActivationDefence, self).set_params(**kwargs)

        if self.nb_clusters <= 1:
            raise ValueError("Wrong number of clusters, should be greater or equal to 2. Provided: " +
                             str(self.nb_clusters))
        if self.nb_dims <= 0:
            raise ValueError("Wrong number of dimensions ")
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " + self.clustering_method)
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + self.reduce)
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError("Unsupported method for cluster analysis method: " + self.cluster_analysis)

        return True

    def _get_activations(self):
        """
        Find activations from :class:`.Classifier`.

        :return: Activations of the training data at the last entry of the classifier's `layer_names`.
        """
        logger.info('Getting activations')

        # Activations are read from the last listed layer; the output of `predict` is deliberately not used.
        nb_layers = len(self.classifier.layer_names)
        activations = self.classifier.get_activations(self.x_train, layer=nb_layers - 1)

        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            logger.warning("Number of activations in last hidden layer is too small. Method may not work properly. "
                           "Size: %s", str(nodes_last_layer))
        return activations

    def _segment_by_class(self, data, features):
        """
        Returns segmented data according to specified features.

        :param data: to be segmented
        :type data: `np.ndarray`
        :param features: features used to segment data, e.g., segment according to predicted label or to `y_train`
        :type features: `np.ndarray`
        :return: segmented data according to specified features.
        :rtype: `list`
        """
        n_classes = self.classifier.nb_classes
        by_class = [[] for _ in range(n_classes)]
        for indx, feature in enumerate(features):
            # With more than two classes the class index is the arg-max of the feature; otherwise the feature
            # itself is cast to the class index.
            if n_classes > 2:
                assigned = np.argmax(feature)
            else:
                assigned = int(feature)
            by_class[assigned].append(data[indx])

        return [np.asarray(i) for i in by_class]
class ProvenanceDefense(PoisonFilteringDefence):
    """
    Implements methods performing poisoning detection based on data provenance.

    | Paper link: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8473440
    """
    defence_params = [
        'classifier',
        'x_train',
        'y_train',
        'p_train',
        'x_val',
        'y_val',
        'eps',
        'perf_func',
        'pp_valid',
    ]

    def __init__(self, classifier, x_train, y_train, p_train, x_val=None, y_val=None, eps=0.2,
                 perf_func='accuracy', pp_valid=0.2, **kwargs):
        """
        Create an :class:`.ProvenanceDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`art.classifiers.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        :param p_train: provenance features for each training data point as one hot vectors
        :type p_train: `np.ndarray`
        :param x_val: validation data for defense (optional)
        :type x_val: `np.ndarray`
        :param y_val: validation labels for defense (optional)
        :type y_val: `np.ndarray`
        :param eps: threshold for performance shift in suspicious data
        :type eps: `float`
        :param perf_func: performance function used to evaluate effectiveness of defense
        :type perf_func: `str` or `callable`
        :param pp_valid: The percent of training data to use as validation data (for defense without validation
                         data)
        :type pp_valid: `float`
        """
        super(ProvenanceDefense, self).__init__(classifier, x_train, y_train)

        # Provenance information; the number of devices is the second dimension of `p_train`.
        self.p_train = p_train
        self.num_devices = self.p_train.shape[1]

        # Optional trusted validation data and detection settings.
        self.x_val = x_val
        self.y_val = y_val
        self.eps = eps
        self.perf_func = perf_func
        self.pp_valid = pp_valid

        # Results populated by the detection / evaluation methods.
        self.assigned_clean_by_device = []
        self.is_clean_by_device = []
        self.errors_by_device = []
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []

        self.set_params(**kwargs)

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :type is_clean: :class `np.ndarray`
        :param kwargs: A dictionary of defence-specific parameters.
        :type kwargs: `dict`
        :return: JSON object with confusion matrix.
        :rtype: `jsonObject`
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        self.set_params(**kwargs)

        # Run the detection first if no per-device assignment is available yet.
        if not self.assigned_clean_by_device:
            self.detect_poison()

        self.is_clean_by_device = segment_by_class(is_clean, self.p_train, self.num_devices)
        errors, conf_matrix_json = self.evaluator.analyze_correctness(self.assigned_clean_by_device,
                                                                      self.is_clean_by_device)
        self.errors_by_device = errors
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                 where a report is a dict object that contains information specified by the provenance detection
                 method, and is_clean_lst is an array where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        # Without validation data the untrusted variant is used.
        report = self.detect_poison_untrusted() if self.x_val is None else self.detect_poison_partially_trusted()

        n_train = len(self.x_train)
        indices_by_provenance = segment_by_class(np.arange(n_train), self.p_train, self.num_devices)

        # Everything starts as clean; all points of every reported device are then flagged.
        self.is_clean_lst = np.array([1] * n_train)
        for suspected_device in report:
            self.is_clean_lst[indices_by_provenance[suspected_device]] = 0

        self.assigned_clean_by_device = segment_by_class(np.array(self.is_clean_lst), self.p_train,
                                                         self.num_devices)

        return report, self.is_clean_lst

    def detect_poison_partially_trusted(self, **kwargs):
        """
        Detect poison given trusted validation data

        :return: dictionary where keys are suspected poisonous device indices and values are performance
                 differences
        :rtype: `dict`
        """
        self.set_params(**kwargs)

        if self.x_val is None or self.y_val is None:
            raise ValueError("Trusted data unavailable")

        suspected = {}
        current_data = np.copy(self.x_train)
        current_labels = np.copy(self.y_train)

        device_segments = segment_by_class(self.x_train, self.p_train, self.num_devices)
        for device_idx, device_segment in enumerate(device_segments):
            without_data, without_labels = self.filter_input(current_data, current_labels, device_segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(current_data, current_labels)
            filtered_model.fit(without_data, without_labels)

            var_w = performance_diff(filtered_model, unfiltered_model, self.x_val, self.y_val,
                                     perf_function=self.perf_func)

            # A shift above the threshold marks the device as suspicious and drops its data from later rounds.
            if self.eps < var_w:
                suspected[device_idx] = var_w
                current_data = without_data
                current_labels = without_labels

        return suspected

    def detect_poison_untrusted(self, **kwargs):
        """
        Detect poison given no trusted validation data

        :return: dictionary where keys are suspected poisonous device indices and values are performance
                 differences
        :rtype: `dict`
        """
        self.set_params(**kwargs)

        suspected = {}

        split = train_test_split(self.x_train, self.y_train, self.p_train, test_size=self.pp_valid)
        train_data, valid_data, train_labels, valid_labels, train_prov, valid_prov = split

        train_segments = segment_by_class(train_data, train_prov, self.num_devices)
        valid_segments = segment_by_class(valid_data, valid_prov, self.num_devices)

        for device_idx, segments in enumerate(zip(train_segments, valid_segments)):
            train_segment, valid_segment = segments
            filtered_data, filtered_labels = self.filter_input(train_data, train_labels, train_segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(train_data, train_labels)
            filtered_model.fit(filtered_data, filtered_labels)

            # Validation points of the current device are left out of the comparison.
            valid_non_device_data, valid_non_device_labels = self.filter_input(valid_data, valid_labels,
                                                                               valid_segment)
            var_w = performance_diff(filtered_model, unfiltered_model, valid_non_device_data,
                                     valid_non_device_labels, perf_function=self.perf_func)

            if self.eps < var_w:
                suspected[device_idx] = var_w
                train_data, train_labels = filtered_data, filtered_labels
                valid_data, valid_labels = valid_non_device_data, valid_non_device_labels

        return suspected

    @staticmethod
    def filter_input(data, labels, segment):
        """
        Return the data and labels that are not part of a specified segment

        :param data: The data to segment
        :type data: `np.ndarray`
        :param labels: The corresponding labels to segment
        :type labels: `np.ndarray`
        :param segment: The segment whose members are to be left out
        :return: tuple of (filtered_data, filtered_labels)
        :rtype: (`np.ndarray`, `np.ndarray`)
        """
        # A row is kept when at least one of its values does not occur in `segment`.
        keep_rows = []
        for row_idx in range(data.shape[0]):
            keep_rows.append(np.isin(data[row_idx, :], segment, invert=True).any())
        filter_mask = np.array(keep_rows)

        return data[filter_mask], labels[filter_mask]

    def set_params(self, **kwargs):
        """
        Take in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :return: `True` when all checks pass.
        :rtype: `bool`
        """
        # Save defence-specific parameters
        super(ProvenanceDefense, self).set_params(**kwargs)

        if self.eps < 0:
            raise ValueError("Value of epsilon must be at least 0")

        if self.pp_valid < 0:
            raise ValueError("Value of pp_valid must be at least 0")

        n_samples = len(self.x_train)
        if n_samples != len(self.y_train):
            raise ValueError("x_train and y_train do not match in shape")

        if n_samples != len(self.p_train):
            raise ValueError("Provenance features do not match data")

        return True
class RONIDefense(PoisonFilteringDefence):
    """
    Close implementation based on description in Nelson
    'Behavior of Machine Learning Algorithms in Adversarial Environments' Ch. 4.4

    | Textbook link: https://people.eecs.berkeley.edu/~adj/publications/paper-files/EECS-2010-140.pdf
    """
    defence_params = [
        'classifier',
        'x_train',
        'y_train',
        'x_val',
        'y_val',
        'perf_func',
        'calibrated',
        'eps',
    ]

    def __init__(self, classifier, x_train, y_train, x_val, y_val, perf_func='accuracy', pp_cal=0.2,
                 pp_quiz=0.2, calibrated=True, eps=0.1, **kwargs):
        """
        Create an :class:`.RONIDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`art.classifiers.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        :param x_val: trusted data points
        :type x_val: `np.ndarray`
        :param y_val: trusted data labels
        :type y_val: `np.ndarray`
        :param perf_func: performance function to use
        :type perf_func: `str` or `callable`
        :param pp_cal: percent of training data used for calibration
        :type pp_cal: `float`
        :param pp_quiz: percent of training data used for quiz set
        :type pp_quiz: `float`
        :param calibrated: True if using the calibrated form of RONI
        :type calibrated: `bool`
        :param eps: performance threshold if using uncalibrated RONI
        :type eps: `float`
        """
        super(RONIDefense, self).__init__(classifier, x_train, y_train)

        # Quiz set: indices are drawn at random from the training data.
        n_points = len(x_train)
        n_quiz = int(pp_quiz * n_points)
        quiz_idx = np.random.randint(n_points, size=n_quiz)
        self.calibrated = calibrated
        self.x_quiz = np.copy(self.x_train[quiz_idx])
        self.y_quiz = np.copy(self.y_train[quiz_idx])

        # The calibration set is only created for the calibrated variant.
        if self.calibrated:
            cal_split = train_test_split(self.x_train, self.y_train, test_size=pp_cal, shuffle=True)
            _, self.x_cal, _, self.y_cal = cal_split

        self.eps = eps
        self.evaluator = GroundTruthEvaluator()
        self.x_val = x_val
        self.y_val = y_val
        self.perf_func = perf_func
        self.is_clean_lst = list()
        self.set_params(**kwargs)

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :type is_clean: :class `np.ndarray`
        :param kwargs: A dictionary of defence-specific parameters.
        :type kwargs: `dict`
        :return: JSON object with confusion matrix.
        :rtype: `jsonObject`
        """
        self.set_params(**kwargs)

        # Detection has to have run before its result can be compared with the ground truth.
        if len(self.is_clean_lst) == 0:
            self.detect_poison()

        if is_clean is None or len(is_clean) != len(self.is_clean_lst):
            raise ValueError("Invalid value for is_clean.")

        analysis = self.evaluator.analyze_correctness([self.is_clean_lst], [is_clean])
        conf_matrix = analysis[1]
        return conf_matrix

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                 where report is a dict mapping the index of each flagged training point to its performance
                 shift, and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i] is clean and
                 is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        x_suspect, y_suspect = self.x_train, self.y_train
        x_trusted, y_trusted = self.x_val, self.y_val

        self.is_clean_lst = [1] * len(x_suspect)
        report = {}

        before_classifier = deepcopy(self.classifier)
        before_classifier.fit(x_suspect, y_suspect)

        # Training points are examined one at a time, in random order.
        for idx in np.random.permutation(len(x_suspect)):
            x_i = x_suspect[idx]
            y_i = y_suspect[idx]

            # Trusted data extended with the point under examination.
            x_extended = np.vstack([x_trusted, x_i])
            y_extended = np.vstack([y_trusted, y_i])

            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=x_extended, y=y_extended)
            acc_shift = performance_diff(before_classifier, after_classifier, self.x_quiz, self.y_quiz,
                                         perf_function=self.perf_func)

            if self.is_suspicious(before_classifier, acc_shift):
                self.is_clean_lst[idx] = 0
                report[idx] = acc_shift
                continue

            # The point is accepted: it joins the trusted data and the updated model becomes the baseline.
            before_classifier = after_classifier
            x_trusted = x_extended
            y_trusted = y_extended

        return report, self.is_clean_lst

    def is_suspicious(self, before_classifier, perf_shift):
        """
        Returns True if a given performance shift is suspicious

        :param before_classifier: The classifier without untrusted data
        :type before_classifier: `art.classifiers.classifier.Classifier`
        :param perf_shift: a shift in performance
        :type perf_shift: `float`
        :return: True if a given performance shift is suspicious. False otherwise.
        :rtype: `bool`
        """
        if not self.calibrated:
            return perf_shift < -self.eps

        # Calibrated variant: the threshold lies three standard deviations below the calibration median.
        median, std_dev = self.get_calibration_info(before_classifier)
        return perf_shift < median - 3 * std_dev

    def get_calibration_info(self, before_classifier):
        """
        Calculate the median and standard deviation of the accuracy shifts caused by the calibration set.

        :param before_classifier: The classifier trained without suspicious point
        :type before_classifier: `art.classifiers.classifier.Classifier`
        :return: a tuple consisting of (`median`, `std_dev`)
        :rtype: (`float`, `float`)
        """
        shifts = []
        for x_c, y_c in zip(self.x_cal, self.y_cal):
            # Each calibration point is added to the validation data on a fresh copy of the baseline model.
            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=np.vstack([self.x_val, x_c]), y=np.vstack([self.y_val, y_c]))
            shift = performance_diff(before_classifier, after_classifier, self.x_quiz, self.y_quiz,
                                     perf_function=self.perf_func)
            shifts.append(shift)

        return np.median(shifts), np.std(shifts)

    def set_params(self, **kwargs):
        """
        Take in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :return: `True` when all checks pass.
        :rtype: `bool`
        """
        super(RONIDefense, self).set_params(**kwargs)

        if len(self.x_train) != len(self.y_train):
            raise ValueError("x_train and y_train do not match shape")

        if self.eps < 0:
            raise ValueError("Value of epsilon must be at least 0")

        return True
class ActivationDefence(PoisonFilteringDefence):
    """
    Class performing Activation Analysis Defence.

    The activations of the classifier's last layer are computed for the training set, grouped by class and
    handed to a `ClusteringHandler` for dimensionality reduction and clustering. The resulting clusters are
    then analysed (by size or by distance) to decide which data points are clean.
    """
    defence_params = ['n_clusters', 'clustering_method', 'ndims', 'reduce', 'cluster_analysis']
    valid_clustering = ['KMeans']
    valid_reduce = ['PCA', 'FastICA', 'TSNE']
    valid_analysis = ['smaller', 'distance']
    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to print a warning when activations are not enough

    def __init__(self, classifier, x_train, y_train, verbose=True):
        """
        Create an ActivationDefence object with the provided classifier

        :param classifier: model evaluated for poison
        :type classifier: :class:`Classifier`
        :param x_train: dataset used to train `classifier`
        :type x_train: :class:`numpy.ndarray`
        :param y_train: labels used to train `classifier`
        :type y_train: :class:`numpy.ndarray`
        :param verbose: When True prints more information
        :type verbose: `bool`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train, verbose)
        kwargs = {
            'n_clusters': 2,
            'clustering_method': "KMeans",
            'ndims': 10,
            'reduce': 'PCA',
            'cluster_analysis': "smaller",
        }
        self.set_params(**kwargs)
        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Run the defence and compare its verdict with the ground truth. Returns confusion matrix.

        :param is_clean: ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous
        :type is_clean: :class `list`
        :param kwargs: a dictionary of defence-specific parameters
        :type kwargs: `dict`
        :return: JSON object with confusion matrix, as produced by the evaluator
        """
        self.set_params(**kwargs)
        self._ensure_activations_by_class()

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth: segment it the same way the activations were segmented
        self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_class, self.is_clean_by_class, verbose=self.verbose)
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected.

        :param kwargs: a dictionary of detection-specific parameters
        :type kwargs: `dict`
        :return: 1) confidence_level, a list holding the value 1 for every training point,
                 2) is_clean_lst : type List[int], where is_clean_lst[i]=1 means that x_train[i]
                 there is clean and is_clean_lst[i]=0, means that x_train[i] was classified as poison
        :rtype: `tuple`
        """
        self.set_params(**kwargs)
        self._ensure_activations_by_class()

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        self.assigned_clean_by_class = self.analyze_clusters()

        # Here, assigned_clean_by_class[i][j] is 1 if the jth datapoint in the ith class was
        # determined to be clean by activation cluster

        # Build an array that matches the original indexes of x_train: segmenting the positions
        # 0..n-1 with the same labels tells us where each per-class entry came from.
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train), self.y_train)
        self.is_clean_lst = [0] * n_train
        self.confidence_level = [1] * n_train
        for assigned_clean, indices in zip(self.assigned_clean_by_class, indices_by_class):
            for assignment, index_dp in zip(assigned_clean, indices):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return self.confidence_level, self.is_clean_lst

    def cluster_activations(self, **kwargs):
        """
        Clusters activations and returns cluster_by_class and red_activations_by_class,
        where cluster_by_class[i][j] is the cluster to which the j-th datapoint in the
        ith class belongs and the correspondent activations reduced by class
        red_activations_by_class[i][j]

        :param kwargs: a dictionary of cluster-specific parameters
        :type kwargs: `dict`
        :return: (`clusters_by_class`, `red_activations_by_class`)
        :rtype: `tuple`
        """
        self.set_params(**kwargs)
        self._ensure_activations_by_class()

        my_clust = ClusteringHandler()
        self.clusters_by_class, self.red_activations_by_class = my_clust.cluster_activations(
            self.activations_by_class,
            n_clusters=self.n_clusters,
            ndims=self.ndims,
            reduce=self.reduce,
            clustering_method=self.clustering_method)

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs):
        """
        This function analyzes the clusters according to the provided method

        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: assigned_clean_by_class, an array of arrays that contains what data points where classified as clean.
        """
        self.set_params(**kwargs)

        # Clusters are only computed here when none are available yet.
        if len(self.clusters_by_class) == 0:
            self.cluster_activations()

        if self.cluster_analysis == 'smaller':
            analyzer = SizeAnalyzer()
            self.assigned_clean_by_class = analyzer.analyze_clusters(self.clusters_by_class)
        elif self.cluster_analysis == 'distance':
            analyzer = DistanceAnalyzer()
            self.assigned_clean_by_class = analyzer.analyze_clusters(
                self.clusters_by_class, separated_activations=self.red_activations_by_class)

        return self.assigned_clean_by_class

    def set_params(self, **kwargs):
        """
        Take in a dictionary of parameters and applies defence-specific checks after saving them as attributes.
        Storing the parameters is delegated to the parent class.

        :param n_clusters: Number of clusters to be produced. Should be greater or equal to 2.
        :type n_clusters: `int`
        :param clustering_method: Clustering method to use
        :type clustering_method: `string`
        :param ndims: Number of dimensions to project on. Should be greater than 0.
        :type ndims: `int`
        :param reduce: Reduction technique
        :type reduce: `str`
        :param cluster_analysis: Method to analyze the clusters
        :type cluster_analysis: `str`
        :return: `True` when all checks pass
        :rtype: `bool`
        :raises ValueError: if any of the parameters has an invalid or unsupported value
        """
        # Save defence-specific parameters
        super(ActivationDefence, self).set_params(**kwargs)

        if self.n_clusters <= 1:
            raise ValueError(
                "Wrong number of clusters, should be greater or equal to 2. Provided: " + str(self.n_clusters))
        if self.ndims <= 0:
            raise ValueError("Wrong number of dimensions ")
        # `str()` keeps the intended ValueError even when a non-string value was supplied.
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " + str(self.clustering_method))
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + str(self.reduce))
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError("Unsupported method for cluster analysis method: " + str(self.cluster_analysis))

        return True

    def _ensure_activations_by_class(self):
        """
        Compute the activations and segment them by class, unless that has already been done.
        """
        if len(self.activations_by_class) == 0:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

    def _get_activations(self):
        """
        Find activations from class:Classifier

        :return: activations of the last layer of the classifier for `x_train`
        """
        print('Getting activations..')

        # Use the activations of the last layer; `predict(..., logits=True)` is not the right
        # source for them.
        nb_layers = len(self.classifier.layer_names)
        activations = self.classifier.get_activations(self.x_train, layer=nb_layers - 1)

        nodes_last_layer = np.shape(activations)[1]
        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            print("WARNING: Number of activations in last layer is too small... method may not work properly. "
                  "Size: " + str(nodes_last_layer))
        return activations

    def _segment_by_class(self, data, features):
        """
        Returns segmented data according to specified features

        A feature holding more than one value (e.g. a one-hot or probability vector) is mapped to the index of
        its largest entry; a feature holding a single value is used directly as the class index.

        :param data: to be segmented
        :type data: :class:`numpy.ndarray`
        :param features: features used to segment data
            e.g., segment according to predicted label or to y_train
        :type features: class:`numpy.ndarray`
        :return: list with one array per class, holding the entries of `data` assigned to that class
        :rtype: `list`
        """
        n_classes = self.classifier.nb_classes
        by_class = [[] for _ in range(n_classes)]
        for indx, feature in enumerate(features):
            label = np.asarray(feature)
            # Decide from the label itself rather than from the number of classes: one-hot labels of a
            # binary problem need argmax, and integer labels of a multi-class problem must not go
            # through argmax (which would always yield 0 for a single value).
            if label.size > 1:
                assigned = int(np.argmax(label))
            else:
                assigned = int(label.reshape(-1)[0])
            by_class[assigned].append(data[indx])

        return [np.asarray(i) for i in by_class]