def detect_poison(self, **kwargs) -> Tuple[dict, np.ndarray]:
    """
    Flag every training sample that originates from a device reported as suspicious.

    The keyword arguments are applied through `set_params` first. Detection runs in untrusted mode when
    `self.x_val` is None and in partially trusted mode otherwise. All samples whose provenance matches a
    device listed in the resulting report are marked as poison.

    Side effects: `self.is_clean_lst` and `self.assigned_clean_by_device` are overwritten.

    :param kwargs: A dictionary of detection-specific parameters, passed on to `set_params`.
    :return: (report, is_clean_lst): where report is the dictionary returned by the selected detection
             routine (its keys are used here as device indices) and is_clean_lst is an array in which
             is_clean_lst[i]=1 means that x_train[i] is considered clean and is_clean_lst[i]=0 means that
             x_train[i] was classified as poison.
    """
    self.set_params(**kwargs)

    # Without validation data fall back to the untrusted variant of the detection.
    run_detection = self.detect_poison_untrusted if self.x_val is None else self.detect_poison_partially_trusted
    report = run_detection()

    nb_samples = len(self.x_train)
    sample_ids_per_device = segment_by_class(np.arange(nb_samples), self.p_train, self.num_devices)

    # Start from "everything is clean" and zero out the samples of each reported device.
    self.is_clean_lst = np.array([1] * nb_samples)
    for flagged_device in report:
        self.is_clean_lst[sample_ids_per_device[flagged_device]] = 0

    # Keep the verdicts grouped per device as well; a copy of the flag array is segmented.
    self.assigned_clean_by_device = segment_by_class(np.array(self.is_clean_lst), self.p_train, self.num_devices)

    return report, self.is_clean_lst
def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
    """
    Score the training data class by class and report the samples whose score reaches the class cutoff.

    Activations of the last layer listed in `classifier.layer_names` are computed for `x_train` and grouped
    by `y_train_sparse`. For each non-empty class the scores come from
    `SpectralSignatureDefense.spectral_signature_scores`; the cutoff is the quantile of those scores at
    `max(1 - eps_multiplier * expected_pp_poison, 0.0)`. Samples scoring strictly below the cutoff are kept
    as clean, all others are reported.

    :param kwargs: A dictionary of detection-specific parameters, passed on to `set_params`.
    :return: (report, is_clean_lst): where report is a dictionary with the training index of each suspected
             poison as key and its outlier score as value, and is_clean_lst is a list in which
             is_clean_lst[i]=1 means that x_train[i] is considered clean and is_clean_lst[i]=0 means that
             x_train[i] was classified as poison.
    :raises ValueError: If `classifier.layer_names` is None.
    """
    self.set_params(**kwargs)

    layer_names = self.classifier.layer_names
    if layer_names is None:
        raise ValueError("No layer names identified.")
    last_layer = len(layer_names) - 1

    activations = self.classifier.get_activations(self.x_train, layer=last_layer, batch_size=self.batch_size)
    activations_per_class = segment_by_class(activations, self.y_train_sparse, self.classifier.nb_classes)

    scores_per_class = []
    keep_mask_per_class = []
    for class_activations in activations_per_class:
        # Explicit length test: the segment may be an array, whose truth value would be ambiguous.
        if len(class_activations) == 0:  # pylint: disable=C1801
            # Placeholder entries keep both lists aligned with the class index.
            scores_per_class.append([0])
            keep_mask_per_class.append([True])
            continue

        class_scores = SpectralSignatureDefense.spectral_signature_scores(np.vstack(class_activations))
        cutoff = np.quantile(class_scores, max(1 - self.eps_multiplier * self.expected_pp_poison, 0.0))
        scores_per_class.append(class_scores)
        keep_mask_per_class.append(class_scores < cutoff)

    nb_samples = len(self.y_train_sparse)
    sample_ids_per_class = segment_by_class(np.arange(nb_samples), self.y_train_sparse, self.classifier.nb_classes)

    # Map the per-class verdicts back onto positions in the original training set.
    is_clean_lst = [0] * nb_samples
    report = {}
    for class_mask, class_scores, class_sample_ids in zip(keep_mask_per_class, scores_per_class, sample_ids_per_class):
        for keep, score_entry, sample_id in zip(class_mask, class_scores, class_sample_ids):
            if keep:
                is_clean_lst[sample_id] = 1
            else:
                # Each score entry is indexed with [0]; presumably scores come one row per sample.
                report[sample_id] = score_entry[0]

    return report, is_clean_lst
def detect_poison_untrusted(self, **kwargs) -> Dict[int, float]:
    """
    Detect poison given no trusted validation data.

    The training data, labels and provenance are split with `train_test_split` (`test_size=self.pp_valid`)
    into a fitting part and a held-out part. For every device, one copy of the classifier is fitted on the
    current fitting data and another on the output of `filter_input` for that device's segment. Both models
    are compared with `performance_diff` on the held-out data filtered the same way. A device is suspected
    when `self.eps` is smaller than that difference; its filtered data then replaces the current fitting and
    held-out data for all following devices.

    Note that the per-device segments are computed once from the initial split.

    :param kwargs: A dictionary of detection-specific parameters, passed on to `set_params`.
    :return: Dictionary where keys are suspected poisonous device indices and values are the corresponding
             results of `performance_diff`.
    """
    self.set_params(**kwargs)

    split = train_test_split(self.x_train, self.y_train, self.p_train, test_size=self.pp_valid)
    x_fit, x_held, y_fit, y_held, p_fit, p_held = split

    fit_segments = segment_by_class(x_fit, p_fit, self.num_devices)
    held_segments = segment_by_class(x_held, p_held, self.num_devices)

    suspected = {}
    for device_idx, (fit_segment, held_segment) in enumerate(zip(fit_segments, held_segments)):
        # `filter_input` is expected to return data and labels without this device's segment.
        x_fit_without, y_fit_without = self.filter_input(x_fit, y_fit, fit_segment)

        # Fit independent copies so that the defence's own classifier is left untouched.
        model_with_device = deepcopy(self.classifier)
        model_without_device = deepcopy(self.classifier)
        model_with_device.fit(x_fit, y_fit)
        model_without_device.fit(x_fit_without, y_fit_without)

        x_held_without, y_held_without = self.filter_input(x_held, y_held, held_segment)
        var_w = performance_diff(
            model_without_device,
            model_with_device,
            x_held_without,
            y_held_without,
            perf_function=self.perf_func,
        )

        if var_w > self.eps:
            suspected[device_idx] = var_w
            # Continue with the device removed from both parts of the split.
            x_fit, y_fit = x_fit_without, y_fit_without
            x_held, y_held = x_held_without, y_held_without

    return suspected
def test_segment_by_class(self):
    """
    Check that `segment_by_class` groups the rows of a small array by label and returns one segment per
    requested class. The labels are converted with `to_categorical` before being passed in.
    """
    data = np.array([[3, 2], [9, 2], [4, 0], [9, 0]])
    classes = to_categorical(np.array([2, 1, 0, 1]))

    segments = segment_by_class(data, classes, 3)
    self.assertEqual(len(segments), 3)
    self.assertEqual(len(segments[1]), 2)

    # Rows expected in each class segment, given by their position in `data`.
    expected_rows = {0: [2], 1: [1, 3], 2: [0]}
    for class_idx, rows in expected_rows.items():
        expected_segment = np.array([data[row] for row in rows])
        self.assertTrue(np.all(np.equal(segments[class_idx], expected_segment)))

    # Requesting one class more than the labels contain must still give one segment per requested class.
    segments = segment_by_class(data, classes, 4)
    self.assertEqual(len(segments), 4)
def evaluate_defence(self, is_clean, **kwargs):
    """
    Compare the defence's per-device verdicts against the ground truth and return the confusion matrix.

    Detection is only run when `self.assigned_clean_by_device` is still empty. The ground truth is segmented
    by `self.p_train` and stored in `self.is_clean_by_device`; the first value returned by the evaluator is
    stored in `self.errors_by_device`.

    :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                     x_train[i] is poisonous.
    :type is_clean: :class `np.ndarray`
    :param kwargs: A dictionary of defence-specific parameters, passed on to `set_params`.
    :type kwargs: `dict`
    :return: JSON object with confusion matrix.
    :rtype: `jsonObject`
    :raises ValueError: If `is_clean` is None or empty.
    """
    ground_truth_missing = is_clean is None or is_clean.size == 0
    if ground_truth_missing:
        raise ValueError("is_clean was not provided while invoking evaluate_defence.")

    self.set_params(**kwargs)

    # Reuse earlier detection results when they exist.
    if not self.assigned_clean_by_device:
        self.detect_poison()

    self.is_clean_by_device = segment_by_class(is_clean, self.p_train, self.num_devices)

    analysis = self.evaluator.analyze_correctness(self.assigned_clean_by_device, self.is_clean_by_device)
    self.errors_by_device, conf_matrix_json = analysis
    return conf_matrix_json
def detect_poison_partially_trusted(self, **kwargs) -> Dict[int, float]:
    """
    Detect poison given trusted validation data.

    Working on copies of `x_train` and `y_train`, every device is examined in turn: one copy of the
    classifier is fitted on the current data and another on the output of `filter_input` for that device's
    segment. Both models are compared with `performance_diff` on `x_val` / `y_val`. A device is suspected
    when `self.eps` is smaller than that difference; its filtered data then becomes the current data for all
    following devices.

    :param kwargs: A dictionary of detection-specific parameters, passed on to `set_params`.
    :return: Dictionary where keys are suspected poisonous device indices and values are the corresponding
             results of `performance_diff`.
    :raises ValueError: If `x_val` or `y_val` is None.
    """
    self.set_params(**kwargs)

    if self.x_val is None or self.y_val is None:
        raise ValueError("Trusted data unavailable.")

    x_current = np.copy(self.x_train)
    y_current = np.copy(self.y_train)
    device_segments = segment_by_class(self.x_train, self.p_train, self.num_devices)

    suspected = {}
    for device_idx, device_segment in enumerate(device_segments):
        # `filter_input` is expected to return data and labels without this device's segment.
        x_without, y_without = self.filter_input(x_current, y_current, device_segment)

        # Fit independent copies so that the defence's own classifier is left untouched.
        model_with_device = deepcopy(self.classifier)
        model_without_device = deepcopy(self.classifier)
        model_with_device.fit(x_current, y_current)
        model_without_device.fit(x_without, y_without)

        var_w = performance_diff(
            model_without_device,
            model_with_device,
            self.x_val,
            self.y_val,
            perf_function=self.perf_func,
        )

        if var_w > self.eps:
            suspected[device_idx] = var_w
            # Continue with the device removed from the working data.
            x_current, y_current = x_without, y_without

    return suspected
def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
    """
    If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

    Both the ground truth and the flags produced by `detect_poison` are segmented by `y_train_sparse` before
    being handed to the evaluator.

    :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                     x_train[i] is poisonous.
    :param kwargs: A dictionary of defence-specific parameters, forwarded to `detect_poison`.
    :return: JSON object with confusion matrix.
    :raises ValueError: If `is_clean` is None or empty.
    """
    if is_clean is None or is_clean.size == 0:
        raise ValueError("is_clean was not provided while invoking evaluate_defence.")

    nb_classes = self.classifier.nb_classes
    is_clean_by_class = segment_by_class(is_clean, self.y_train_sparse, nb_classes)

    # Forward the caller's parameters; previously they were accepted but silently ignored.
    _, predicted_clean = self.detect_poison(**kwargs)
    predicted_clean_by_class = segment_by_class(predicted_clean, self.y_train_sparse, nb_classes)

    _, conf_matrix_json = self.evaluator.analyze_correctness(predicted_clean_by_class, is_clean_by_class)
    return conf_matrix_json
def _segment_by_class(self, data: np.ndarray, features: np.ndarray) -> List[np.ndarray]:
    """
    Segment `data` with `segment_by_class`, using the classifier's `nb_classes` as the number of segments.

    :param data: Data to be segmented.
    :param features: Features used to segment data, e.g., segment according to predicted label or to
                     `y_train`.
    :return: Segmented data according to specified features.
    """
    return segment_by_class(data, features, self.classifier.nb_classes)
def _segment_by_class(self, data, features):
    """
    Segment `data` with `segment_by_class`, using the classifier's number of classes as the number of
    segments.

    :param data: to be segmented
    :type data: `np.ndarray`
    :param features: features used to segment data, e.g., segment according to predicted label or to
                     `y_train`
    :type features: `np.ndarray`
    :return: segmented data according to specified features.
    :rtype: `list`
    """
    # In this variant the class count is obtained by calling `nb_classes()` on the classifier.
    return segment_by_class(data, features, self.classifier.nb_classes())