def predict(x):
    global num_queries
    num_queries += 1
    print(f"classifier query {num_queries}")

    # Translate array to image
    x = np.array(x)
    y = np.reshape(x, (400, 400, 3)).astype(np.uint8)
    im = Image.fromarray(y)

    # Get the image in bytes
    img_byte_arr = io.BytesIO()
    im.save(img_byte_arr, format='PNG')
    bs = img_byte_arr.getvalue()

    # Query the Google Cloud Vision service
    image = vision.Image(content=bs)
    resp = client.label_detection(image=image)
    labels = resp.label_annotations

    # Give label predictions
    prob1 = prob2 = 0
    for l in labels:
        if l.description == "Cat":
            prob1 = l.score
        elif l.description == "Wolf":
            prob2 = l.score
    if prob1 == 0 and prob2 == 0:
        return to_categorical([2], nb_classes=3)
    elif prob1 > prob2:
        return to_categorical([0], nb_classes=3)
    else:
        return to_categorical([1], nb_classes=3)

def predict(x):
    global num_queries
    num_queries += 1
    print(f"classifier query {num_queries}")

    # Translate array to image
    x = np.array(x)
    y = np.reshape(x, SHAPE).astype(np.uint8)
    im = Image.fromarray(y)

    # Get the image in bytes
    img_byte_arr = io.BytesIO()
    im.save(img_byte_arr, format='PNG')
    bs = img_byte_arr.getvalue()

    # Query the Google Cloud Vision service
    image = vision.Image(content=bs)
    resp = client.label_detection(image=image, max_results=20)
    labels = resp.label_annotations

    # Give label predictions
    prob1 = prob2 = 0
    for l in labels:
        if l.description == "Human":
            prob1 = l.score
        elif l.description == "Machine":
            prob2 = l.score
    if prob1 == 0 and prob2 == 0:
        return to_categorical([2], nb_classes=3)
    elif prob1 > prob2:
        return to_categorical([0], nb_classes=3)
    else:
        return to_categorical([1], nb_classes=3)

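# A minimal usage sketch, not from the source: wrapping a query-counting,
# hard-label predict function like the ones above into ART's BlackBoxClassifier
# so that black-box attacks (e.g. HopSkipJump) can target the cloud service.
# The input shape, class count, and clip values are illustrative assumptions.
from art.estimators.classification import BlackBoxClassifier

blackbox_clf = BlackBoxClassifier(
    predict_fn=predict,          # hard-label oracle defined above
    input_shape=(400, 400, 3),   # assumed: matches the reshape inside predict()
    nb_classes=3,
    clip_values=(0, 255),        # assumed 8-bit pixel range
)
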
def test_to_categorical(self):
    y = np.array([3, 1, 4, 1, 5, 9])
    y_ = to_categorical(y)
    self.assertEqual(y_.shape, (6, 10))
    self.assertTrue(np.all(y_.argmax(axis=1) == y))
    self.assertTrue(np.all(np.logical_or(y_ == 0.0, y_ == 1.0)))

    y_ = to_categorical(y, 20)
    self.assertEqual(y_.shape, (6, 20))

def outlier_detection(self, x_val: np.ndarray, y_val: np.ndarray) -> List[Tuple[int, np.ndarray, np.ndarray]]:
    """
    Returns a list of tuples of suspected poison labels and their masks and patterns.

    :return: A list of tuples containing the class index, mask, and pattern for suspected labels.
    """
    l1_norms = []
    masks = []
    patterns = []
    num_classes = self.nb_classes
    for class_idx in range(num_classes):
        # Assuming classes are indexed
        target_label = to_categorical([class_idx], num_classes).flatten()
        mask, pattern = self.generate_backdoor(x_val, y_val, target_label)
        norm = np.sum(np.abs(mask))
        l1_norms.append(norm)
        masks.append(mask)
        patterns.append(pattern)

    # Assuming the L1 norms would naturally form a normal distribution
    consistency_constant = 1.4826
    median = np.median(l1_norms)
    mad = consistency_constant * np.median(np.abs(l1_norms - median))

    flagged_labels = []
    for class_idx in range(num_classes):
        anomaly_index = np.abs(l1_norms[class_idx] - median) / mad
        # Points with anomaly_index > 2 have 95% probability of being an outlier
        # Backdoor outliers show up as masks with small L1 norms
        if l1_norms[class_idx] <= median and anomaly_index > 2:
            logger.warning("Detected potential backdoor in class: %s", str(class_idx))
            flagged_labels.append(class_idx)

    return [(label, masks[label], patterns[label]) for label in flagged_labels]

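# A small worked example (illustrative, not from the source) of the MAD-based
# anomaly index used above: with these made-up per-class mask L1 norms, only
# the class whose mask norm sits far below the median is flagged as a backdoor.
import numpy as np

l1_norms = np.array([95.0, 102.0, 99.0, 11.0, 104.0])  # hypothetical values
median = np.median(l1_norms)                            # 99.0
mad = 1.4826 * np.median(np.abs(l1_norms - median))     # consistency constant for normal data
anomaly_index = np.abs(l1_norms - median) / mad
flagged = np.where((l1_norms <= median) & (anomaly_index > 2))[0]
print(flagged)  # -> [3]: the suspiciously small mask
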
def predict(self, x, **kwargs):
    """
    Perform prediction for a batch of inputs.

    :param x: Test set.
    :type x: `np.ndarray`
    :return: Array of predictions of shape `(nb_inputs, nb_classes)`.
    :rtype: `np.ndarray`
    """
    from xgboost import Booster, XGBClassifier

    from art.utils import to_categorical

    # Apply preprocessing
    x_preprocessed, _ = self._apply_preprocessing(x, y=None, fit=False)

    if isinstance(self._model, Booster):
        from xgboost import DMatrix
        train_data = DMatrix(x_preprocessed, label=None)
        predictions = self._model.predict(train_data)
        y_prediction = np.asarray([line for line in predictions])
        if len(y_prediction.shape) == 1:
            y_prediction = to_categorical(labels=y_prediction, nb_classes=self.nb_classes())
    elif isinstance(self._model, XGBClassifier):
        y_prediction = self._model.predict_proba(x_preprocessed)

    # Apply postprocessing
    y_prediction = self._apply_postprocessing(preds=y_prediction, fit=False)

    return y_prediction

def _predict_rf(x):
    """Wrapper to query blackbox for rf"""
    proba_rf = model_kdf.rf_model.predict_proba(x)
    predicted_label_rf = np.argmax(proba_rf, axis=1)
    return to_categorical(predicted_label_rf, nb_classes=len(np.unique(y[indx_to_take_train])))

def poison_dataset(x_clean, y_clean, poison_func):
    x_poison = np.copy(x_clean)
    y_poison = np.copy(y_clean)
    is_poison = np.zeros(np.shape(y_poison)[0])

    for i in range(10):
        src = i
        tgt = (i + 1) % 10
        n_points_in_tgt = np.round(np.sum(np.argmax(y_clean, axis=1) == tgt))
        num_poison = int((PP_POISON * n_points_in_tgt) / (1 - PP_POISON))

        src_imgs = np.copy(x_clean[np.argmax(y_clean, axis=1) == src])
        n_points_in_src = np.shape(src_imgs)[0]

        if num_poison:
            indices_to_be_poisoned = np.random.choice(n_points_in_src, num_poison)
            imgs_to_be_poisoned = src_imgs[indices_to_be_poisoned]
            backdoor_attack = PoisoningAttackBackdoor(poison_func)
            poison_images, poison_labels = backdoor_attack.poison(
                imgs_to_be_poisoned, y=to_categorical(np.ones(num_poison) * tgt, 10)
            )
            x_poison = np.append(x_poison, poison_images, axis=0)
            y_poison = np.append(y_poison, poison_labels, axis=0)
            is_poison = np.append(is_poison, np.ones(num_poison))

    is_poison = is_poison != 0

    return is_poison, x_poison, y_poison

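# A hedged usage sketch, assuming MNIST-style one-hot labels, a module-level
# PP_POISON poisoning proportion, and ART's add_pattern_bd perturbation as the
# poison_func; x_train and y_train are assumed to exist.
from art.attacks.poisoning.perturbations import add_pattern_bd

is_poison, x_poisoned, y_poisoned = poison_dataset(x_train, y_train, add_pattern_bd)
print(f"{int(is_poison.sum())} of {len(is_poison)} samples are poisoned")
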
def test_keras_mnist(self):
    (x_train, y_train), (x_test, y_test) = self.mnist
    x_test_original = x_test.copy()

    # Keras classifier
    classifier = get_classifier_kr()

    scores = classifier._model.evaluate(x_train, y_train)
    logger.info('[Keras, MNIST] Accuracy on training set: %.2f%%', (scores[1] * 100))
    scores = classifier._model.evaluate(x_test, y_test)
    logger.info('[Keras, MNIST] Accuracy on test set: %.2f%%', (scores[1] * 100))

    # targeted
    # Generate random target classes
    nb_classes = np.unique(np.argmax(y_test, axis=1)).shape[0]
    targets = np.random.randint(nb_classes, size=NB_TEST)
    while (targets == np.argmax(y_test, axis=1)).any():
        targets = np.random.randint(nb_classes, size=NB_TEST)

    # Perform attack
    df = SaliencyMapMethod(classifier, theta=1, batch_size=100)
    x_test_adv = df.generate(x_test, y=to_categorical(targets, nb_classes))

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0. == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    accuracy = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial examples: %.2f%%', (accuracy * 100))

    # untargeted
    df = SaliencyMapMethod(classifier, theta=1, batch_size=100)
    x_test_adv = df.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0. == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    accuracy = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial examples: %.2f%%', (accuracy * 100))

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

def _test_mnist_targeted(self, classifier):
    import numpy as np

    # Get MNIST
    (_, _), (x_test, y_test) = self.mnist
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    # Generate random target classes
    nb_classes = np.unique(np.argmax(y_test, axis=1)).shape[0]
    targets = np.random.randint(nb_classes, size=NB_TEST)
    while (targets == np.argmax(y_test, axis=1)).any():
        targets = np.random.randint(nb_classes, size=NB_TEST)

    # Perform attack
    df = SaliencyMapMethod(classifier, theta=1)
    x_test_adv = df.generate(x_test, y=to_categorical(targets, nb_classes))

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0. == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    acc = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    print('\nAccuracy on adversarial examples: %.2f%%' % (acc * 100))

def predict(self, x: np.ndarray, **kwargs) -> np.ndarray:
    """
    Perform prediction for a batch of inputs.

    :param x: Input samples.
    :return: Array of predictions of shape `(nb_inputs, nb_classes)`.
    """
    import xgboost  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]

    # Apply preprocessing
    x_preprocessed, _ = self._apply_preprocessing(x, y=None, fit=False)

    if isinstance(self._model, xgboost.Booster):
        train_data = xgboost.DMatrix(x_preprocessed, label=None)
        y_prediction = self._model.predict(train_data)
        if len(y_prediction.shape) == 1:
            y_prediction = to_categorical(labels=y_prediction, nb_classes=self.nb_classes)
    elif isinstance(self._model, xgboost.XGBClassifier):
        y_prediction = self._model.predict_proba(x_preprocessed)

    # Apply postprocessing
    y_prediction = self._apply_postprocessing(preds=y_prediction, fit=False)

    return y_prediction

def predict(x):
    out_label = []
    for x_i in x:
        # Save image as an intermediate PNG
        imageio.imsave('tmp.png', x_i)

        # Run Tesseract on the intermediate image
        status = os.system("tesseract tmp.png out")
        if status != 0:
            raise RuntimeError('Tesseract failed to run.')

        # Read the recognized text
        with open("out.txt", "r") as file:
            out_string = file.read().strip()

        # Convert to categorical
        if out_string == 'dissent':
            out_label.append(0)
        elif out_string == 'assent':
            out_label.append(1)
        else:
            out_label.append(2)

    return to_categorical(out_label, 3)

def test_keras_mnist(self):
    """
    Second test with the KerasClassifier.

    :return:
    """
    (_, _), (x_test, y_test) = self.mnist
    x_test_original = x_test.copy()

    # Build KerasClassifier
    krc = get_classifier_kr()

    # First attack
    ead = ElasticNet(classifier=krc, targeted=True, max_iter=2)
    y_target = to_categorical(np.asarray([6, 6, 7, 4, 9, 7, 9, 0, 1, 0]), nb_classes=10)
    x_test_adv = ead.generate(x_test, y=y_target)
    expected_x_test_adv = np.asarray([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00183569, 0.0, 0.0,
                                      0.49765405, 1., 0.6467149, 0.0033755, 0.0052456, 0.0, 0.01104407, 0.00495547,
                                      0.02747423, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv, decimal=6)
    self.assertLessEqual(np.amax(x_test_adv), 1.0)
    self.assertGreaterEqual(np.amin(x_test_adv), 0.0)
    target = np.argmax(y_target, axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    logger.debug('EAD target: %s', target)
    logger.debug('EAD actual: %s', y_pred_adv)
    logger.info('EAD success rate: %.2f%%', (100 * sum(target == y_pred_adv) / float(len(target))))
    self.assertTrue((target == y_pred_adv).any())

    # Second attack
    ead = ElasticNet(classifier=krc, targeted=False, max_iter=2)
    y_target = to_categorical(np.asarray([9, 5, 6, 7, 1, 6, 1, 5, 8, 5]), nb_classes=10)
    x_test_adv = ead.generate(x_test, y=y_target)
    self.assertLessEqual(np.amax(x_test_adv), 1.0)
    self.assertGreaterEqual(np.amin(x_test_adv), 0.0)
    target = np.argmax(y_target, axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    logger.debug('EAD target: %s', target)
    logger.debug('EAD actual: %s', y_pred_adv)
    logger.info('EAD success rate: %.2f%%', (100 * sum(target != y_pred_adv) / float(len(target))))
    self.assertTrue((target != y_pred_adv).any())
    np.testing.assert_array_equal(y_pred_adv, np.asarray([7, 1, 1, 4, 4, 1, 4, 4, 4, 4]))

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    k.clear_session()

def generate(self, x, y=None, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of
              shape `(nb_samples,)`.
    :type y: `np.ndarray`
    :param x_adv_init: Initial array to act as initial adversarial examples. Same shape as `x`.
    :type x_adv_init: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    y = check_and_transform_label_format(y, self.classifier.nb_classes())

    # Get clip_min and clip_max from the classifier or infer them from data
    if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
        clip_min, clip_max = self.classifier.clip_values
    else:
        clip_min, clip_max = np.min(x), np.max(x)

    # Prediction from the original images
    preds = np.argmax(self.classifier.predict(x, batch_size=self.batch_size), axis=1)

    # Prediction from the initial adversarial examples if not None
    x_adv_init = kwargs.get('x_adv_init')
    if x_adv_init is not None:
        init_preds = np.argmax(self.classifier.predict(x_adv_init, batch_size=self.batch_size), axis=1)
    else:
        init_preds = [None] * len(x)
        x_adv_init = [None] * len(x)

    # Assert that, if the attack is targeted, y is provided
    if self.targeted and y is None:
        raise ValueError('Target labels `y` need to be provided for a targeted attack.')

    # Some initial setups
    x_adv = x.astype(NUMPY_DTYPE)
    if y is not None:
        y = np.argmax(y, axis=1)

    # Generate the adversarial samples
    for ind, val in enumerate(x_adv):
        if self.targeted:
            x_adv[ind] = self._perturb(x=val, y=y[ind], y_p=preds[ind], init_pred=init_preds[ind],
                                       adv_init=x_adv_init[ind], clip_min=clip_min, clip_max=clip_max)
        else:
            x_adv[ind] = self._perturb(x=val, y=-1, y_p=preds[ind], init_pred=init_preds[ind],
                                       adv_init=x_adv_init[ind], clip_min=clip_min, clip_max=clip_max)

    if y is not None:
        y = to_categorical(y, self.classifier.nb_classes())

    logger.info('Success rate of HopSkipJump attack: %.2f%%',
                100 * compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size))

    return x_adv

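# A minimal usage sketch for the generate() method above; `classifier` and
# `x_test` are assumed names. Runs an untargeted HopSkipJump attack.
from art.attacks.evasion import HopSkipJump

attack = HopSkipJump(classifier=classifier, targeted=False, max_iter=10, max_eval=1000)
x_test_adv = attack.generate(x=x_test)
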
def generate(self, x, y=None, **kwargs):
    """
    Generate adversarial samples and return them in a Numpy array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: An array with the original labels to be predicted.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    x_adv = x.astype(NUMPY_DTYPE)

    # Initialize variables
    y_pred = self.classifier.predict(x, logits=False, batch_size=self.batch_size)
    pred_class = np.argmax(y_pred, axis=1)

    # Compute perturbation with implicit batching
    for batch_id in range(int(np.ceil(x_adv.shape[0] / float(self.batch_size)))):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        batch = x_adv[batch_index_1:batch_index_2]

        # Main algorithm for each batch
        norm_batch = np.linalg.norm(np.reshape(batch, (batch.shape[0], -1)), axis=1)
        l_batch = pred_class[batch_index_1:batch_index_2]
        l_b = to_categorical(l_batch, self.classifier.nb_classes).astype(bool)

        # Main loop of the algorithm
        for _ in range(self.max_iter):
            # Compute score
            score = self.classifier.predict(batch, logits=False)[l_b]

            # Compute the gradients and norm
            grads = self.classifier.class_gradient(batch, label=l_batch, logits=False)
            grads = np.squeeze(grads, axis=1)
            norm_grad = np.linalg.norm(np.reshape(grads, (batch.shape[0], -1)), axis=1)

            # Theta
            theta = self._compute_theta(norm_batch, score, norm_grad)

            # Perturbation
            di_batch = self._compute_pert(theta, grads, norm_grad)

            # Update xi and perturbation
            batch += di_batch

        # Apply clip
        if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
            clip_min, clip_max = self.classifier.clip_values
            x_adv[batch_index_1:batch_index_2] = np.clip(batch, clip_min, clip_max)
        else:
            x_adv[batch_index_1:batch_index_2] = batch

    logger.info('Success rate of NewtonFool attack: %.2f%%',
                100 * np.sum(np.argmax(self.classifier.predict(x, batch_size=self.batch_size), axis=1) !=
                             np.argmax(self.classifier.predict(x_adv, batch_size=self.batch_size), axis=1)) /
                x.shape[0])

    return x_adv

def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in a Numpy array.

    :param x: An array with the original inputs to be attacked.
    :param y: An array with the original labels to be predicted.
    :return: An array holding the adversarial examples.
    """
    x_adv = x.astype(ART_NUMPY_DTYPE)

    # Initialize variables
    y_pred = self.estimator.predict(x, batch_size=self.batch_size)
    pred_class = np.argmax(y_pred, axis=1)

    # Compute perturbation with implicit batching
    for batch_id in trange(
        int(np.ceil(x_adv.shape[0] / float(self.batch_size))), desc="NewtonFool", disable=not self.verbose
    ):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        batch = x_adv[batch_index_1:batch_index_2]

        # Main algorithm for each batch
        norm_batch = np.linalg.norm(np.reshape(batch, (batch.shape[0], -1)), axis=1)
        l_batch = pred_class[batch_index_1:batch_index_2]
        l_b = to_categorical(l_batch, self.estimator.nb_classes).astype(bool)

        # Main loop of the algorithm
        for _ in range(self.max_iter):
            # Compute score
            score = self.estimator.predict(batch)[l_b]

            # Compute the gradients and norm
            grads = self.estimator.class_gradient(batch, label=l_batch)
            if grads.shape[1] == 1:
                grads = np.squeeze(grads, axis=1)
            norm_grad = np.linalg.norm(np.reshape(grads, (batch.shape[0], -1)), axis=1)

            # Theta
            theta = self._compute_theta(norm_batch, score, norm_grad)

            # Perturbation
            di_batch = self._compute_pert(theta, grads, norm_grad)

            # Update xi and perturbation
            batch += di_batch

        # Apply clip
        if self.estimator.clip_values is not None:
            clip_min, clip_max = self.estimator.clip_values
            x_adv[batch_index_1:batch_index_2] = np.clip(batch, clip_min, clip_max)
        else:
            x_adv[batch_index_1:batch_index_2] = batch

    logger.info(
        "Success rate of NewtonFool attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, batch_size=self.batch_size),
    )

    return x_adv

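# A minimal usage sketch for the generate() method above; `classifier` and
# `x_test` are assumed names.
from art.attacks.evasion import NewtonFool

attack = NewtonFool(classifier, max_iter=50, batch_size=32, verbose=False)
x_test_adv = attack.generate(x=x_test)
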
def test_random_targets(self):
    y = np.array([3, 1, 4, 1, 5, 9])
    y_ = to_categorical(y)

    random_y = random_targets(y, 10)
    self.assertTrue(np.all(y != random_y.argmax(axis=1)))

    random_y = random_targets(y_, 10)
    self.assertTrue(np.all(y != random_y.argmax(axis=1)))

def test_failure_modes(art_warning, get_default_mnist_subset, image_dl_estimator, params):
    try:
        (x_train, y_train), (_, _) = get_default_mnist_subset
        classifier, _ = image_dl_estimator()
        target = to_categorical([9], 10)[0]
        backdoor = PoisoningAttackBackdoor(add_pattern_bd)
        with pytest.raises(ValueError):
            attack = PoisoningAttackCleanLabelBackdoor(backdoor, classifier, target, **params)
    except ARTTestException as e:
        art_warning(e)

def test_get_labels_np_array(self):
    y = np.array([3, 1, 4, 1, 5, 9])
    y_ = to_categorical(y)

    logits = np.random.normal(1 * y_, scale=0.1)
    ps = (np.exp(logits).T / np.sum(np.exp(logits), axis=1)).T
    labels = get_labels_np_array(ps)

    self.assertEqual(labels.shape, y_.shape)
    self.assertTrue(np.all(labels == y_))

def generate(self, x, y=None):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs. `x` is expected to have spatial dimensions.
    :type x: `np.ndarray`
    :param y: An array with the original labels to be predicted.
    :type y: `np.ndarray`
    :return: An array holding the adversarial patch.
    :rtype: `np.ndarray`
    """
    logger.info('Creating adversarial patch.')

    if len(x.shape) == 2:
        raise ValueError('Feature vectors detected. The adversarial patch can only be applied to data with '
                         'spatial dimensions.')

    self.patch = (np.random.standard_normal(size=self.patch_shape)) * 20.0

    for i_step in range(self.max_iter):
        if i_step == 0 or (i_step + 1) % 100 == 0:
            logger.info('Training Step: %i', i_step + 1)

        if self.clip_patch is not None:
            for i_channel, (a_min, a_max) in enumerate(self.clip_patch):
                self.patch[:, :, i_channel] = np.clip(self.patch[:, :, i_channel], a_min=a_min, a_max=a_max)

        patched_images, patch_mask_transformed, transforms = self._augment_images_with_random_patch(x, self.patch)

        gradients = self.classifier.loss_gradient(
            patched_images,
            to_categorical(np.broadcast_to(np.array(self.target), x.shape[0]), self.classifier.nb_classes))

        patch_gradients = np.zeros_like(self.patch)
        for i_batch in range(self.batch_size):
            patch_gradients_i = self._reverse_transformation(
                gradients[i_batch, :, :, :],
                patch_mask_transformed[i_batch, :, :, :],
                transforms[i_batch])
            patch_gradients += patch_gradients_i

        patch_gradients = patch_gradients / self.batch_size
        self.patch -= patch_gradients * self.learning_rate

    return self.patch, self._get_circular_patch_mask()

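# A hedged sketch of driving patch training like the method above through ART's
# public AdversarialPatch attack (modern API; `classifier`, `x_train`, `y_train`,
# and `x_test` are assumed names), then applying the learned patch to new images.
from art.attacks.evasion import AdversarialPatch

attack = AdversarialPatch(classifier, learning_rate=5.0, max_iter=500, batch_size=16)
patch, patch_mask = attack.generate(x=x_train, y=y_train)
x_test_patched = attack.apply_patch(x_test, scale=0.5)
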
def _query_label(self, x: np.ndarray) -> np.ndarray:
    """
    Query the victim classifier.

    :param x: An array with the source input to the victim classifier.
    :return: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)`.
    """
    labels = self.estimator.predict(x=x, batch_size=self.batch_size_query)
    labels = np.argmax(labels, axis=1)
    labels = to_categorical(labels=labels, nb_classes=self.estimator.nb_classes)

    return labels

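# Illustrative only: the hard-label conversion performed above, in isolation.
# Soft victim probabilities are collapsed to argmax labels and re-encoded
# one-hot, so the stolen copy never sees the victim's confidence scores.
import numpy as np
from art.utils import to_categorical

soft_preds = np.array([[0.1, 0.7, 0.2],
                       [0.6, 0.3, 0.1]])
hard_labels = to_categorical(np.argmax(soft_preds, axis=1), nb_classes=3)
print(hard_labels)  # [[0. 1. 0.]
                    #  [1. 0. 0.]]
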
def test_poison(art_warning, get_default_mnist_subset, image_dl_estimator):
    try:
        (x_train, y_train), (_, _) = get_default_mnist_subset
        classifier, _ = image_dl_estimator()
        target = to_categorical([9], 10)[0]
        backdoor = PoisoningAttackBackdoor(add_pattern_bd)
        attack = PoisoningAttackCleanLabelBackdoor(backdoor, classifier, target)
        poison_data, poison_labels = attack.poison(x_train, y_train)

        np.testing.assert_equal(poison_data.shape, x_train.shape)
        np.testing.assert_equal(poison_labels.shape, y_train.shape)
    except ARTTestException as e:
        art_warning(e)

def test_get_label_conf(self):
    y = np.array([3, 1, 4, 1, 5, 9])
    y_ = to_categorical(y)

    logits = np.random.normal(10 * y_, scale=0.1)
    ps = (np.exp(logits).T / np.sum(np.exp(logits), axis=1)).T
    c, l = get_label_conf(ps)

    self.assertEqual(c.shape, y.shape)
    self.assertEqual(l.shape, y.shape)

    self.assertTrue(np.all(l == y))
    self.assertTrue(np.allclose(c, 0.99, atol=1e-2))

def test_segment_by_class(self):
    data = np.array([[3, 2], [9, 2], [4, 0], [9, 0]])
    classes = to_categorical(np.array([2, 1, 0, 1]))
    num_classes = 3
    segments = segment_by_class(data, classes, num_classes)
    self.assertEqual(len(segments), num_classes)
    self.assertEqual(len(segments[1]), 2)
    self.assertTrue(np.all(np.equal(segments[0], np.array([data[2]]))))
    self.assertTrue(np.all(np.equal(segments[1], np.array([data[1], data[3]]))))
    self.assertTrue(np.all(np.equal(segments[2], np.array([data[0]]))))

    num_classes = 4
    segments = segment_by_class(data, classes, num_classes)
    self.assertEqual(len(segments), num_classes)

def poison_loader_clbd(**kwargs):
    backdoor_kwargs = kwargs.pop("backdoor_kwargs")
    backdoor = poison_loader_GTSRB(**backdoor_kwargs)

    # Targets is a one-hot numpy array -- need to map from sparse representation
    target = kwargs.pop("target")
    n_classes = kwargs.pop("n_classes")
    targets = to_categorical([target], n_classes)[0]

    return (
        PoisoningAttackCleanLabelBackdoor(backdoor=backdoor, target=targets, **kwargs),
        backdoor,
    )

def _perchannel(self, x: np.ndarray) -> np.ndarray:
    """
    Apply thermometer encoding to one channel.

    :param x: Sample to encode with shape `(batch_size, width, height)`.
    :return: Encoded sample with shape `(batch_size, width, height, num_space)`.
    """
    pos = np.zeros(shape=x.shape)
    for i in range(1, self.num_space):
        pos[x > float(i) / self.num_space] += 1

    onehot_rep = to_categorical(pos.reshape(-1), self.num_space)

    for i in range(self.num_space - 1):
        onehot_rep[:, i] += np.sum(onehot_rep[:, i + 1:], axis=1)

    return onehot_rep.flatten()

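# A toy numeric check (illustrative) of the thermometer encoding above with
# num_space=4: a pixel value of 0.6 exceeds thresholds 1/4 and 2/4, so its
# one-hot bin 2 is cumulatively filled down to bin 0, giving code [1, 1, 1, 0].
import numpy as np
from art.utils import to_categorical

num_space = 4
x = np.array([0.1, 0.6, 0.9])
pos = np.zeros(shape=x.shape)
for i in range(1, num_space):
    pos[x > float(i) / num_space] += 1
onehot = to_categorical(pos, num_space)
for i in range(num_space - 1):
    onehot[:, i] += np.sum(onehot[:, i + 1:], axis=1)
print(onehot)
# [[1. 0. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
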
def _test_mnist_targeted(self, classifier):
    # Get MNIST
    (_, _), (x_test, y_test) = self.mnist
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    # Generate random target classes
    nb_classes = np.unique(np.argmax(y_test, axis=1)).shape[0]
    targets = np.random.randint(nb_classes, size=NB_TEST)
    while (targets == np.argmax(y_test, axis=1)).any():
        targets = np.random.randint(nb_classes, size=NB_TEST)

    # Perform attack
    # Note: batch sizes 1, 10, 100, and 1000 were previously benchmarked here; 100 is used below
    df = SaliencyMapMethod(classifier, theta=1)
    x_test_adv = df.generate(x_test, y=to_categorical(targets, nb_classes), batch_size=100)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0. == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    acc = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial examples: %.2f%%', (acc * 100))

def test_keras_mnist_L2(self):
    """
    Second test with the KerasClassifier.

    :return:
    """
    (_, _), (x_test, y_test) = self.mnist
    x_test_original = x_test.copy()

    # Build KerasClassifier
    krc = get_classifier_kr(from_logits=True)

    # First attack
    cl2m = CarliniL2Method(classifier=krc, targeted=True, max_iter=10)
    y_target = [6, 6, 7, 4, 9, 7, 9, 0, 1, 0]
    x_test_adv = cl2m.generate(x_test, y=to_categorical(y_target, nb_classes=10))
    self.assertFalse((x_test == x_test_adv).all())
    self.assertLessEqual(np.amax(x_test_adv), 1.0)
    self.assertGreaterEqual(np.amin(x_test_adv), 0.0)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    logger.debug('CW2 Target: %s', y_target)
    logger.debug('CW2 Actual: %s', y_pred_adv)
    logger.info('CW2 Success Rate: %.2f', (np.sum(y_target == y_pred_adv) / float(len(y_target))))
    self.assertTrue((y_target == y_pred_adv).any())

    # Second attack
    cl2m = CarliniL2Method(classifier=krc, targeted=False, max_iter=10)
    x_test_adv = cl2m.generate(x_test)
    self.assertLessEqual(np.amax(x_test_adv), 1.0)
    self.assertGreaterEqual(np.amin(x_test_adv), 0.0)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    logger.debug('CW2 Target: %s', y_target)
    logger.debug('CW2 Actual: %s', y_pred_adv)
    logger.info('CW2 Success Rate: %.2f', (np.sum(y_target != y_pred_adv) / float(len(y_target))))
    self.assertTrue((y_target != y_pred_adv).any())

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    # Clean-up
    k.clear_session()

def generate(self, x, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :type x: `np.ndarray`
    :return: An array holding the adversarial patch.
    :rtype: `np.ndarray`
    """
    logger.info('Creating adversarial patch.')
    self.set_params(**kwargs)

    self.patch = (np.random.standard_normal(size=self.patch_shape)) * 20.0

    for i_step in range(self.max_iter):
        if i_step == 0 or (i_step + 1) % 100 == 0:
            logger.info('Training Step: %i', i_step + 1)

        if self.clip_patch is not None:
            for i_channel, (a_min, a_max) in enumerate(self.clip_patch):
                self.patch[:, :, i_channel] = np.clip(self.patch[:, :, i_channel], a_min=a_min, a_max=a_max)

        patched_images, patch_mask_transformed, transforms = self._augment_images_with_random_patch(x, self.patch)

        gradients = self.classifier.loss_gradient(
            patched_images,
            to_categorical(np.broadcast_to(np.array(self.target), x.shape[0]), self.classifier.nb_classes))

        patch_gradients = np.zeros_like(self.patch)
        for i_batch in range(self.batch_size):
            patch_gradients_i = self._reverse_transformation(
                gradients[i_batch, :, :, :],
                patch_mask_transformed[i_batch, :, :, :],
                transforms[i_batch])
            patch_gradients += patch_gradients_i

        patch_gradients = patch_gradients / self.batch_size
        self.patch -= patch_gradients * self.learning_rate

    return self.patch, self._get_circular_patch_mask()

def predict(self, x, **kwargs):
    """
    Perform prediction for a batch of inputs.

    :param x: Test set.
    :type x: `np.ndarray`
    :return: Array of predictions of shape `(nb_inputs, nb_classes)`.
    :rtype: `np.ndarray`
    """
    # Apply defences
    x_preprocessed, _ = self._apply_preprocessing(x, y=None, fit=False)

    if hasattr(self._model, 'predict_proba') and callable(getattr(self._model, 'predict_proba')):
        y_pred = self._model.predict_proba(x_preprocessed)
    elif hasattr(self._model, 'predict') and callable(getattr(self._model, 'predict')):
        y_pred = to_categorical(self._model.predict(x_preprocessed), nb_classes=self._model.classes_.shape[0])
    else:
        raise ValueError('The provided model does not have methods `predict_proba` or `predict`.')

    return y_pred

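# Illustrative usage under stated assumptions: `x_train`, `y_train`, and
# `x_test` exist and labels are one-hot. NearestCentroid has no predict_proba,
# so a wrapper like the one above falls back to one-hot-encoded hard labels.
import numpy as np
from sklearn.neighbors import NearestCentroid
from art.estimators.classification import SklearnClassifier

model = NearestCentroid().fit(x_train, np.argmax(y_train, axis=1))
art_model = SklearnClassifier(model=model)
y_pred = art_model.predict(x_test)  # shape (nb_inputs, nb_classes), 0/1 entries
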
def _perchannel(self, x):
    """
    Apply thermometer encoding to one channel.

    :param x: Sample to encode with shape `(batch_size, width, height)`.
    :type x: `np.ndarray`
    :return: Encoded sample with shape `(batch_size, width, height, num_space)`.
    :rtype: `np.ndarray`
    """
    pos = np.zeros(shape=x.shape)
    for i in range(1, self.num_space):
        pos[x > float(i) / self.num_space] += 1

    onehot_rep = to_categorical(pos.reshape(-1), self.num_space)

    for i in reversed(range(1, self.num_space)):
        onehot_rep[:, i] += np.sum(onehot_rep[:, :i], axis=1)

    result = onehot_rep.reshape(list(x.shape) + [self.num_space])

    return result
