def _test_mnist_targeted(self, classifier):
    # Get MNIST
    (_, _), (x_test, _) = self.mnist

    # Test FGSM with np.inf norm
    attack = FastGradientMethod(classifier, eps=1.0, targeted=True)

    # Target the second most likely class as predicted on the clean inputs
    pred_sort = classifier.predict(x_test).argsort(axis=1)
    y_test_adv = np.zeros((x_test.shape[0], 10))
    for i in range(x_test.shape[0]):
        y_test_adv[i, pred_sort[i, -2]] = 1.0

    attack_params = {"minimal": True, "eps_step": 0.01, "eps": 1.0}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test, y=y_test_adv)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertEqual(y_test_adv.shape, test_y_pred.shape)
    self.assertGreaterEqual((y_test_adv == test_y_pred).sum(), x_test.shape[0] // 2)
def _test_mnist_targeted(self, classifier):
    # Get MNIST
    (_, _), (x_test, y_test) = self.mnist
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    # Generate random target classes
    nb_classes = np.unique(np.argmax(y_test, axis=1)).shape[0]
    targets = np.random.randint(nb_classes, size=NB_TEST)
    while (targets == np.argmax(y_test, axis=1)).any():
        targets = np.random.randint(nb_classes, size=NB_TEST)

    # Perform attack
    df = SaliencyMapMethod(classifier, theta=1, batch_size=100)
    x_test_adv = df.generate(x_test, y=to_categorical(targets, nb_classes))

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0.0 == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    acc = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial examples: %.2f%%', acc * 100)
def fit(self, x, y, **kwargs):
    """
    Train a model adversarially. Each attack specified when creating the AdversarialTrainer is applied to all
    samples in the dataset, and only the successful ones (on the source model) are kept for data augmentation.

    :param x: Training set.
    :type x: `np.ndarray`
    :param y: Labels.
    :type y: `np.ndarray`
    :param kwargs: Dictionary of parameters to be passed on to the `fit` method of the classifier.
    :type kwargs: `dict`
    :return: `None`
    """
    x_augmented = list(x.copy())
    y_augmented = list(y.copy())

    # Generate adversarial samples for each attack
    for attack in self.attacks:
        # Fit the classifier to be used for the attack
        # TODO: do not refit the classifier if it is already fitted
        attack.classifier.fit(x, y, **kwargs)

        # Predict new labels for the adversarial samples generated
        x_adv = attack.generate(x, **self.attacks[attack])
        y_pred = get_labels_np_array(attack.classifier.predict(x_adv))
        x_adv = x_adv[np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)]
        y_adv = y_pred[np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)]

        # Only add successful attacks to the augmented dataset
        x_augmented.extend(list(x_adv))
        y_augmented.extend(list(y_adv))

    # Fit the model with the extended dataset
    self.classifier.fit(np.array(x_augmented), np.array(y_augmented), **kwargs)
    self.x = x_augmented
    self.y = y_augmented
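# Standalone sketch of the augmentation filter used by `fit` above: only samples whose
# predicted class flipped under the attack are kept, labelled with the model's (wrong)
# prediction. The arrays below are toy placeholders, not part of the trainer's API.
import numpy as np

y = np.eye(3)[[0, 1, 2, 0]]                      # true one-hot labels (4 samples, 3 classes)
y_pred = np.eye(3)[[0, 2, 2, 1]]                 # predictions on the attacked inputs
x_adv = np.arange(8, dtype=float).reshape(4, 2)  # attacked inputs (4 samples, 2 features)

# A sample counts as a successful attack when the predicted class changed
success = np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)

# Keep only successful samples, exactly as the trainer does before extending the training set
x_kept, y_kept = x_adv[success], y_pred[success]
print(x_kept.shape)  # (2, 2): samples 1 and 3 flipped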
def _test_mnist_untargeted(self, classifier):
    # Get MNIST
    (_, _), (x_test, y_test) = self.mnist
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    df = SaliencyMapMethod(classifier, theta=1)
    # Note: batch_size=100 was chosen after timing generation with batch sizes 1, 10, 100 and 1000
    x_test_adv = df.generate(x_test, batch_size=100)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0.0 == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    acc = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial examples: %.2f%%', acc * 100)
def _set_targets(self, x, y, classifier_mixin=True):
    """
    Check and set up targets.

    :param x: An array with the original inputs.
    :type x: `np.ndarray`
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
              `(nb_samples,)`. Only provide this parameter if you'd like to use true labels when crafting adversarial
              samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained
              in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :type y: `np.ndarray`
    :param classifier_mixin: Whether the estimator is of type `ClassifierMixin`.
    :type classifier_mixin: `bool`
    :return: The targets.
    :rtype: `np.ndarray`
    """
    if classifier_mixin:
        y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")

        # Use model predictions as correct outputs
        if classifier_mixin:
            targets = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))
        else:
            targets = self.estimator.predict(x, batch_size=self.batch_size)
    else:
        targets = y

    return targets
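# Illustrative stand-in for the label normalisation that `check_and_transform_label_format`
# performs in `_set_targets` above: accept index labels of shape (n,) or one-hot labels of
# shape (n, nb_classes), and always hand back one-hot labels. This is a re-implementation
# of the observed behaviour for illustration, not the library helper itself.
import numpy as np

def to_one_hot_if_needed(y, nb_classes):
    """Return one-hot labels whether `y` arrives as indices or one-hot (or None)."""
    if y is None:
        return None
    y = np.asarray(y)
    if y.ndim == 1:  # index labels -> one-hot
        return np.eye(nb_classes)[y]
    return y         # already one-hot

print(to_one_hot_if_needed([1, 0, 2], 3))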
def _test_mnist_targeted(self, classifier):
    # Get MNIST
    (_, _), (x_test, _) = self.mnist

    # Test BIM with np.inf norm
    attack = BasicIterativeMethod(classifier, eps=1.0, eps_step=0.01, targeted=True)

    # Target the second most likely class as predicted on the clean inputs
    pred_sort = classifier.predict(x_test).argsort(axis=1)
    y_test_adv = np.zeros((x_test.shape[0], 10))
    for i in range(x_test.shape[0]):
        y_test_adv[i, pred_sort[i, -2]] = 1.0

    x_test_adv = attack.generate(x_test, y=y_test_adv)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertEqual(y_test_adv.shape, test_y_pred.shape)
    # This doesn't succeed for every sample, especially with small networks
    self.assertGreaterEqual((y_test_adv == test_y_pred).sum(), x_test.shape[0] // 2)
def _test_backend_mnist(self, classifier, x_train, y_train, x_test, y_test):
    x_test_original = x_test.copy()

    # Test PGD with np.inf norm
    attack = ProjectedGradientDescent(classifier, eps=1.0, eps_step=0.1)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info("Accuracy on adversarial train examples: %.2f%%", acc * 100)

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

    # Test PGD with 3 random initialisations
    attack = ProjectedGradientDescent(classifier, num_random_init=3)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info("Accuracy on adversarial train examples with 3 random initialisations: %.2f%%", acc * 100)

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info("Accuracy on adversarial test examples with 3 random initialisations: %.2f%%", acc * 100)

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    # Test the masking
    attack = ProjectedGradientDescent(classifier, num_random_init=1)
    mask = np.random.binomial(n=1, p=0.5, size=np.prod(x_test.shape))
    mask = mask.reshape(x_test.shape).astype(np.float32)

    x_test_adv = attack.generate(x_test, mask=mask)
    mask_diff = (1 - mask) * (x_test_adv - x_test)
    self.assertAlmostEqual(float(np.max(np.abs(mask_diff))), 0.0, delta=0.00001)

    # Test eps of array type 1
    attack = ProjectedGradientDescent(classifier, eps=1.0, eps_step=0.1)
    eps = np.ones(shape=x_test.shape) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    # Test eps of array type 2
    eps = np.ones(shape=x_test.shape[1:]) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    # Test eps of array type 3
    eps = np.ones(shape=x_test.shape[2:]) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    # Test eps of array type 4
    eps = np.ones(shape=x_test.shape[3:]) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `self.targeted` is true, then `y` represents the target labels.
    :param x_adv_init: Initial array to act as initial adversarial examples. Same shape as `x`.
    :type x_adv_init: `np.ndarray`
    :return: An array holding the adversarial examples.
    """
    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:  # pragma: no cover
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")

        # Use model predictions as correct outputs
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))  # type: ignore

    y = check_and_transform_label_format(y, self.estimator.nb_classes, return_one_hot=False)

    if y is not None and self.estimator.nb_classes == 2 and y.shape[1] == 1:
        raise ValueError(  # pragma: no cover
            "This attack has not yet been tested for binary classification with a single output classifier."
        )

    # Get clip_min and clip_max from the classifier or infer them from data
    if self.estimator.clip_values is not None:
        clip_min, clip_max = self.estimator.clip_values
    else:
        clip_min, clip_max = np.min(x), np.max(x)

    # Prediction from the original images
    preds = np.argmax(self.estimator.predict(x, batch_size=self.batch_size), axis=1)

    # Prediction from the initial adversarial examples if not None
    x_adv_init = kwargs.get("x_adv_init")

    if x_adv_init is not None:
        init_preds = np.argmax(self.estimator.predict(x_adv_init, batch_size=self.batch_size), axis=1)
    else:
        init_preds = [None] * len(x)
        x_adv_init = [None] * len(x)

    # Assert that, if attack is targeted, y is provided
    if self.targeted and y is None:  # pragma: no cover
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")

    # Some initial setups
    x_adv = x.astype(ART_NUMPY_DTYPE)

    # Generate the adversarial samples
    for ind, val in enumerate(tqdm(x_adv, desc="Boundary attack", disable=not self.verbose)):
        if self.targeted:
            x_adv[ind] = self._perturb(
                x=val,
                y=y[ind],
                y_p=preds[ind],
                init_pred=init_preds[ind],
                adv_init=x_adv_init[ind],
                clip_min=clip_min,
                clip_max=clip_max,
            )
        else:
            x_adv[ind] = self._perturb(
                x=val,
                y=-1,
                y_p=preds[ind],
                init_pred=init_preds[ind],
                adv_init=x_adv_init[ind],
                clip_min=clip_min,
                clip_max=clip_max,
            )

    y = to_categorical(y, self.estimator.nb_classes)

    logger.info(
        "Success rate of Boundary attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size),
    )

    return x_adv
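# Standalone sketch of the core geometric step behind decision-based attacks such as the
# Boundary attack: bisect between an adversarial point and the clean input to land just on
# the adversarial side of the decision boundary. The helper name and the toy 1-D decision
# rule are illustrative placeholders, not the attack's internal API.
import numpy as np

def binary_search_to_boundary(x_clean, x_adv, is_adversarial, steps=10):
    """Walk an adversarial point toward the clean input by bisection while it
    stays adversarial; returns a point close to the decision boundary."""
    low, high = 0.0, 1.0  # interpolation weights toward x_clean
    for _ in range(steps):
        mid = (low + high) / 2.0
        candidate = (1.0 - mid) * x_adv + mid * x_clean
        if is_adversarial(candidate):
            low = mid   # still adversarial: move closer to the clean input
        else:
            high = mid  # crossed the boundary: back off
    return (1.0 - low) * x_adv + low * x_clean

# Toy 1-D decision boundary at 0.5: anything above it counts as "adversarial"
x_boundary = binary_search_to_boundary(np.array([0.0]), np.array([1.0]), lambda z: z[0] > 0.5)
print(x_boundary)  # close to 0.5, just on the adversarial side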
def _test_backend_mnist(self, classifier, x_train, y_train, x_test, y_test):
    x_test_original = x_test.copy()

    # Test PGD with np.inf norm
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / len(y_train)
    logger.info("Accuracy on adversarial train examples: %.2f%%", acc * 100)

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test), axis=1)) / len(y_test)
    logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

    # Test PGD with 3 random initialisations
    attack = ProjectedGradientDescent(classifier, num_random_init=3, max_iter=5)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / len(y_train)
    logger.info("Accuracy on adversarial train examples with 3 random initialisations: %.2f%%", acc * 100)

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test), axis=1)) / len(y_test)
    logger.info("Accuracy on adversarial test examples with 3 random initialisations: %.2f%%", acc * 100)

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(np.array(x_test_original) - np.array(x_test)))), 0.0, delta=0.00001)
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
              `(nb_samples,)`. Only provide this parameter if you'd like to use true labels when crafting adversarial
              samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained
              in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :return: An array holding the adversarial examples.
    """
    if x.ndim != 4:
        raise ValueError("Unrecognized input dimension. Attack can only be applied to image data.")

    x_adv = x.astype(ART_NUMPY_DTYPE)

    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        # Use model predictions as true labels
        logger.info("Using model predictions as true labels.")
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    if self.estimator.channels_first:
        channels = x.shape[1]
        height = x.shape[2]
        width = x.shape[3]
    else:
        height = x.shape[1]
        width = x.shape[2]
        channels = x.shape[3]

    for _ in trange(self.nb_restarts, desc="SquareAttack - restarts", disable=not self.verbose):

        # Determine correctly predicted samples
        y_pred = self.estimator.predict(x_adv, batch_size=self.batch_size)
        sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)

        if np.sum(sample_is_robust) == 0:
            break

        # x_robust = x_adv[sample_is_robust]
        x_robust = x[sample_is_robust]
        y_robust = y[sample_is_robust]
        sample_logits_diff_init = self._get_logits_diff(x_robust, y_robust)

        if self.norm in [np.inf, "inf"]:

            if self.estimator.channels_first:
                size = (x_robust.shape[0], channels, 1, width)
            else:
                size = (x_robust.shape[0], 1, width, channels)

            # Add vertical stripe perturbations
            x_robust_new = np.clip(
                x_robust + self.eps * np.random.choice([-1, 1], size=size),
                a_min=self.estimator.clip_values[0],
                a_max=self.estimator.clip_values[1],
            ).astype(ART_NUMPY_DTYPE)

            sample_logits_diff_new = self._get_logits_diff(x_robust_new, y_robust)
            logits_diff_improved = (sample_logits_diff_new - sample_logits_diff_init) < 0.0

            x_robust[logits_diff_improved] = x_robust_new[logits_diff_improved]

            x_adv[sample_is_robust] = x_robust

            for i_iter in trange(
                self.max_iter, desc="SquareAttack - iterations", leave=False, disable=not self.verbose
            ):

                percentage_of_elements = self._get_percentage_of_elements(i_iter)

                # Determine correctly predicted samples
                y_pred = self.estimator.predict(x_adv, batch_size=self.batch_size)
                sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)

                if np.sum(sample_is_robust) == 0:
                    break

                x_robust = x_adv[sample_is_robust]
                x_init = x[sample_is_robust]
                y_robust = y[sample_is_robust]

                sample_logits_diff_init = self._get_logits_diff(x_robust, y_robust)

                height_tile = max(int(round(math.sqrt(percentage_of_elements * height * width))), 1)

                height_mid = np.random.randint(0, height - height_tile)
                width_start = np.random.randint(0, width - height_tile)

                delta_new = np.zeros(self.estimator.input_shape)

                if self.estimator.channels_first:
                    delta_new[
                        :, height_mid : height_mid + height_tile, width_start : width_start + height_tile
                    ] = np.random.choice([-2 * self.eps, 2 * self.eps], size=[channels, 1, 1])
                else:
                    delta_new[
                        height_mid : height_mid + height_tile, width_start : width_start + height_tile, :
                    ] = np.random.choice([-2 * self.eps, 2 * self.eps], size=[1, 1, channels])

                x_robust_new = x_robust + delta_new

                x_robust_new = np.minimum(np.maximum(x_robust_new, x_init - self.eps), x_init + self.eps)

                x_robust_new = np.clip(
                    x_robust_new, a_min=self.estimator.clip_values[0], a_max=self.estimator.clip_values[1]
                ).astype(ART_NUMPY_DTYPE)

                sample_logits_diff_new = self._get_logits_diff(x_robust_new, y_robust)
                logits_diff_improved = (sample_logits_diff_new - sample_logits_diff_init) < 0.0

                x_robust[logits_diff_improved] = x_robust_new[logits_diff_improved]

                x_adv[sample_is_robust] = x_robust

        elif self.norm == 2:
            n_tiles = 5
            height_tile = height // n_tiles

            def _get_perturbation(h):
                delta = np.zeros([h, h])
                gaussian_perturbation = np.zeros([h // 2, h])

                x_c = h // 4
                y_c = h // 2

                for i_y in range(y_c):
                    gaussian_perturbation[
                        max(x_c, 0) : min(x_c + (2 * i_y + 1), h // 2),
                        max(0, y_c) : min(y_c + (2 * i_y + 1), h),
                    ] += 1.0 / ((i_y + 1) ** 2)
                    x_c -= 1
                    y_c -= 1

                gaussian_perturbation /= np.sqrt(np.sum(gaussian_perturbation ** 2))

                delta[: h // 2] = gaussian_perturbation
                delta[h // 2 : h // 2 + gaussian_perturbation.shape[0]] = -gaussian_perturbation

                delta /= np.sqrt(np.sum(delta ** 2))

                if random.random() > 0.5:
                    delta = np.transpose(delta)

                if random.random() > 0.5:
                    delta = -delta

                return delta

            delta_init = np.zeros(x_robust.shape, dtype=ART_NUMPY_DTYPE)

            height_start = 0
            for _ in range(n_tiles):
                width_start = 0
                for _ in range(n_tiles):
                    if self.estimator.channels_first:
                        perturbation_size = (1, 1, height_tile, height_tile)
                        random_size = (x_robust.shape[0], channels, 1, 1)
                    else:
                        perturbation_size = (1, height_tile, height_tile, 1)
                        random_size = (x_robust.shape[0], 1, 1, channels)

                    perturbation = _get_perturbation(height_tile).reshape(perturbation_size) * np.random.choice(
                        [-1, 1], size=random_size
                    )

                    if self.estimator.channels_first:
                        delta_init[
                            :, :, height_start : height_start + height_tile, width_start : width_start + height_tile
                        ] += perturbation
                    else:
                        delta_init[
                            :, height_start : height_start + height_tile, width_start : width_start + height_tile, :
                        ] += perturbation
                    width_start += height_tile
                height_start += height_tile

            x_robust_new = np.clip(
                x_robust + delta_init / np.sqrt(np.sum(delta_init ** 2, axis=(1, 2, 3), keepdims=True)) * self.eps,
                self.estimator.clip_values[0],
                self.estimator.clip_values[1],
            )

            sample_logits_diff_new = self._get_logits_diff(x_robust_new, y_robust)
            logits_diff_improved = (sample_logits_diff_new - sample_logits_diff_init) < 0.0

            x_robust[logits_diff_improved] = x_robust_new[logits_diff_improved]

            x_adv[sample_is_robust] = x_robust

            for i_iter in trange(
                self.max_iter, desc="SquareAttack - iterations", leave=False, disable=not self.verbose
            ):

                percentage_of_elements = self._get_percentage_of_elements(i_iter)

                # Determine correctly predicted samples
                y_pred = self.estimator.predict(x_adv, batch_size=self.batch_size)
                sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)

                if np.sum(sample_is_robust) == 0:
                    break

                x_robust = x_adv[sample_is_robust]
                x_init = x[sample_is_robust]
                y_robust = y[sample_is_robust]

                sample_logits_diff_init = self._get_logits_diff(x_robust, y_robust)

                delta_x_robust_init = x_robust - x_init

                height_tile = max(int(round(math.sqrt(percentage_of_elements * height * width))), 3)

                if height_tile % 2 == 0:
                    height_tile += 1
                height_tile_2 = height_tile

                height_start = np.random.randint(0, height - height_tile)
                width_start = np.random.randint(0, width - height_tile)

                new_deltas_mask = np.zeros(x_init.shape)

                if self.estimator.channels_first:
                    new_deltas_mask[
                        :, :, height_start : height_start + height_tile, width_start : width_start + height_tile
                    ] = 1.0
                    w_1_norm = np.sqrt(
                        np.sum(
                            delta_x_robust_init[
                                :, :, height_start : height_start + height_tile,
                                width_start : width_start + height_tile,
                            ] ** 2,
                            axis=(2, 3),
                            keepdims=True,
                        )
                    )
                else:
                    new_deltas_mask[
                        :, height_start : height_start + height_tile, width_start : width_start + height_tile, :
                    ] = 1.0
                    w_1_norm = np.sqrt(
                        np.sum(
                            delta_x_robust_init[
                                :, height_start : height_start + height_tile,
                                width_start : width_start + height_tile, :,
                            ] ** 2,
                            axis=(1, 2),
                            keepdims=True,
                        )
                    )

                height_2_start = np.random.randint(0, height - height_tile_2)
                width_2_start = np.random.randint(0, width - height_tile_2)

                new_deltas_mask_2 = np.zeros(x_init.shape)
                if self.estimator.channels_first:
                    new_deltas_mask_2[
                        :, :, height_2_start : height_2_start + height_tile_2,
                        width_2_start : width_2_start + height_tile_2,
                    ] = 1.0
                else:
                    new_deltas_mask_2[
                        :, height_2_start : height_2_start + height_tile_2,
                        width_2_start : width_2_start + height_tile_2, :,
                    ] = 1.0

                norms_x_robust = np.sqrt(np.sum((x_robust - x_init) ** 2, axis=(1, 2, 3), keepdims=True))
                w_norm = np.sqrt(
                    np.sum(
                        (delta_x_robust_init * np.maximum(new_deltas_mask, new_deltas_mask_2)) ** 2,
                        axis=(1, 2, 3),
                        keepdims=True,
                    )
                )

                if self.estimator.channels_first:
                    new_deltas_size = [x_init.shape[0], channels, height_tile, height_tile]
                    random_choice_size = [x_init.shape[0], channels, 1, 1]
                    perturbation_size = [1, 1, height_tile, height_tile]
                else:
                    new_deltas_size = [x_init.shape[0], height_tile, height_tile, channels]
                    random_choice_size = [x_init.shape[0], 1, 1, channels]
                    perturbation_size = [1, height_tile, height_tile, 1]

                delta_new = (
                    np.ones(new_deltas_size)
                    * _get_perturbation(height_tile).reshape(perturbation_size)
                    * np.random.choice([-1, 1], size=random_choice_size)
                )

                if self.estimator.channels_first:
                    delta_new += delta_x_robust_init[
                        :, :, height_start : height_start + height_tile, width_start : width_start + height_tile
                    ] / (np.maximum(1e-9, w_1_norm))
                else:
                    delta_new += delta_x_robust_init[
                        :, height_start : height_start + height_tile, width_start : width_start + height_tile, :
                    ] / (np.maximum(1e-9, w_1_norm))

                diff_norm = (self.eps * np.ones(delta_new.shape)) ** 2 - norms_x_robust ** 2
                diff_norm[diff_norm < 0.0] = 0.0

                if self.estimator.channels_first:
                    delta_new /= np.sqrt(np.sum(delta_new ** 2, axis=(2, 3), keepdims=True)) * np.sqrt(
                        diff_norm / channels + w_norm ** 2
                    )
                    delta_x_robust_init[
                        :, :, height_2_start : height_2_start + height_tile_2,
                        width_2_start : width_2_start + height_tile_2,
                    ] = 0.0
                    delta_x_robust_init[
                        :, :, height_start : height_start + height_tile, width_start : width_start + height_tile
                    ] = delta_new
                else:
                    delta_new /= np.sqrt(np.sum(delta_new ** 2, axis=(1, 2), keepdims=True)) * np.sqrt(
                        diff_norm / channels + w_norm ** 2
                    )
                    delta_x_robust_init[
                        :, height_2_start : height_2_start + height_tile_2,
                        width_2_start : width_2_start + height_tile_2, :,
                    ] = 0.0
                    delta_x_robust_init[
                        :, height_start : height_start + height_tile, width_start : width_start + height_tile, :
                    ] = delta_new

                x_robust_new = np.clip(
                    x_init
                    + self.eps
                    * delta_x_robust_init
                    / np.sqrt(np.sum(delta_x_robust_init ** 2, axis=(1, 2, 3), keepdims=True)),
                    self.estimator.clip_values[0],
                    self.estimator.clip_values[1],
                )

                sample_logits_diff_new = self._get_logits_diff(x_robust_new, y_robust)
                logits_diff_improved = (sample_logits_diff_new - sample_logits_diff_init) < 0.0

                x_robust[logits_diff_improved] = x_robust_new[logits_diff_improved]

                x_adv[sample_is_robust] = x_robust

    return x_adv
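# Standalone sketch of the L_inf square update above: drop a random square of +/-2*eps
# values onto the current image, then project back into the eps-ball around the original
# and into the valid pixel range. All names and sizes here are toy placeholders.
import numpy as np

rng = np.random.default_rng(0)
eps = 0.05
x = rng.random((8, 8, 3)).astype(np.float32)  # one HWC image in [0, 1]
x_adv = x.copy()

# Pick a random square location and fill it per-channel with +/- 2*eps
tile = 3
r, c = rng.integers(0, 8 - tile, size=2)
delta = np.zeros_like(x)
delta[r : r + tile, c : c + tile, :] = rng.choice([-2 * eps, 2 * eps], size=(1, 1, 3))

# Project onto the L_inf ball around x, then clip to the data range
x_candidate = np.clip(np.clip(x_adv + delta, x - eps, x + eps), 0.0, 1.0)
print(np.max(np.abs(x_candidate - x)) <= eps + 1e-6)  # True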
def generate(self, x, y=None):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are the
              original class labels.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    # Copy the inputs so the attack does not modify the original array in place
    x_adv = x.copy()

    if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
        clip_min, clip_max = self.classifier.clip_values
    else:
        clip_min, clip_max = np.amin(x), np.amax(x)

    # Assert that, if attack is targeted, y is provided
    if self.targeted and y is None:
        raise ValueError('Target labels `y` need to be provided for a targeted attack.')

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self.classifier.predict(x, logits=False))

    # Compute perturbation with implicit batching
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
    for batch_id in range(nb_batches):
        logger.debug('Processing batch %i out of %i', batch_id, nb_batches)

        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        x_batch = x_adv[batch_index_1:batch_index_2]
        y_batch = y[batch_index_1:batch_index_2]

        # The optimization is performed in tanh space to keep the adversarial images bounded in the correct range
        x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother)

        # Initialize binary search
        c = self.initial_const * np.ones(x_batch.shape[0])
        c_lower_bound = np.zeros(x_batch.shape[0])
        c_double = (np.ones(x_batch.shape[0]) > 0)

        # Initialize placeholders for best l2 distance and attack found so far
        best_l2dist = np.inf * np.ones(x_batch.shape[0])
        best_x_adv_batch = x_batch.copy()

        for bss in range(self.binary_search_steps):
            logger.debug('Binary search step %i out of %i (c_mean==%f)',
                         bss, self.binary_search_steps, np.mean(c))
            nb_active = int(np.sum(c < self._c_upper_bound))
            logger.debug('Number of samples with c < _c_upper_bound: %i out of %i', nb_active, x_batch.shape[0])
            if nb_active == 0:
                break
            lr = self.learning_rate * np.ones(x_batch.shape[0])

            # Initialize perturbation in tanh space
            x_adv_batch = x_batch.copy()
            x_adv_batch_tanh = x_batch_tanh.copy()

            z, l2dist, loss = self._loss(x_batch, x_adv_batch, y_batch, c)
            attack_success = (loss - l2dist <= 0)
            overall_attack_success = attack_success

            for it in range(self.max_iter):
                logger.debug('Iteration step %i out of %i', it, self.max_iter)
                logger.debug('Average Loss: %f', np.mean(loss))
                logger.debug('Average L2Dist: %f', np.mean(l2dist))
                logger.debug('Average Margin Loss: %f', np.mean(loss - l2dist))
                logger.debug('Current number of succeeded attacks: %i out of %i',
                             int(np.sum(attack_success)), len(attack_success))

                improved_adv = attack_success & (l2dist < best_l2dist)
                logger.debug('Number of improved L2 distances: %i', int(np.sum(improved_adv)))
                if np.sum(improved_adv) > 0:
                    best_l2dist[improved_adv] = l2dist[improved_adv]
                    best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv]

                active = (c < self._c_upper_bound) & (lr > 0)
                nb_active = int(np.sum(active))
                logger.debug('Number of samples with c < _c_upper_bound and lr > 0: %i out of %i',
                             nb_active, x_batch.shape[0])
                if nb_active == 0:
                    break

                # Compute gradient
                logger.debug('Compute loss gradient')
                perturbation_tanh = -self._loss_gradient(z[active], y_batch[active], x_batch[active],
                                                         x_adv_batch[active], x_adv_batch_tanh[active],
                                                         c[active], clip_min, clip_max)

                # Perform line search to optimize perturbation.
                # First, halve the learning rate until the perturbation actually decreases the loss
                prev_loss = loss.copy()
                best_loss = loss.copy()
                best_lr = np.zeros(x_batch.shape[0])
                halving = np.zeros(x_batch.shape[0])

                for h in range(self.max_halving):
                    logger.debug('Perform halving iteration %i out of %i', h, self.max_halving)
                    do_halving = (loss[active] >= prev_loss[active])
                    logger.debug('Halving to be performed on %i samples', int(np.sum(do_halving)))
                    if np.sum(do_halving) == 0:
                        break
                    active_and_do_halving = active.copy()
                    active_and_do_halving[active] = do_halving

                    lr_mult = lr[active_and_do_halving]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_halving] + \
                        lr_mult * perturbation_tanh[do_halving]
                    new_x_adv_batch = tanh_to_original(new_x_adv_batch_tanh, clip_min, clip_max,
                                                       self._tanh_smoother)
                    _, l2dist[active_and_do_halving], loss[active_and_do_halving] = \
                        self._loss(x_batch[active_and_do_halving], new_x_adv_batch,
                                   y_batch[active_and_do_halving], c[active_and_do_halving])

                    logger.debug('New Average Loss: %f', np.mean(loss))
                    logger.debug('New Average L2Dist: %f', np.mean(l2dist))
                    logger.debug('New Average Margin Loss: %f', np.mean(loss - l2dist))

                    best_lr[loss < best_loss] = lr[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]
                    lr[active_and_do_halving] /= 2
                    halving[active_and_do_halving] += 1
                lr[active] *= 2

                # If no halving was actually required, double the learning rate as long as this decreases the loss
                for d in range(self.max_doubling):
                    logger.debug('Perform doubling iteration %i out of %i', d, self.max_doubling)
                    do_doubling = (halving[active] == 1) & (loss[active] <= best_loss[active])
                    logger.debug('Doubling to be performed on %i samples', int(np.sum(do_doubling)))
                    if np.sum(do_doubling) == 0:
                        break
                    active_and_do_doubling = active.copy()
                    active_and_do_doubling[active] = do_doubling
                    lr[active_and_do_doubling] *= 2

                    lr_mult = lr[active_and_do_doubling]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_doubling] + \
                        lr_mult * perturbation_tanh[do_doubling]
                    new_x_adv_batch = tanh_to_original(new_x_adv_batch_tanh, clip_min, clip_max,
                                                       self._tanh_smoother)
                    _, l2dist[active_and_do_doubling], loss[active_and_do_doubling] = \
                        self._loss(x_batch[active_and_do_doubling], new_x_adv_batch,
                                   y_batch[active_and_do_doubling], c[active_and_do_doubling])

                    logger.debug('New Average Loss: %f', np.mean(loss))
                    logger.debug('New Average L2Dist: %f', np.mean(l2dist))
                    logger.debug('New Average Margin Loss: %f', np.mean(loss - l2dist))

                    best_lr[loss < best_loss] = lr[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]

                lr[halving == 1] /= 2

                update_adv = (best_lr[active] > 0)
                logger.debug('Number of adversarial samples to be finally updated: %i', int(np.sum(update_adv)))

                if np.sum(update_adv) > 0:
                    active_and_update_adv = active.copy()
                    active_and_update_adv[active] = update_adv
                    best_lr_mult = best_lr[active_and_update_adv]
                    for _ in range(len(x.shape) - 1):
                        best_lr_mult = best_lr_mult[:, np.newaxis]

                    x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[active_and_update_adv] + \
                        best_lr_mult * perturbation_tanh[update_adv]
                    x_adv_batch[active_and_update_adv] = tanh_to_original(
                        x_adv_batch_tanh[active_and_update_adv], clip_min, clip_max, self._tanh_smoother)
                    z[active_and_update_adv], l2dist[active_and_update_adv], loss[active_and_update_adv] = \
                        self._loss(x_batch[active_and_update_adv], x_adv_batch[active_and_update_adv],
                                   y_batch[active_and_update_adv], c[active_and_update_adv])

                    attack_success = (loss - l2dist <= 0)
                    overall_attack_success = overall_attack_success | attack_success

            # Update depending on attack success
            improved_adv = attack_success & (l2dist < best_l2dist)
            logger.debug('Number of improved L2 distances: %i', int(np.sum(improved_adv)))
            if np.sum(improved_adv) > 0:
                best_l2dist[improved_adv] = l2dist[improved_adv]
                best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv]

            c_double[overall_attack_success] = False
            c[overall_attack_success] = (c_lower_bound + c)[overall_attack_success] / 2

            c_old = c
            c[~overall_attack_success & c_double] *= 2
            c[~overall_attack_success & ~c_double] += \
                (c - c_lower_bound)[~overall_attack_success & ~c_double] / 2
            c_lower_bound[~overall_attack_success] = c_old[~overall_attack_success]

        x_adv[batch_index_1:batch_index_2] = best_x_adv_batch

    rate = 100 * compute_success(self.classifier, x, y, x_adv, self.targeted)
    TrackedCW.tracked_x.append((x_adv, rate, batch_id, best_l2dist.mean()))
    logger.info('Success rate of C&W L_2 attack: %.2f%%', rate)
    return x_adv
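# Standalone sketch of the tanh change of variables used by the C&W attack above: optimize
# in an unbounded space so the decoded image always stays inside [clip_min, clip_max]. The
# helpers below are re-implementations for illustration (following the smoothing convention
# visible in this code, a constant slightly below 1), not the library's own utilities.
import numpy as np

def original_to_tanh(x, clip_min, clip_max, tanh_smoother=0.999999):
    """Map [clip_min, clip_max] to the real line; the smoother keeps arctanh finite."""
    x01 = (x - clip_min) / (clip_max - clip_min)
    return np.arctanh((x01 * 2 - 1) * tanh_smoother)

def tanh_to_original(x_tanh, clip_min, clip_max, tanh_smoother=0.999999):
    """Inverse map: any real-valued point decodes back into [clip_min, clip_max]."""
    x01 = (np.tanh(x_tanh) / tanh_smoother + 1) / 2
    return x01 * (clip_max - clip_min) + clip_min

x = np.array([0.0, 0.25, 1.0])
print(np.allclose(tanh_to_original(original_to_tanh(x, 0.0, 1.0), 0.0, 1.0), x))  # True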
def setUpClass(cls):
    k.set_learning_phase(1)

    # Get MNIST
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
    cls.mnist = (x_train, y_train), (x_test, y_test)

    # Keras classifier
    cls.classifier_k = cls._cnn_mnist_k([28, 28, 1])
    cls.classifier_k.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epochs=2)

    scores = cls.classifier_k._model.evaluate(x_train, y_train)
    logger.info('[Keras, MNIST] Accuracy on training set: %.2f%%', scores[1] * 100)
    scores = cls.classifier_k._model.evaluate(x_test, y_test)
    logger.info('[Keras, MNIST] Accuracy on test set: %.2f%%', scores[1] * 100)

    # Create basic CNN on MNIST using TensorFlow
    cls.classifier_tf = cls._cnn_mnist_tf([28, 28, 1])
    cls.classifier_tf.fit(x_train, y_train, nb_epochs=2, batch_size=BATCH_SIZE)

    scores = get_labels_np_array(cls.classifier_tf.predict(x_train))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info('[TF, MNIST] Accuracy on training set: %.2f%%', acc * 100)

    scores = get_labels_np_array(cls.classifier_tf.predict(x_test))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('[TF, MNIST] Accuracy on test set: %.2f%%', acc * 100)

    # Create basic PyTorch model
    cls.classifier_py = cls._cnn_mnist_py()
    x_train, x_test = np.swapaxes(x_train, 1, 3), np.swapaxes(x_test, 1, 3)
    cls.classifier_py.fit(x_train, y_train, nb_epochs=2, batch_size=BATCH_SIZE)

    scores = get_labels_np_array(cls.classifier_py.predict(x_train))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info('[PyTorch, MNIST] Accuracy on training set: %.2f%%', acc * 100)

    scores = get_labels_np_array(cls.classifier_py.predict(x_test))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('[PyTorch, MNIST] Accuracy on test set: %.2f%%', acc * 100)
def generate(self, x, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are the
              original class labels.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    x_adv = x.copy()
    (clip_min, clip_max) = self.classifier.clip_values

    # Parse and save attack-specific parameters
    params_cpy = dict(kwargs)
    y = params_cpy.pop(str('y'), None)
    self.set_params(**params_cpy)

    # Assert that, if attack is targeted, y is provided
    assert not (self.targeted and y is None)

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self.classifier.predict(x, logits=False))

    for j, (ex, target) in enumerate(zip(x_adv, y)):
        image = ex.copy()

        # The optimization is performed in tanh space to keep the adversarial images bounded by clip_min and
        # clip_max. To avoid division by zero (which occurs if arguments of arctanh are +1 or -1), we multiply
        # arguments with _tanh_smoother. It appears this is what Carlini and Wagner (2016) are alluding to in
        # their footnote 8. However, it is not clear how their proposed trick ("instead of scaling by 1/2 we
        # scale by 1/2 + eps") would actually work.
        image_tanh = np.clip(image, clip_min, clip_max)
        image_tanh = (image_tanh - clip_min) / (clip_max - clip_min)
        image_tanh = np.arctanh(((image_tanh * 2) - 1) * self._tanh_smoother)

        # Initialize binary search
        c = self.initial_const
        c_lower_bound = 0
        c_double = True

        # Initialize placeholders for best l2 distance and attack found so far
        best_l2dist = sys.float_info.max
        best_adv_image = image

        for _ in range(self.binary_search_steps):
            attack_success = False
            loss_prev = sys.float_info.max
            lr = self.learning_rate

            # Initialize perturbation in tanh space
            perturbation_tanh = np.zeros(image_tanh.shape)

            for it in range(self.max_iter):
                # First transform current adversarial sample from tanh to original space
                adv_image = image_tanh + perturbation_tanh
                adv_image = (np.tanh(adv_image) / self._tanh_smoother + 1) / 2
                adv_image = adv_image * (clip_max - clip_min) + clip_min

                # Collect current logits, loss and l2 distance
                z, l2dist, loss = self.loss(image, adv_image, target, c)

                last_attack_success = loss - l2dist <= 0
                attack_success = attack_success or last_attack_success

                if last_attack_success:
                    if l2dist < best_l2dist:
                        best_l2dist = l2dist
                        best_adv_image = adv_image
                    break
                else:
                    if self.targeted:
                        i_sub, i_add = np.argmax(target), np.argmax(z * (1 - target))
                    else:
                        i_add, i_sub = np.argmax(target), np.argmax(z * (1 - target))

                    grad_l2p = self.classifier.class_gradient(np.array([adv_image]), label=i_add, logits=True)[0]
                    grad_l2p -= self.classifier.class_gradient(np.array([adv_image]), label=i_sub, logits=True)[0]
                    grad_l2p *= c
                    grad_l2p += 2 * (adv_image - image)
                    grad_l2p *= (clip_max - clip_min)
                    grad_l2p *= (1 - np.square(np.tanh(image_tanh + perturbation_tanh))) / (2 * self._tanh_smoother)

                    # Update the perturbation with decayed learning rate
                    lr *= (1. / (1. + self.decay * it))
                    perturbation_tanh -= lr * grad_l2p[0]
                    loss_prev = loss

            # Update binary search
            if attack_success:
                c_double = False
                c = (c_lower_bound + c) / 2
            else:
                c_old = c
                if c_double:
                    c = 2 * c
                else:
                    c = c + (c - c_lower_bound) / 2
                c_lower_bound = c_old

            # Abort binary search if c exceeds upper bound
            if c > self._c_upper_bound:
                break

        x_adv[j] = best_adv_image

    return x_adv
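# Standalone sketch of the binary search over the trade-off constant c performed above:
# after a success, c shrinks toward its lower bound; after a failure, it doubles (or grows
# by half the gap once a success has ever been seen). Names are illustrative placeholders.
import numpy as np

def update_c(c, c_lower_bound, c_double, attack_success):
    """One binary-search update on c; returns (c, c_lower_bound, c_double)."""
    if attack_success:
        return (c_lower_bound + c) / 2, c_lower_bound, False
    c_old = c
    c = 2 * c if c_double else c + (c - c_lower_bound) / 2
    return c, c_old, c_double

c, lo, double = 0.01, 0.0, True
for success in [False, False, True, True]:  # a made-up success pattern
    c, lo, double = update_c(c, lo, double, success)
    print(round(c, 4), round(lo, 4), double)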
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `self.targeted` is true, then `y` represents the target labels.
    :return: The adversarial examples.
    """
    if y is not None:
        y = check_and_transform_label_format(y, self.estimator.nb_classes, return_one_hot=True)

    if y is not None and self.estimator.nb_classes == 2 and y.shape[1] == 1:  # pragma: no cover
        raise ValueError(
            "This attack has not yet been tested for binary classification with a single output classifier."
        )

    x_adv = x.copy()

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:  # pragma: no cover
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")

        # Use model predictions as correct outputs
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))  # type: ignore

    # Get clip_min and clip_max from the classifier or infer them from data
    if self.estimator.clip_values is not None:
        self.clip_min, self.clip_max = self.estimator.clip_values
    else:
        self.clip_min, self.clip_max = np.min(x), np.max(x)

    # Check for square input images
    if (self.estimator.channels_first and x.shape[2] != x.shape[3]) or (  # pragma: no cover
        not self.estimator.channels_first and x.shape[1] != x.shape[2]
    ):
        raise ValueError("Input images `x` have to be square.")

    # Create or load DCT basis
    image_size = x.shape[2]
    logger.info("Create or load DCT basis.")
    path = f"2d_dct_basis_{self.sub_dim}_{image_size}.npy"
    if os.path.exists(path):
        self.sub_basis = np.load(path).astype(ART_NUMPY_DTYPE)
    else:
        self.sub_basis = self._generate_2d_dct_basis(sub_dim=self.sub_dim, res=image_size).astype(ART_NUMPY_DTYPE)
        np.save(path, self.sub_basis)

    for i in trange(x.shape[0], desc="GeoDA - samples", disable=not self.verbose, position=0):
        x_i = x[[i]]
        y_i = y[[i]]

        # Reset number of calls
        self.nb_calls = 0

        # Random search
        x_random = self._find_random_adversarial(x=x_i, y=y_i)
        logger.info("Random search adversarial example is adversarial: %r", self._is_adversarial(x_random, y_i))

        # Binary search
        x_boundary = self._binary_search(x_i, y_i, x_random, tol=self.bin_search_tol)
        logger.info("Binary search example at boundary is adversarial: %r", self._is_adversarial(x_boundary, y_i))

        grad = np.zeros_like(x_i)
        x_adv_i = x_i

        for k in trange(self.iterate, desc="GeoDA - steps", disable=not self.verbose, position=1):
            grad_oi, _ = self._black_grad_batch(x_boundary, self.q_opt_iter[k], self.batch_size, y_i)
            grad = grad_oi + grad
            x_adv_i = self._go_to_boundary(x_i, y_i, grad)
            x_adv_i = self._binary_search(x_i, y_i, x_adv_i, tol=self.bin_search_tol)
            x_boundary = x_adv_i

        x_adv_i = np.clip(x_adv_i, a_min=self.clip_min, a_max=self.clip_max)
        x_adv[i] = x_adv_i

    return x_adv
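# Standalone sketch of the disk-caching pattern used for the DCT basis above: an expensive
# array is built once and reloaded on later runs. The file name and builder below are toy
# placeholders, not the attack's actual basis generator.
import os
import numpy as np

def load_or_create(path, builder):
    """Load a cached array from `path` if present, otherwise build and cache it."""
    if os.path.exists(path):
        return np.load(path)
    basis = builder()
    np.save(path, basis)
    return basis

basis = load_or_create("toy_basis.npy", lambda: np.eye(4, dtype=np.float32))
print(basis.shape)  # (4, 4), built on the first call and read from disk afterwards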
def generate(self, x, y=None, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :type x: `np.ndarray`
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
              `(nb_samples,)`. Only provide this parameter if you'd like to use true labels when crafting adversarial
              samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained
              in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    y = check_and_transform_label_format(y, self.classifier.nb_classes())

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError('Target labels `y` need to be provided for a targeted attack.')

        # Use model predictions as correct outputs
        targets = get_labels_np_array(self.classifier.predict(x, batch_size=self.batch_size))
    else:
        targets = y

    adv_x_best = None
    rate_best = None

    if self.random_eps:
        ratio = self.eps_step / self.eps
        self.eps = np.round(self.norm_dist.rvs(1)[0], 10)
        self.eps_step = ratio * self.eps

    for _ in range(max(1, self.num_random_init)):
        adv_x = x.astype(ART_NUMPY_DTYPE)

        for i_max_iter in range(self.max_iter):
            adv_x = self._compute(adv_x, x, targets, self.eps, self.eps_step, self._project,
                                  self.num_random_init > 0 and i_max_iter == 0)

        if self.num_random_init > 1:
            rate = 100 * compute_success(self.classifier, x, targets, adv_x, self.targeted,
                                         batch_size=self.batch_size)
            if rate_best is None or rate > rate_best or adv_x_best is None:
                rate_best = rate
                adv_x_best = adv_x
        else:
            adv_x_best = adv_x

    logger.info('Success rate of attack: %.2f%%',
                rate_best if rate_best is not None else
                100 * compute_success(self.classifier, x, targets, adv_x_best, self.targeted,
                                      batch_size=self.batch_size))

    return adv_x_best
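# Standalone sketch of a single projected gradient step under the L_inf norm, the update
# that `_compute` iterates above: ascend along the sign of the gradient, project back into
# the eps-ball around the clean input, then clip to the data range. The gradient here is a
# random stand-in; all names are illustrative placeholders.
import numpy as np

def pgd_step_linf(x_adv, x, grad, eps, eps_step, clip_min=0.0, clip_max=1.0):
    """One L_inf PGD step with projection onto the eps-ball and the data range."""
    x_adv = x_adv + eps_step * np.sign(grad)
    x_adv = np.clip(x_adv, x - eps, x + eps)   # projection onto the eps-ball
    return np.clip(x_adv, clip_min, clip_max)  # respect pixel bounds

rng = np.random.default_rng(0)
x = rng.random((2, 4)).astype(np.float32)
grad = rng.normal(size=x.shape)  # stand-in for a loss gradient
x_adv = pgd_step_linf(x.copy(), x, grad, eps=0.1, eps_step=0.02)
print(np.max(np.abs(x_adv - x)) <= 0.1)  # True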
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,).
    :return: An array holding the adversarial examples.
    """
    if y is not None:
        y = check_and_transform_label_format(y, self.estimator.nb_classes)

    # Check that `y` is provided for targeted attacks
    if self.targeted and y is None:  # pragma: no cover
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    if self.estimator.nb_classes == 2 and y.shape[1] == 1:  # pragma: no cover
        raise ValueError(
            "This attack has not yet been tested for binary classification with a single output classifier."
        )

    # Compute adversarial examples with implicit batching
    nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
    x_adv_list = []
    for batch_id in trange(nb_batches, desc="ZOO", disable=not self.verbose):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        x_batch = x[batch_index_1:batch_index_2]
        y_batch = y[batch_index_1:batch_index_2]
        res = self._generate_batch(x_batch, y_batch)
        x_adv_list.append(res)
    x_adv = np.vstack(x_adv_list)

    # Apply clip
    if self.estimator.clip_values is not None:
        clip_min, clip_max = self.estimator.clip_values
        np.clip(x_adv, clip_min, clip_max, out=x_adv)

    # Log success rate of the ZOO attack
    logger.info(
        "Success rate of ZOO attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size),
    )

    return x_adv
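# Standalone sketch of the zeroth-order estimate at the heart of ZOO, which the batched
# `_generate_batch` call above relies on: a symmetric finite difference of the loss along
# one coordinate. The loss function and step size below are toy placeholders.
import numpy as np

def zoo_coordinate_grad(loss, x, idx, h=1e-4):
    """Symmetric finite-difference estimate of d loss / d x.flat[idx]."""
    e = np.zeros_like(x)
    e.flat[idx] = h
    return (loss(x + e) - loss(x - e)) / (2 * h)

loss = lambda x: float(np.sum(x ** 2))  # toy loss with known gradient 2x
x = np.array([[0.5, -1.0]])
print(zoo_coordinate_grad(loss, x, idx=1))  # approx. -2.0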
def _test_attack(self, classifier, x_test, y_test, targeted):
    """
    Test with SimBA.

    :return: None
    """
    x_test_original = x_test.copy()

    # Set the targeted label
    if targeted:
        y_target = np.zeros(10)
        y_target[8] = 1.0

    #######
    # dct #
    #######
    df = SimBA(classifier, attack="dct", targeted=targeted)

    x_i = x_test_original[0][None, ...]
    if targeted:
        x_test_adv = df.generate(x_i, y=y_target.reshape(1, 10))
    else:
        x_test_adv = df.generate(x_i)

    for i in range(1, len(x_test_original)):
        x_i = x_test_original[i][None, ...]
        if targeted:
            tmp_x_test_adv = df.generate(x_i, y=y_target.reshape(1, 10))
        else:
            tmp_x_test_adv = df.generate(x_i)
        x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0.0 == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    ######
    # px #
    ######
    df_px = SimBA(classifier, attack="px", targeted=targeted)

    x_i = x_test_original[0][None, ...]
    if targeted:
        x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
    else:
        x_test_adv = df_px.generate(x_i)

    for i in range(1, len(x_test_original)):
        x_i = x_test_original[i][None, ...]
        if targeted:
            tmp_x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
        else:
            tmp_x_test_adv = df_px.generate(x_i)
        x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0.0 == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    #############
    # px - diag #
    #############
    df_px = SimBA(classifier, attack="px", targeted=targeted, order="diag")

    x_i = x_test_original[0][None, ...]
    if targeted:
        x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
    else:
        x_test_adv = df_px.generate(x_i)

    for i in range(1, len(x_test_original)):
        x_i = x_test_original[i][None, ...]
        if targeted:
            tmp_x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
        else:
            tmp_x_test_adv = df_px.generate(x_i)
        x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])

    self.assertFalse((x_test == x_test_adv).all())
    self.assertFalse((0.0 == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == y_pred).all())

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)
def _test_backend_mnist(self, classifier, x_train, y_train, x_test, y_test):
    x_test_original = x_test.copy()

    # Test BIM with np.inf norm
    attack = BasicIterativeMethod(classifier, eps=1.0, eps_step=0.1, batch_size=128, verbose=False)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info("Accuracy on adversarial train examples: %.2f%%", acc * 100)

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    # Test eps of array type 1
    eps = np.ones(shape=x_test.shape) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    # Test eps of array type 2
    eps = np.ones(shape=x_test.shape[1:]) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    # Test eps of array type 3
    eps = np.ones(shape=x_test.shape[2:]) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    # Test eps of array type 4
    eps = np.ones(shape=x_test.shape[3:]) * 1.0
    eps_step = np.ones_like(eps) * 0.1

    attack_params = {"eps_step": eps_step, "eps": eps}
    attack.set_params(**attack_params)

    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
              `(nb_samples,)`. Only provide this parameter if you'd like to use true labels when crafting adversarial
              samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained
              in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations.
                 Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any
                 features for which the mask is zero will not be adversarially perturbed.
    :type mask: `np.ndarray`
    :return: An array holding the adversarial examples.
    """
    mask = kwargs.get("mask")

    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size)).astype(np.int32)

    x_adv = x.astype(ART_NUMPY_DTYPE)

    for _ in trange(max(1, self.nb_random_init), desc="AutoPGD - restart", disable=not self.verbose):
        # Determine correctly predicted samples
        y_pred = self.estimator.predict(x_adv)
        if self.targeted:
            sample_is_robust = np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)
        else:
            sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)

        if np.sum(sample_is_robust) == 0:
            break

        x_robust = x_adv[sample_is_robust]
        y_robust = y[sample_is_robust]
        x_init = x[sample_is_robust]

        n = x_robust.shape[0]
        m = np.prod(x_robust.shape[1:]).item()
        random_perturbation = (
            random_sphere(n, m, self.eps, self.norm).reshape(x_robust.shape).astype(ART_NUMPY_DTYPE)
        )

        x_robust = x_robust + random_perturbation

        if self.estimator.clip_values is not None:
            clip_min, clip_max = self.estimator.clip_values
            x_robust = np.clip(x_robust, clip_min, clip_max)

        perturbation = projection(x_robust - x_init, self.eps, self.norm)
        x_robust = x_init + perturbation

        # Compute perturbation with implicit batching
        for batch_id in trange(
            int(np.ceil(x_robust.shape[0] / float(self.batch_size))),
            desc="AutoPGD - batch",
            leave=False,
            disable=not self.verbose,
        ):
            self.eta = 2 * self.eps_step
            batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
            x_k = x_robust[batch_index_1:batch_index_2].astype(ART_NUMPY_DTYPE)
            x_init_batch = x_init[batch_index_1:batch_index_2].astype(ART_NUMPY_DTYPE)
            y_batch = y_robust[batch_index_1:batch_index_2]

            p_0 = 0
            p_1 = 0.22
            W = [p_0, p_1]

            while True:
                p_j_p_1 = W[-1] + max(W[-1] - W[-2] - 0.03, 0.06)
                if p_j_p_1 > 1:
                    break
                W.append(p_j_p_1)

            W = [math.ceil(p * self.max_iter) for p in W]

            eta = self.eps_step
            self.count_condition_1 = 0

            for k_iter in trange(self.max_iter, desc="AutoPGD - iteration", leave=False, disable=not self.verbose):

                # Get perturbation, use small scalar to avoid division by 0
                tol = 10e-8

                # Get gradient wrt loss; invert it if attack is targeted
                grad = self.estimator.loss_gradient(x_k, y_batch) * (1 - 2 * int(self.targeted))

                # Apply norm bound
                if self.norm in [np.inf, "inf"]:
                    grad = np.sign(grad)
                elif self.norm == 1:
                    ind = tuple(range(1, len(x_k.shape)))
                    grad = grad / (np.sum(np.abs(grad), axis=ind, keepdims=True) + tol)
                elif self.norm == 2:
                    ind = tuple(range(1, len(x_k.shape)))
                    grad = grad / (np.sqrt(np.sum(np.square(grad), axis=ind, keepdims=True)) + tol)
                assert x_k.shape == grad.shape

                perturbation = grad

                if mask is not None:
                    perturbation = perturbation * (mask.astype(ART_NUMPY_DTYPE))

                # Apply perturbation and clip
                z_k_p_1 = x_k + eta * perturbation

                if self.estimator.clip_values is not None:
                    clip_min, clip_max = self.estimator.clip_values
                    z_k_p_1 = np.clip(z_k_p_1, clip_min, clip_max)

                if k_iter == 0:
                    x_1 = z_k_p_1
                    perturbation = projection(x_1 - x_init_batch, self.eps, self.norm)
                    x_1 = x_init_batch + perturbation

                    f_0 = self.estimator.compute_loss(x=x_k, y=y_batch, reduction="mean")
                    f_1 = self.estimator.compute_loss(x=x_1, y=y_batch, reduction="mean")

                    self.eta_w_j_m_1 = eta
                    self.f_max_w_j_m_1 = f_0

                    if f_1 >= f_0:
                        self.f_max = f_1
                        self.x_max = x_1
                        self.x_max_m_1 = x_init_batch
                        self.count_condition_1 += 1
                    else:
                        self.f_max = f_0
                        self.x_max = x_k.copy()
                        self.x_max_m_1 = x_init_batch

                    # Settings for next iteration k
                    x_k_m_1 = x_k.copy()
                    x_k = x_1
                else:
                    perturbation = projection(z_k_p_1 - x_init_batch, self.eps, self.norm)
                    z_k_p_1 = x_init_batch + perturbation

                    alpha = 0.75

                    x_k_p_1 = x_k + alpha * (z_k_p_1 - x_k) + (1 - alpha) * (x_k - x_k_m_1)

                    if self.estimator.clip_values is not None:
                        clip_min, clip_max = self.estimator.clip_values
                        x_k_p_1 = np.clip(x_k_p_1, clip_min, clip_max)

                    perturbation = projection(x_k_p_1 - x_init_batch, self.eps, self.norm)
                    x_k_p_1 = x_init_batch + perturbation

                    f_k_p_1 = self.estimator.compute_loss(x=x_k_p_1, y=y_batch, reduction="mean")

                    if f_k_p_1 == 0.0:
                        x_k = x_k_p_1.copy()
                        break

                    if (not self.targeted and f_k_p_1 > self.f_max) or (self.targeted and f_k_p_1 < self.f_max):
                        self.count_condition_1 += 1
                        self.x_max = x_k_p_1
                        self.x_max_m_1 = x_k
                        self.f_max = f_k_p_1

                    if k_iter in W:
                        rho = 0.75

                        condition_1 = self.count_condition_1 < rho * (k_iter - W[W.index(k_iter) - 1])
                        condition_2 = self.eta_w_j_m_1 == eta and self.f_max_w_j_m_1 == self.f_max

                        if condition_1 or condition_2:
                            eta = eta / 2
                            x_k_m_1 = self.x_max_m_1
                            x_k = self.x_max
                        else:
                            x_k_m_1 = x_k
                            x_k = x_k_p_1.copy()

                        self.count_condition_1 = 0
                        self.eta_w_j_m_1 = eta
                        self.f_max_w_j_m_1 = self.f_max
                    else:
                        x_k_m_1 = x_k
                        x_k = x_k_p_1.copy()

            y_pred_adv_k = self.estimator.predict(x_k)
            if self.targeted:
                sample_is_not_robust_k = np.invert(np.argmax(y_pred_adv_k, axis=1) != np.argmax(y_batch, axis=1))
            else:
                sample_is_not_robust_k = np.invert(np.argmax(y_pred_adv_k, axis=1) == np.argmax(y_batch, axis=1))

            x_robust[batch_index_1:batch_index_2][sample_is_not_robust_k] = x_k[sample_is_not_robust_k]

        x_adv[sample_is_robust] = x_robust

    return x_adv
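# Standalone sketch of the checkpoint schedule W built above, following the APGD recipe of
# Croce & Hein: p_{j+1} = p_j + max(p_j - p_{j-1} - 0.03, 0.06), scaled by the iteration
# budget. At each checkpoint the attack decides whether to halve its step size. The
# function name is an illustrative placeholder.
import math

def apgd_checkpoints(max_iter):
    """Checkpoint iterations at which APGD-style step-size halving is considered."""
    p = [0.0, 0.22]
    while True:
        p_next = p[-1] + max(p[-1] - p[-2] - 0.03, 0.06)
        if p_next > 1:
            break
        p.append(p_next)
    return [math.ceil(q * max_iter) for q in p]

# Checkpoints grow denser toward the end of the budget as progress slows
print(apgd_checkpoints(100))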
def generate(self, x, y=None, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are
              the original class labels.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    # ZOO can probably be extended to feature vectors if no zooming or resizing is applied
    if len(x.shape) == 2:
        raise ValueError('Feature vectors detected. The ZOO attack can only be applied to data with spatial '
                         'dimensions.')

    # Check that `y` is provided for targeted attacks
    if self.targeted and y is None:
        raise ValueError('Target labels `y` need to be provided for a targeted attack.')

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self.classifier.predict(x, logits=False, batch_size=self.batch_size))

    # Compute adversarial examples with implicit batching
    nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
    x_adv = []
    for batch_id in range(nb_batches):
        logger.debug('Processing batch %i out of %i', batch_id, nb_batches)

        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        x_batch = x[batch_index_1:batch_index_2]
        y_batch = y[batch_index_1:batch_index_2]
        res = self._generate_batch(x_batch, y_batch)
        x_adv.append(res)
    x_adv = np.vstack(x_adv)

    # Apply clip
    if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
        clip_min, clip_max = self.classifier.clip_values
        np.clip(x_adv, clip_min, clip_max, out=x_adv)

    # Log success rate of the ZOO attack
    logger.info('Success rate of ZOO attack: %.2f%%',
                100 * compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size))

    return x_adv
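A hedged usage sketch for the ZOO `generate` above; `classifier` and `x_test` are assumed to exist. Note the guard above rejects 2-D feature vectors, so inputs need spatial dimensions (e.g. image batches):

from art.attacks.evasion import ZooAttack

# `classifier` is assumed to be a fitted ART classifier on image data (N, H, W, C).
attack = ZooAttack(classifier=classifier, max_iter=10, batch_size=1, targeted=False)
x_test_adv = attack.generate(x=x_test[:10])  # untargeted: model predictions become the labels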
def _test_backend_mnist(self, classifier):
    # Get MNIST
    (x_train, y_train), (x_test, y_test) = self.mnist

    # Test FGSM with np.inf norm
    attack = FastGradientMethod(classifier, eps=1)
    x_test_adv = attack.generate(x_test, **{'batch_size': 2})
    x_train_adv = attack.generate(x_train, **{'batch_size': 4})

    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info('Accuracy on adversarial train examples: %.2f%%', (acc * 100))

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial test examples: %.2f%%', (acc * 100))

    # Test minimal perturbations
    attack_params = {"minimal": True, "eps_step": .1, "eps_max": 1.}
    x_train_adv_min = attack.generate(x_train, **attack_params)
    x_test_adv_min = attack.generate(x_test, **attack_params)

    self.assertFalse((x_train_adv_min == x_train_adv).all())
    self.assertFalse((x_test_adv_min == x_test_adv).all())

    self.assertFalse((x_train == x_train_adv_min).all())
    self.assertFalse((x_test == x_test_adv_min).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv_min))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv_min))

    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info('Accuracy on adversarial train examples with minimal perturbation: %.2f%%', (acc * 100))

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial test examples with minimal perturbation: %.2f%%', (acc * 100))

    # L_1 norm
    attack = FastGradientMethod(classifier, eps=1, norm=1)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial test examples with L1 norm: %.2f%%', (acc * 100))

    # L_2 norm
    attack = FastGradientMethod(classifier, eps=1, norm=2)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_test == test_y_pred).all())

    acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on adversarial test examples with L2 norm: %.2f%%', (acc * 100))
def test_9_keras_mnist(self):
    x_test_original = self.x_test_mnist.copy()

    # Keras classifier
    classifier = get_image_classifier_kr()

    scores = classifier._model.evaluate(self.x_train_mnist, self.y_train_mnist)
    logger.info("[Keras, MNIST] Accuracy on training set: %.2f%%", (scores[1] * 100))
    scores = classifier._model.evaluate(self.x_test_mnist, self.y_test_mnist)
    logger.info("[Keras, MNIST] Accuracy on test set: %.2f%%", (scores[1] * 100))

    # targeted
    # Generate random target classes
    nb_classes = np.unique(np.argmax(self.y_test_mnist, axis=1)).shape[0]
    targets = np.random.randint(nb_classes, size=self.n_test)
    while (targets == np.argmax(self.y_test_mnist, axis=1)).any():
        targets = np.random.randint(nb_classes, size=self.n_test)

    # Perform attack
    df = SaliencyMapMethod(classifier, theta=1, batch_size=100, verbose=False)
    x_test_adv = df.generate(self.x_test_mnist, y=to_categorical(targets, nb_classes))

    self.assertFalse((self.x_test_mnist == x_test_adv).all())
    self.assertFalse((0.0 == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((self.y_test_mnist == y_pred).all())

    accuracy = np.sum(np.argmax(y_pred, axis=1) == np.argmax(self.y_test_mnist, axis=1)) / self.n_test
    logger.info("Accuracy on adversarial examples: %.2f%%", (accuracy * 100))

    # untargeted
    df = SaliencyMapMethod(classifier, theta=1, batch_size=100, verbose=False)
    x_test_adv = df.generate(self.x_test_mnist)

    self.assertFalse((self.x_test_mnist == x_test_adv).all())
    self.assertFalse((0.0 == x_test_adv).all())

    y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((self.y_test_mnist == y_pred).all())

    accuracy = np.sum(np.argmax(y_pred, axis=1) == np.argmax(self.y_test_mnist, axis=1)) / self.n_test
    logger.info("Accuracy on adversarial examples: %.2f%%", (accuracy * 100))

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - self.x_test_mnist))), 0.0, delta=0.00001)
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
              samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
              (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations.
                 Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any
                 features for which the mask is zero will not be adversarially perturbed.
    :type mask: `np.ndarray`
    :return: An array holding the adversarial examples.
    """
    mask = self._get_mask(x, **kwargs)

    # Ensure eps is broadcastable
    self._check_compatibility_input_and_eps(x=x)

    if isinstance(self.estimator, ClassifierMixin):
        if y is not None:
            y = check_and_transform_label_format(y, self.estimator.nb_classes)

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:  # pragma: no cover
                raise ValueError("Target labels `y` need to be provided for a targeted attack.")

            # Use model predictions as correct outputs
            logger.info("Using model predictions as correct labels for FGM.")
            y_array = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))  # type: ignore
        else:
            y_array = y

        if self.estimator.nb_classes > 2:
            y_array = y_array / np.sum(y_array, axis=1, keepdims=True)

        # Return adversarial examples computed with minimal perturbation if option is active
        adv_x_best = x
        if self.minimal:
            logger.info("Performing minimal perturbation FGM.")
            adv_x_best = self._minimal_perturbation(x, y_array, mask)
            rate_best = 100 * compute_success(
                self.estimator,  # type: ignore
                x,
                y_array,
                adv_x_best,
                self.targeted,
                batch_size=self.batch_size,  # type: ignore
            )
        else:
            rate_best = 0.0

            for _ in range(max(1, self.num_random_init)):
                adv_x = self._compute(
                    x,
                    x,
                    y_array,
                    mask,
                    self.eps,
                    self.eps,
                    self._project,
                    self.num_random_init > 0,
                )

                if self.num_random_init > 1:
                    rate = 100 * compute_success(
                        self.estimator,  # type: ignore
                        x,
                        y_array,
                        adv_x,
                        self.targeted,
                        batch_size=self.batch_size,  # type: ignore
                    )
                    if rate > rate_best:
                        rate_best = rate
                        adv_x_best = adv_x
                else:
                    adv_x_best = adv_x

        logger.info(
            "Success rate of FGM attack: %.2f%%",
            rate_best
            if rate_best is not None
            else 100
            * compute_success(
                self.estimator,  # type: ignore
                x,
                y_array,
                adv_x_best,
                self.targeted,
                batch_size=self.batch_size,
            ),
        )

    else:
        if self.minimal:  # pragma: no cover
            raise ValueError("Minimal perturbation is only supported for classification.")

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:  # pragma: no cover
                raise ValueError("Target labels `y` need to be provided for a targeted attack.")

            # Use model predictions as correct outputs
            logger.info("Using model predictions as correct labels for FGM.")
            y_array = self.estimator.predict(x, batch_size=self.batch_size)
        else:
            y_array = y

        adv_x_best = self._compute(
            x,
            x,
            y_array,
            None,
            self.eps,
            self.eps,
            self._project,
            self.num_random_init > 0,
        )

    if self.summary_writer is not None:
        self.summary_writer.reset()

    return adv_x_best
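The branch structure above distinguishes plain FGM from the minimal-perturbation mode. A short sketch, assuming a fitted ART classifier `classifier` and arrays `x_test`, `y_test`:

import numpy as np
from art.attacks.evasion import FastGradientMethod

# Plain one-shot FGM with an L_inf budget of 0.1
attack = FastGradientMethod(estimator=classifier, norm=np.inf, eps=0.1)
x_adv = attack.generate(x=x_test)

# Minimal-perturbation mode: grow eps in steps of eps_step until each sample flips
attack_min = FastGradientMethod(estimator=classifier, eps=1.0, eps_step=0.01, minimal=True)
x_adv_min = attack_min.generate(x=x_test, y=y_test)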
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the
              targets are the original class labels.
    :return: An array holding the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)
    x_adv = x.astype(ART_NUMPY_DTYPE)

    if self.estimator.clip_values is not None:
        clip_min, clip_max = self.estimator.clip_values
    else:
        clip_min, clip_max = np.amin(x), np.amax(x)

    # Assert that, if attack is targeted, y is provided:
    if self.targeted and y is None:
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    # Compute perturbation with implicit batching
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
    for batch_id in trange(nb_batches, desc="C&W L_2", disable=not self.verbose):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        x_batch = x_adv[batch_index_1:batch_index_2]
        y_batch = y[batch_index_1:batch_index_2]

        # The optimization is performed in tanh space to keep the adversarial images bounded in correct range
        x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother)

        # Initialize binary search:
        c_current = self.initial_const * np.ones(x_batch.shape[0])
        c_lower_bound = np.zeros(x_batch.shape[0])
        c_double = np.ones(x_batch.shape[0]) > 0

        # Initialize placeholders for best l2 distance and attack found so far
        best_l2dist = np.inf * np.ones(x_batch.shape[0])
        best_x_adv_batch = x_batch.copy()

        for bss in range(self.binary_search_steps):
            logger.debug(
                "Binary search step %i out of %i (c_mean==%f)",
                bss,
                self.binary_search_steps,
                np.mean(c_current),
            )
            nb_active = int(np.sum(c_current < self._c_upper_bound))
            logger.debug(
                "Number of samples with c_current < _c_upper_bound: %i out of %i",
                nb_active,
                x_batch.shape[0],
            )
            if nb_active == 0:
                break
            learning_rate = self.learning_rate * np.ones(x_batch.shape[0])

            # Initialize perturbation in tanh space:
            x_adv_batch = x_batch.copy()
            x_adv_batch_tanh = x_batch_tanh.copy()

            z_logits, l2dist, loss = self._loss(x_batch, x_adv_batch, y_batch, c_current)
            attack_success = loss - l2dist <= 0
            overall_attack_success = attack_success

            for i_iter in range(self.max_iter):
                logger.debug("Iteration step %i out of %i", i_iter, self.max_iter)
                logger.debug("Average Loss: %f", np.mean(loss))
                logger.debug("Average L2Dist: %f", np.mean(l2dist))
                logger.debug("Average Margin Loss: %f", np.mean(loss - l2dist))
                logger.debug(
                    "Current number of succeeded attacks: %i out of %i",
                    int(np.sum(attack_success)),
                    len(attack_success),
                )

                improved_adv = attack_success & (l2dist < best_l2dist)
                logger.debug("Number of improved L2 distances: %i", int(np.sum(improved_adv)))
                if np.sum(improved_adv) > 0:
                    best_l2dist[improved_adv] = l2dist[improved_adv]
                    best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv]

                active = (c_current < self._c_upper_bound) & (learning_rate > 0)
                nb_active = int(np.sum(active))
                logger.debug(
                    "Number of samples with c_current < _c_upper_bound and learning_rate > 0: %i out of %i",
                    nb_active,
                    x_batch.shape[0],
                )
                if nb_active == 0:
                    break

                # compute gradient:
                logger.debug("Compute loss gradient")
                perturbation_tanh = -self._loss_gradient(
                    z_logits[active],
                    y_batch[active],
                    x_batch[active],
                    x_adv_batch[active],
                    x_adv_batch_tanh[active],
                    c_current[active],
                    clip_min,
                    clip_max,
                )

                # perform line search to optimize perturbation
                # first, halve the learning rate until perturbation actually decreases the loss:
                prev_loss = loss.copy()
                best_loss = loss.copy()
                best_lr = np.zeros(x_batch.shape[0])
                halving = np.zeros(x_batch.shape[0])

                for i_halve in range(self.max_halving):
                    logger.debug("Perform halving iteration %i out of %i", i_halve, self.max_halving)
                    do_halving = loss[active] >= prev_loss[active]
                    logger.debug("Halving to be performed on %i samples", int(np.sum(do_halving)))
                    if np.sum(do_halving) == 0:
                        break
                    active_and_do_halving = active.copy()
                    active_and_do_halving[active] = do_halving

                    lr_mult = learning_rate[active_and_do_halving]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    x_adv1 = x_adv_batch_tanh[active_and_do_halving]
                    new_x_adv_batch_tanh = x_adv1 + lr_mult * perturbation_tanh[do_halving]
                    new_x_adv_batch = tanh_to_original(new_x_adv_batch_tanh, clip_min, clip_max)
                    _, l2dist[active_and_do_halving], loss[active_and_do_halving] = self._loss(
                        x_batch[active_and_do_halving],
                        new_x_adv_batch,
                        y_batch[active_and_do_halving],
                        c_current[active_and_do_halving],
                    )

                    logger.debug("New Average Loss: %f", np.mean(loss))
                    logger.debug("New Average L2Dist: %f", np.mean(l2dist))
                    logger.debug("New Average Margin Loss: %f", np.mean(loss - l2dist))

                    best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]
                    learning_rate[active_and_do_halving] /= 2
                    halving[active_and_do_halving] += 1
                learning_rate[active] *= 2

                # if no halving was actually required, double the learning rate as long as this
                # decreases the loss:
                for i_double in range(self.max_doubling):
                    logger.debug("Perform doubling iteration %i out of %i", i_double, self.max_doubling)
                    do_doubling = (halving[active] == 1) & (loss[active] <= best_loss[active])
                    logger.debug("Doubling to be performed on %i samples", int(np.sum(do_doubling)))
                    if np.sum(do_doubling) == 0:
                        break
                    active_and_do_doubling = active.copy()
                    active_and_do_doubling[active] = do_doubling
                    learning_rate[active_and_do_doubling] *= 2

                    lr_mult = learning_rate[active_and_do_doubling]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    x_adv2 = x_adv_batch_tanh[active_and_do_doubling]
                    new_x_adv_batch_tanh = x_adv2 + lr_mult * perturbation_tanh[do_doubling]
                    new_x_adv_batch = tanh_to_original(new_x_adv_batch_tanh, clip_min, clip_max)
                    _, l2dist[active_and_do_doubling], loss[active_and_do_doubling] = self._loss(
                        x_batch[active_and_do_doubling],
                        new_x_adv_batch,
                        y_batch[active_and_do_doubling],
                        c_current[active_and_do_doubling],
                    )

                    logger.debug("New Average Loss: %f", np.mean(loss))
                    logger.debug("New Average L2Dist: %f", np.mean(l2dist))
                    logger.debug("New Average Margin Loss: %f", np.mean(loss - l2dist))

                    best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]
                learning_rate[halving == 1] /= 2

                update_adv = best_lr[active] > 0
                logger.debug("Number of adversarial samples to be finally updated: %i", int(np.sum(update_adv)))

                if np.sum(update_adv) > 0:
                    active_and_update_adv = active.copy()
                    active_and_update_adv[active] = update_adv
                    best_lr_mult = best_lr[active_and_update_adv]
                    for _ in range(len(x.shape) - 1):
                        best_lr_mult = best_lr_mult[:, np.newaxis]

                    x_adv4 = x_adv_batch_tanh[active_and_update_adv]
                    best_lr1 = best_lr_mult * perturbation_tanh[update_adv]
                    x_adv_batch_tanh[active_and_update_adv] = x_adv4 + best_lr1

                    x_adv6 = x_adv_batch_tanh[active_and_update_adv]
                    x_adv_batch[active_and_update_adv] = tanh_to_original(x_adv6, clip_min, clip_max)
                    (
                        z_logits[active_and_update_adv],
                        l2dist[active_and_update_adv],
                        loss[active_and_update_adv],
                    ) = self._loss(
                        x_batch[active_and_update_adv],
                        x_adv_batch[active_and_update_adv],
                        y_batch[active_and_update_adv],
                        c_current[active_and_update_adv],
                    )
                    attack_success = loss - l2dist <= 0
                    overall_attack_success = overall_attack_success | attack_success

            # Update depending on attack success:
            improved_adv = attack_success & (l2dist < best_l2dist)
            logger.debug("Number of improved L2 distances: %i", int(np.sum(improved_adv)))
            if np.sum(improved_adv) > 0:
                best_l2dist[improved_adv] = l2dist[improved_adv]
                best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv]

            c_double[overall_attack_success] = False
            c_current[overall_attack_success] = (c_lower_bound + c_current)[overall_attack_success] / 2

            c_old = c_current
            c_current[~overall_attack_success & c_double] *= 2

            c_current1 = (c_current - c_lower_bound)[~overall_attack_success & ~c_double]
            c_current[~overall_attack_success & ~c_double] += c_current1 / 2
            c_lower_bound[~overall_attack_success] = c_old[~overall_attack_success]

        x_adv[batch_index_1:batch_index_2] = best_x_adv_batch

    logger.info(
        "Success rate of C&W L_2 attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size),
    )

    return x_adv
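To make the binary search over the constant `c` concrete, here is a usage sketch under the assumption that `classifier` is a fitted ART classifier and `x_test` an input batch:

from art.attacks.evasion import CarliniL2Method

attack = CarliniL2Method(
    classifier=classifier,
    targeted=False,
    binary_search_steps=10,  # outer search over the trade-off constant c
    max_iter=10,             # gradient steps per binary-search step
    initial_const=0.01,      # starting value of c, as in c_current above
)
x_test_adv = attack.generate(x=x_test)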
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
              (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
              samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
              (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :param cost_matrix: A non-negative cost matrix.
    :type cost_matrix: `np.ndarray`
    :return: An array holding the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)
    x_adv = x.copy().astype(ART_NUMPY_DTYPE)

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")

        # Use model predictions as correct outputs
        targets = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))
    else:
        targets = y

    # Compute the cost matrix if needed
    cost_matrix = kwargs.get("cost_matrix")
    if cost_matrix is None:
        cost_matrix = self._compute_cost_matrix(self.p, self.kernel_size)

    # Compute perturbation with implicit batching
    nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
    for batch_id in trange(nb_batches, desc="Wasserstein", disable=not self.verbose):
        logger.debug("Processing batch %i out of %i", batch_id, nb_batches)

        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        batch = x_adv[batch_index_1:batch_index_2]
        batch_labels = targets[batch_index_1:batch_index_2]

        x_adv[batch_index_1:batch_index_2] = self._generate_batch(batch, batch_labels, cost_matrix)

    logger.info(
        "Success rate of attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size),
    )

    return x_adv
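A usage sketch for the Wasserstein attack above, with `classifier` and `x_test` assumed; when no `cost_matrix` is passed through `kwargs`, it is derived internally from `self.p` and `self.kernel_size`:

from art.attacks.evasion import Wasserstein

attack = Wasserstein(estimator=classifier, targeted=False, verbose=False)
x_test_adv = attack.generate(x=x_test)  # cost_matrix computed from self.p and self.kernel_size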
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the
              targets are the original class labels.
    :return: An array holding the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)
    x_adv = x.astype(ART_NUMPY_DTYPE)

    if self.estimator.clip_values is not None:
        clip_min_per_pixel, clip_max_per_pixel = self.estimator.clip_values
    else:
        clip_min_per_pixel, clip_max_per_pixel = np.amin(x), np.amax(x)

    # Assert that, if attack is targeted, y is provided:
    if self.targeted and y is None:
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    # Compute perturbation with implicit batching
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
    for batch_id in trange(nb_batches, desc="C&W L_inf", disable=not self.verbose):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        x_batch = x_adv[batch_index_1:batch_index_2]
        y_batch = y[batch_index_1:batch_index_2]

        # Determine values for later clipping
        clip_min = np.clip(x_batch - self.eps, clip_min_per_pixel, clip_max_per_pixel)
        clip_max = np.clip(x_batch + self.eps, clip_min_per_pixel, clip_max_per_pixel)

        # The optimization is performed in tanh space to keep the
        # adversarial images bounded from clip_min and clip_max.
        x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother)

        # Initialize perturbation in tanh space:
        x_adv_batch = x_batch.copy()
        x_adv_batch_tanh = x_batch_tanh.copy()

        # Initialize optimization:
        z_logits, loss = self._loss(x_adv_batch, y_batch)
        attack_success = loss <= 0
        learning_rate = self.learning_rate * np.ones(x_batch.shape[0])

        for i_iter in range(self.max_iter):
            logger.debug("Iteration step %i out of %i", i_iter, self.max_iter)
            logger.debug("Average Loss: %f", np.mean(loss))
            logger.debug(
                "Successful attack samples: %i out of %i",
                int(np.sum(attack_success)),
                x_batch.shape[0],
            )

            # only continue optimization for those samples where attack hasn't succeeded yet:
            active = ~attack_success
            if np.sum(active) == 0:
                break

            # compute gradient:
            logger.debug("Compute loss gradient")
            perturbation_tanh = -self._loss_gradient(
                z_logits[active],
                y_batch[active],
                x_adv_batch[active],
                x_adv_batch_tanh[active],
                clip_min[active],
                clip_max[active],
            )

            # perform line search to optimize perturbation
            # first, halve the learning rate until perturbation actually decreases the loss:
            prev_loss = loss.copy()
            best_loss = loss.copy()
            best_lr = np.zeros(x_batch.shape[0])
            halving = np.zeros(x_batch.shape[0])

            for i_halve in range(self.max_halving):
                logger.debug("Perform halving iteration %i out of %i", i_halve, self.max_halving)
                do_halving = loss[active] >= prev_loss[active]
                logger.debug("Halving to be performed on %i samples", int(np.sum(do_halving)))
                if np.sum(do_halving) == 0:
                    break
                active_and_do_halving = active.copy()
                active_and_do_halving[active] = do_halving

                lr_mult = learning_rate[active_and_do_halving]
                for _ in range(len(x.shape) - 1):
                    lr_mult = lr_mult[:, np.newaxis]

                adv_10 = x_adv_batch_tanh[active_and_do_halving]
                new_x_adv_batch_tanh = adv_10 + lr_mult * perturbation_tanh[do_halving]

                new_x_adv_batch = tanh_to_original(
                    new_x_adv_batch_tanh,
                    clip_min[active_and_do_halving],
                    clip_max[active_and_do_halving],
                )
                _, loss[active_and_do_halving] = self._loss(new_x_adv_batch, y_batch[active_and_do_halving])

                logger.debug("New Average Loss: %f", np.mean(loss))
                logger.debug("Loss: %s", str(loss))
                logger.debug("Prev_loss: %s", str(prev_loss))
                logger.debug("Best_loss: %s", str(best_loss))

                best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                best_loss[loss < best_loss] = loss[loss < best_loss]
                learning_rate[active_and_do_halving] /= 2
                halving[active_and_do_halving] += 1
            learning_rate[active] *= 2

            # if no halving was actually required, double the learning rate as long as this
            # decreases the loss:
            for i_double in range(self.max_doubling):
                logger.debug("Perform doubling iteration %i out of %i", i_double, self.max_doubling)
                do_doubling = (halving[active] == 1) & (loss[active] <= best_loss[active])
                logger.debug("Doubling to be performed on %i samples", int(np.sum(do_doubling)))
                if np.sum(do_doubling) == 0:
                    break
                active_and_do_doubling = active.copy()
                active_and_do_doubling[active] = do_doubling
                learning_rate[active_and_do_doubling] *= 2

                lr_mult = learning_rate[active_and_do_doubling]
                for _ in range(len(x.shape) - 1):
                    lr_mult = lr_mult[:, np.newaxis]

                x_adv15 = x_adv_batch_tanh[active_and_do_doubling]
                new_x_adv_batch_tanh = x_adv15 + lr_mult * perturbation_tanh[do_doubling]

                new_x_adv_batch = tanh_to_original(
                    new_x_adv_batch_tanh,
                    clip_min[active_and_do_doubling],
                    clip_max[active_and_do_doubling],
                )
                _, loss[active_and_do_doubling] = self._loss(new_x_adv_batch, y_batch[active_and_do_doubling])

                logger.debug("New Average Loss: %f", np.mean(loss))

                best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                best_loss[loss < best_loss] = loss[loss < best_loss]
            learning_rate[halving == 1] /= 2

            update_adv = best_lr[active] > 0
            logger.debug("Number of adversarial samples to be finally updated: %i", int(np.sum(update_adv)))

            if np.sum(update_adv) > 0:
                active_and_update_adv = active.copy()
                active_and_update_adv[active] = update_adv
                best_lr_mult = best_lr[active_and_update_adv]
                for _ in range(len(x.shape) - 1):
                    best_lr_mult = best_lr_mult[:, np.newaxis]

                best_13 = best_lr_mult * perturbation_tanh[update_adv]
                x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[active_and_update_adv] + best_13

                x_adv_batch[active_and_update_adv] = tanh_to_original(
                    x_adv_batch_tanh[active_and_update_adv],
                    clip_min[active_and_update_adv],
                    clip_max[active_and_update_adv],
                )
                (
                    z_logits[active_and_update_adv],
                    loss[active_and_update_adv],
                ) = self._loss(
                    x_adv_batch[active_and_update_adv],
                    y_batch[active_and_update_adv],
                )
                attack_success = loss <= 0

        # Update depending on attack success:
        x_adv_batch[~attack_success] = x_batch[~attack_success]
        x_adv[batch_index_1:batch_index_2] = x_adv_batch

    logger.info(
        "Success rate of C&W L_inf attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size),
    )

    return x_adv
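A corresponding usage sketch for the L_inf variant, again assuming `classifier` and `x_test`; `eps` bounds the per-pixel clipping window computed at the top of each batch:

from art.attacks.evasion import CarliniLInfMethod

attack = CarliniLInfMethod(classifier=classifier, eps=0.3, max_iter=10)
x_test_adv = attack.generate(x=x_test)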
def generate(self, x, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are
              the original class labels.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    x_adv = x.astype(NUMPY_DTYPE)

    # Parse and save attack-specific parameters
    params_cpy = dict(kwargs)
    y = params_cpy.pop('y', None)
    self.set_params(**params_cpy)

    # Assert that, if attack is targeted, y is provided:
    if self.targeted and y is None:
        raise ValueError('Target labels `y` need to be provided for a targeted attack.')

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self._predict(x, logits=False))

    # Compute perturbation with implicit batching
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
    for batch_id in range(nb_batches):
        logger.debug('Processing batch %i out of %i', batch_id, nb_batches)

        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        x_batch = x_adv[batch_index_1:batch_index_2]
        y_batch = y[batch_index_1:batch_index_2]

        (clip_min_per_pixel, clip_max_per_pixel) = self.classifier.clip_values
        clip_min = np.clip(x_batch - self.eps, clip_min_per_pixel, clip_max_per_pixel)
        clip_max = np.clip(x_batch + self.eps, clip_min_per_pixel, clip_max_per_pixel)

        # The optimization is performed in tanh space to keep the
        # adversarial images bounded from clip_min and clip_max.
        x_batch_tanh = self._original_to_tanh(x_batch, clip_min, clip_max)

        # Initialize perturbation in tanh space:
        x_adv_batch = x_batch.copy()
        x_adv_batch_tanh = x_batch_tanh.copy()

        # Initialize optimization:
        z, loss = self._loss(x_adv_batch, y_batch)
        attack_success = (loss <= 0)
        lr = self.learning_rate * np.ones(x_batch.shape[0])

        for it in range(self.max_iter):
            logger.debug('Iteration step %i out of %i', it, self.max_iter)
            logger.debug('Average Loss: %f', np.mean(loss))
            logger.debug('Successful attack samples: %i out of %i',
                         int(np.sum(attack_success)), x_batch.shape[0])

            # only continue optimization for those samples where attack hasn't succeeded yet:
            active = ~attack_success
            if np.sum(active) == 0:
                break

            # compute gradient:
            logger.debug('Compute loss gradient')
            perturbation_tanh = -self._gradient_of_loss(z[active], y_batch[active], x_adv_batch[active],
                                                        x_adv_batch_tanh[active], clip_min[active],
                                                        clip_max[active])

            # perform line search to optimize perturbation
            # first, halve the learning rate until perturbation actually decreases the loss:
            prev_loss = loss.copy()
            best_loss = loss.copy()
            best_lr = np.zeros(x_batch.shape[0])
            halving = np.zeros(x_batch.shape[0])

            for h in range(self.max_halving):
                logger.debug('Perform halving iteration %i out of %i', h, self.max_halving)
                do_halving = (loss[active] >= prev_loss[active])
                logger.debug('Halving to be performed on %i samples', int(np.sum(do_halving)))
                if np.sum(do_halving) == 0:
                    break
                active_and_do_halving = active.copy()
                active_and_do_halving[active] = do_halving

                lr_mult = lr[active_and_do_halving]
                for _ in range(len(x.shape) - 1):
                    lr_mult = lr_mult[:, np.newaxis]

                new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_halving] + \
                    lr_mult * perturbation_tanh[do_halving]
                new_x_adv_batch = self._tanh_to_original(new_x_adv_batch_tanh,
                                                         clip_min[active_and_do_halving],
                                                         clip_max[active_and_do_halving])
                _, loss[active_and_do_halving] = self._loss(new_x_adv_batch, y_batch[active_and_do_halving])

                logger.debug('New Average Loss: %f', np.mean(loss))
                logger.debug('Loss: %s', str(loss))
                logger.debug('Prev_loss: %s', str(prev_loss))
                logger.debug('Best_loss: %s', str(best_loss))

                best_lr[loss < best_loss] = lr[loss < best_loss]
                best_loss[loss < best_loss] = loss[loss < best_loss]
                lr[active_and_do_halving] /= 2
                halving[active_and_do_halving] += 1
            lr[active] *= 2

            # if no halving was actually required, double the learning rate as long as this
            # decreases the loss:
            for d in range(self.max_doubling):
                logger.debug('Perform doubling iteration %i out of %i', d, self.max_doubling)
                do_doubling = (halving[active] == 1) & (loss[active] <= best_loss[active])
                logger.debug('Doubling to be performed on %i samples', int(np.sum(do_doubling)))
                if np.sum(do_doubling) == 0:
                    break
                active_and_do_doubling = active.copy()
                active_and_do_doubling[active] = do_doubling
                lr[active_and_do_doubling] *= 2

                lr_mult = lr[active_and_do_doubling]
                for _ in range(len(x.shape) - 1):
                    lr_mult = lr_mult[:, np.newaxis]

                new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_doubling] + \
                    lr_mult * perturbation_tanh[do_doubling]
                new_x_adv_batch = self._tanh_to_original(new_x_adv_batch_tanh,
                                                         clip_min[active_and_do_doubling],
                                                         clip_max[active_and_do_doubling])
                _, loss[active_and_do_doubling] = self._loss(new_x_adv_batch, y_batch[active_and_do_doubling])

                logger.debug('New Average Loss: %f', np.mean(loss))

                best_lr[loss < best_loss] = lr[loss < best_loss]
                best_loss[loss < best_loss] = loss[loss < best_loss]
            lr[halving == 1] /= 2

            update_adv = (best_lr[active] > 0)
            logger.debug('Number of adversarial samples to be finally updated: %i', int(np.sum(update_adv)))

            if np.sum(update_adv) > 0:
                active_and_update_adv = active.copy()
                active_and_update_adv[active] = update_adv
                best_lr_mult = best_lr[active_and_update_adv]
                for _ in range(len(x.shape) - 1):
                    best_lr_mult = best_lr_mult[:, np.newaxis]

                x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[active_and_update_adv] + \
                    best_lr_mult * perturbation_tanh[update_adv]
                x_adv_batch[active_and_update_adv] = self._tanh_to_original(
                    x_adv_batch_tanh[active_and_update_adv],
                    clip_min[active_and_update_adv],
                    clip_max[active_and_update_adv])
                z[active_and_update_adv], loss[active_and_update_adv] = self._loss(
                    x_adv_batch[active_and_update_adv], y_batch[active_and_update_adv])
                attack_success = (loss <= 0)

        # Update depending on attack success:
        x_adv_batch[~attack_success] = x_batch[~attack_success]
        x_adv[batch_index_1:batch_index_2] = x_adv_batch

    adv_preds = np.argmax(self._predict(x_adv), axis=1)
    if self.targeted:
        rate = np.sum(adv_preds == np.argmax(y, axis=1)) / x_adv.shape[0]
    else:
        preds = np.argmax(self._predict(x), axis=1)
        rate = np.sum(adv_preds != preds) / x_adv.shape[0]
    logger.info('Success rate of C&W attack: %.2f%%', 100 * rate)

    return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
              (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
              samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
              (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations.
                 Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any
                 features for which the mask is zero will not be adversarially perturbed.
    :type mask: `np.ndarray`
    :return: An array holding the adversarial examples.
    """
    x_adv = x.astype(ART_NUMPY_DTYPE)
    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    # Determine correctly predicted samples
    y_pred = self.estimator_orig.predict(x.astype(ART_NUMPY_DTYPE))
    sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)

    # Untargeted attacks
    for attack in self.attacks:
        # Stop if all samples are misclassified
        if np.sum(sample_is_robust) == 0:
            break

        if attack.targeted:
            attack.set_params(targeted=False)

        x_adv, sample_is_robust = self._run_attack(
            x=x_adv,
            y=y,
            sample_is_robust=sample_is_robust,
            attack=attack,
            **kwargs,
        )

    # Targeted attacks
    if self.targeted:
        # Labels for targeted attacks
        y_t = np.array([range(y.shape[1])] * y.shape[0])
        y_idx = np.argmax(y, axis=1)
        y_idx = np.expand_dims(y_idx, 1)
        y_t = y_t[y_t != y_idx]
        targeted_labels = np.reshape(y_t, (y.shape[0], -1))

        for attack in self.attacks:
            if attack.targeted is not None:
                if not attack.targeted:
                    attack.set_params(targeted=True)

                for i in range(self.estimator.nb_classes - 1):
                    # Stop if all samples are misclassified
                    if np.sum(sample_is_robust) == 0:
                        break

                    target = check_and_transform_label_format(targeted_labels[:, i], self.estimator.nb_classes)

                    x_adv, sample_is_robust = self._run_attack(
                        x=x_adv,
                        y=target,
                        sample_is_robust=sample_is_robust,
                        attack=attack,
                        **kwargs,
                    )

    return x_adv
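The ensemble above first runs each attack untargeted, then optionally sweeps all wrong classes as targets. A usage sketch, assuming a fitted ART classifier `classifier` and inputs `x_test`:

import numpy as np
from art.attacks.evasion import AutoAttack

# With attacks=None (the default), a standard list of component attacks is used.
attack = AutoAttack(estimator=classifier, norm=np.inf, eps=0.3, eps_step=0.1, targeted=False)
x_test_adv = attack.generate(x=x_test)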
def generate(self, x, y=None):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :type x: `np.ndarray`
    :param y: The labels for the data `x`. Only provide this parameter if you'd like to use true labels when
              crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the
              "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236).
              Default is `None`. Labels should be one-hot-encoded.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    from art.utils import compute_success, get_labels_np_array

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError('Target labels `y` need to be provided for a targeted attack.')

        # Use model predictions as correct outputs
        targets = get_labels_np_array(self.classifier.predict(x))
    else:
        targets = y

    adv_x_best = None
    rate_best = 0.0

    for i_random_init in range(max(1, self.num_random_init)):
        adv_x = x

        for i_max_iter in range(self.max_iter):
            # Arguments: x, x_init, y, eps, eps_step, project, random_init
            adv_x = self._compute(adv_x, x, targets, self.eps, self.eps_step, self._project,
                                  self.num_random_init > 0 and i_max_iter == 0)

            rate = 100 * compute_success(self.classifier, x, targets, adv_x, self.targeted)

            # Track the perturbation accumulated so far and its per-sample norm
            noise = adv_x - x
            ind = tuple(range(1, len(noise.shape)))
            if self.norm == np.inf:
                noise_norm = np.max(np.abs(noise), axis=ind, keepdims=True)
            elif self.norm == 1:
                noise_norm = np.sum(np.abs(noise), axis=ind, keepdims=True)
            elif self.norm == 2:
                noise_norm = np.sqrt(np.sum(np.square(noise), axis=ind, keepdims=True))
            else:
                noise_norm = 0

            TrackedPGD.tracked_x.append((adv_x, rate, i_max_iter, noise_norm))

            if rate >= 100:
                break

        rate = 100 * compute_success(self.classifier, x, targets, adv_x, self.targeted)
        if rate > rate_best or adv_x_best is None:
            rate_best = rate
            adv_x_best = adv_x

        if rate >= 100:
            break

    logger.info('Success rate of attack: %.2f%%', rate_best)

    return adv_x_best
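Since the snippet above appends per-iteration state to `TrackedPGD.tracked_x`, a hypothetical consumer (nothing here is part of ART; the names follow the tuples appended above) could replay the trajectory:

import numpy as np

# Each entry is (adv_x, success_rate, iteration, noise_norm), as appended above.
for adv_x, rate, i_max_iter, noise_norm in TrackedPGD.tracked_x:
    print('iter %3d: success %.2f%%, max per-sample noise norm %.4f'
          % (i_max_iter, rate, float(np.max(noise_norm))))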
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array. This requires a lot of memory, therefore it accepts
    only a single sample as input, i.e. a batch of size 1.

    :param x: An array of a single original input sample.
    :param y: An array of a single target label.
    :return: An array with the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")

        logger.info("Using model predictions as correct labels for FGM.")
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))
    else:
        self.targeted = True

    if x.shape[0] > 1 or y.shape[0] > 1:
        raise ValueError("This attack only accepts a single sample as input.")

    if x.ndim != 4:
        raise ValueError("Unrecognized input dimension. Shadow Attack can only be applied to image data.")

    x = x.astype(ART_NUMPY_DTYPE)
    x_batch = np.repeat(x, repeats=self.batch_size, axis=0).astype(ART_NUMPY_DTYPE)
    x_batch = x_batch + np.random.normal(scale=self.sigma, size=x_batch.shape).astype(ART_NUMPY_DTYPE)
    y_batch = np.repeat(y, repeats=self.batch_size, axis=0)

    perturbation = (
        np.random.uniform(
            low=self.estimator.clip_values[0], high=self.estimator.clip_values[1], size=x.shape
        ).astype(ART_NUMPY_DTYPE)
        - (self.estimator.clip_values[1] - self.estimator.clip_values[0]) / 2
    )

    for _ in trange(self.nb_steps, desc="Shadow attack", disable=not self.verbose):
        gradients_ce = np.mean(
            self.estimator.loss_gradient(x=x_batch + perturbation, y=y_batch, sampling=False)
            * (1 - 2 * int(self.targeted)),
            axis=0,
            keepdims=True,
        )
        gradients = gradients_ce - self._get_regularisation_loss_gradients(perturbation)
        perturbation += self.learning_rate * gradients

    x_p = x + perturbation
    x_adv = np.clip(x_p, a_min=self.estimator.clip_values[0], a_max=self.estimator.clip_values[1]).astype(
        ART_NUMPY_DTYPE
    )

    return x_adv
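Finally, a usage sketch for the Shadow Attack above, assuming `classifier`, `x_test`, and `y_test` exist; the method accepts exactly one sample per call:

from art.attacks.evasion import ShadowAttack

attack = ShadowAttack(estimator=classifier, sigma=0.5, nb_steps=300, batch_size=400)
x_adv = attack.generate(x=x_test[0:1], y=y_test[0:1])  # single sample, single label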